ISCA Final Presentation - Compilations

HSA COMPILATION - WEN-MEI HWU, CTO, MULTICOREWARE INC, WITH RAY I-JUI SUNG

Transcript of ISCA Final Presentation - Compilations

Page 1: ISCA Final Presentation - Compilations

HSA COMPILATION
WEN-MEI HWU, CTO, MULTICOREWARE INC

WITH RAY I-JUI SUNG

Page 2: ISCA Final Presentation - Compilations

KEY HSA FEATURES FOR COMPILATION

ALL-PROCESSORS-EQUAL: GPU and CPU have equal flexibility to create and dispatch work items.

EQUAL ACCESS TO ENTIRE SYSTEM MEMORY: GPU and CPU have uniform visibility into the entire memory space.

[Diagram: GPU and CPU share Unified Coherent Memory and a Single Dispatch Path.]


Page 3: ISCA Final Presentation - Compilations

A QUICK REVIEW OF OPENCL: THE CURRENT STATE OF PORTABLE HETEROGENEOUS PARALLEL PROGRAMMING

Page 4: ISCA Final Presentation - Compilations

DEVICE CODE IN OPENCL

SIMPLE MATRIX MULTIPLICATION

__kernel void
matrixMul(__global float* C, __global float* A, __global float* B,
          int wA, int wB) {
    // one work-item per output element: tx is the column, ty the row
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k) {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wB + tx] = value;   // C has wB columns
}

Explicit thread index usage. Reasonably readable. Portable across CPUs, GPUs, and FPGAs.
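For orientation, here is a serial reference version of the same computation (a sketch, not part of the slides): the NDRange launch replaces the two outer loops, with get_global_id(0) taking the role of the column index and get_global_id(1) the row index.

// Serial reference (illustrative only).
void matrixMulSerial(float* C, const float* A, const float* B,
                     int hA, int wA, int wB) {
    for (int ty = 0; ty < hA; ++ty) {          // becomes get_global_id(1)
        for (int tx = 0; tx < wB; ++tx) {      // becomes get_global_id(0)
            float value = 0;
            for (int k = 0; k < wA; ++k)
                value += A[ty * wA + k] * B[k * wB + tx];
            C[ty * wB + tx] = value;
        }
    }
}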


Page 5: ISCA Final Presentation - Compilations

HOST CODE IN OPENCL - CONCEPTUAL

1. Allocate and initialize memory on the host side
2. Initialize OpenCL
3. Allocate device memory and move the data
4. Load and build device code
5. Launch kernel
   a. Append arguments
6. Move the data back from the device


Page 6: ISCA Final Presentation - Compilations

int main(int argc, char** argv)
{
    // set seed for rand()
    srand(2006);

    // 1. Allocate and initialize memory on the host side
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);

    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);

    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate host memory for the result C
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // 2. Initialize OpenCL
    cl_context clGPUContext;
    cl_command_queue clCommandQue;
    cl_program clProgram;
    cl_kernel clKernel;
    size_t dataBytes;
    size_t kernelLength;
    cl_int errcode;

    // OpenCL device memory pointers for matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
                                           NULL, NULL, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // get the list of GPU devices associated with the context
    errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES,
                               0, NULL, &dataBytes);
    cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
    errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES,
                                dataBytes, clDevices, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // create a command queue
    clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0],
                                        0, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // 3. Allocate device memory and move the data
    d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE,
                         mem_size_C, NULL, &errcode);
    d_A = clCreateBuffer(clGPUContext,
                         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                         mem_size_A, h_A, &errcode);
    d_B = clCreateBuffer(clGPUContext,
                         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                         mem_size_B, h_B, &errcode);

    // 4. Load and build the OpenCL kernel
    char *clMatrixMul = oclLoadProgSource("kernel.cl",
                                          "// My comment\n",
                                          &kernelLength);
    shrCheckError(clMatrixMul != NULL, shrTRUE);
    clProgram = clCreateProgramWithSource(clGPUContext, 1,
                                          (const char **)&clMatrixMul,
                                          &kernelLength, &errcode);
    shrCheckError(errcode, CL_SUCCESS);
    errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);
    clKernel = clCreateKernel(clProgram, "matrixMul", &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // 5. Launch the OpenCL kernel
    size_t localWorkSize[2], globalWorkSize[2];
    int wA = WA;
    int wC = WC;
    errcode  = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
    errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
    errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
    errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
    shrCheckError(errcode, CL_SUCCESS);

    localWorkSize[0] = 16;
    localWorkSize[1] = 16;
    globalWorkSize[0] = 1024;
    globalWorkSize[1] = 1024;

    errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL,
                                     globalWorkSize, localWorkSize,
                                     0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 6. Retrieve the result from the device
    errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0,
                                  mem_size_C, h_C, 0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 7. Clean up
    free(h_A);
    free(h_B);
    free(h_C);
    clReleaseMemObject(d_A);
    clReleaseMemObject(d_C);
    clReleaseMemObject(d_B);
    free(clDevices);
    free(clMatrixMul);
    clReleaseContext(clGPUContext);
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommandQue);
}

Almost 100 lines of code, tedious and hard to maintain. It does not take advantage of HSA features, and it will likely need to be changed for OpenCL 2.0.
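As a sketch of the direction OpenCL 2.0 takes (and of what HSA hardware enables), shared virtual memory removes steps 3 and 6 entirely. The fragment below is illustrative only; it assumes a platform with fine-grained SVM so the host can touch the buffers directly, and it reuses the variable names from the listing above:

// Illustrative OpenCL 2.0 fragment: no clCreateBuffer, no clEnqueueReadBuffer.
float* A = (float*) clSVMAlloc(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, 0);
float* B = (float*) clSVMAlloc(clGPUContext, CL_MEM_READ_WRITE, mem_size_B, 0);
float* C = (float*) clSVMAlloc(clGPUContext, CL_MEM_READ_WRITE, mem_size_C, 0);
randomInit(A, size_A);                        // host writes the shared buffers directly
randomInit(B, size_B);
clSetKernelArgSVMPointer(clKernel, 0, C);     // the kernel sees the same pointers
clSetKernelArgSVMPointer(clKernel, 1, A);
clSetKernelArgSVMPointer(clKernel, 2, B);
// (with coarse-grained SVM, clEnqueueSVMMap/clEnqueueSVMUnmap would bracket host access)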

Page 7: ISCA Final Presentation - Compilations

COMPARING SEVERAL HIGH-LEVEL PROGRAMMING INTERFACES

C++AMP - C++ language extension proposed by Microsoft
Thrust - library for CUDA, proposed by NVIDIA
Bolt - library proposed by AMD
OpenACC - annotations and pragmas proposed by PGI
SYCL - C++ wrapper for OpenCL

All these proposals aim to reduce tedious boilerplate code and provide transparent porting to future systems (future proofing).


Page 8: ISCA Final Presentation - Compilations

OPENACC: HSA ENABLES SIMPLER IMPLEMENTATION OR BETTER OPTIMIZATION


Page 9: ISCA Final Presentation - Compilations

OPENACC - SIMPLE MATRIX MULTIPLICATION EXAMPLE

void MatrixMulti(float *C, const float *A, const float *B,
                 int hA, int wA, int wB)
{
#pragma acc parallel loop copyin(A[0:hA*wA]) copyin(B[0:wA*wB]) copyout(C[0:hA*wB])
    for (int i = 0; i < hA; i++) {
#pragma acc loop
        for (int j = 0; j < wB; j++) {
            float sum = 0;
            for (int k = 0; k < wA; k++) {
                float a = A[i*wA+k];
                float b = B[k*wB+j];
                sum += a*b;
            }
            C[i*wB+j] = sum;   // C has wB columns
        }
    }
}

Little host code overhead.
Programmer annotation of kernel computation.
Programmer annotation of data movement.


Page 10: ISCA Final Presentation - Compilations

ADVANTAGE OF HSA FOR OPENACC

● Flexibility in copyin and copyout implementation (see the sketch below)
● Flexible code generation for nested acc parallel loops
  o E.g., inner loop bounds that depend on outer loop iterations
● Compiler data affinity optimization (especially OpenACC kernel regions)
● The compiler does not have to undo programmer-managed data transfers
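For instance (a sketch, assuming an HSA target with unified coherent memory), the data clauses on the earlier example can simply be dropped, leaving the compiler free to keep the arrays in place:

// Illustrative: on a unified-memory HSA target the copyin/copyout
// clauses become unnecessary (or compile to no-ops).
void MatrixMulti(float *C, const float *A, const float *B,
                 int hA, int wA, int wB)
{
#pragma acc parallel loop
    for (int i = 0; i < hA; i++) {
#pragma acc loop
        for (int j = 0; j < wB; j++) {
            float sum = 0;
            for (int k = 0; k < wA; k++)
                sum += A[i*wA+k] * B[k*wB+j];
            C[i*wB+j] = sum;
        }
    }
}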


Page 11: ISCA Final Presentation - Compilations

C++AMP: HSA ENABLES EFFICIENT COMPILATION OF AN EVEN HIGHER-LEVEL PROGRAMMING INTERFACE


Page 12: ISCA Final Presentation - Compilations

C++ AMP

● C++ Accelerated Massive Parallelism
● Designed for data-level parallelism
● Extension of C++11 proposed by Microsoft
● An open specification with multiple implementations aiming at standardization:
  o MS Visual Studio 2013
  o MulticoreWare CLAMP
● GPU data modeled as C++14-like containers for multidimensional arrays
● GPU kernels modeled as C++11 lambdas
● Minimal extension to C++ for simplicity and future proofing


Page 13: ISCA Final Presentation - Compilations

MATRIX MULTIPLICATION IN C++AMP

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix,
                     int ha, int hb, int hc) {
    array_view<int, 2> a(ha, hb, aMatrix);
    array_view<int, 2> b(hb, hc, bMatrix);
    array_view<int, 2> product(ha, hc, productMatrix);

    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < hb; inner++) {   // hb is the shared dimension
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}
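A minimal call site might look like this (hypothetical test data, not from the slides); note that productMatrix must be zero-initialized because the kernel accumulates with +=:

// Multiply a 3x2 matrix by a 2x3 matrix into a 3x3 result.
int aMatrix[] = { 1, 2, 3, 4, 5, 6 };          // 3 rows x 2 cols
int bMatrix[] = { 7, 8, 9, 10, 11, 12 };       // 2 rows x 3 cols
int productMatrix[9] = { 0 };                  // 3 rows x 3 cols, zeroed
MultiplyWithAMP(aMatrix, bMatrix, productMatrix, 3, 2, 3);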

For comparison, the equivalent OpenCL host setup and device kernel:

clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
                                       NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);

// get the list of GPU devices associated with the context
errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES,
                           0, NULL, &dataBytes);
cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES,
                            dataBytes, clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);

// create a command queue
clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0],
                                    0, &errcode);
shrCheckError(errcode, CL_SUCCESS);

__kernel void
matrixMul(__global float* C, __global float* A, __global float* B,
          int wA, int wB) {
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k) {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wB + tx] = value;
}


Page 14: ISCA Final Presentation - Compilations

C++AMP PROGRAMMING MODEL

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);

    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

GPU data modeled as data containers.


Page 15: ISCA Final Presentation - Compilations

C++AMP PROGRAMMING MODEL

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);

    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

Kernels modeled as lambdas; arguments are implicitly modeled as captured variables, so the programmer does not need to specify copyin and copyout.


Page 16: ISCA Final Presentation - Compilations

C++AMP PROGRAMMING MODEL

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);

    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

Execution interface: marks an implicitly parallel region for GPU execution.


Page 17: ISCA Final Presentation - Compilations

MCW C++AMP (CLAMP)

● Runs on Linux and Mac OS X
● Output code compatible with all major OpenCL stacks: AMD, Apple/Intel (OS X), NVIDIA, and even POCL
● Clang/LLVM-based, open source
  o Translates C++AMP code to OpenCL C or OpenCL 1.2 SPIR
  o With a template helper library
● Runtime: OpenCL 1.1/HSA Runtime, plus GMAC for non-HSA systems
● One of the two C++ AMP implementations recognized by the HSA Foundation


Page 18: ISCA Final Presentation - Compilations

MCW C++ AMP COMPILER

● Device Path
  o Generates OpenCL C code and SPIR
  o Emits the kernel function
● Host Path
  o Prepares to launch the code

[Diagram: C++ AMP source code → Clang/LLVM 3.3 → Device Code and Host Code]


Page 19: ISCA Final Presentation - Compilations

TRANSLATION

parallel_for_each(product.extent,
    [=](index<2> idx) restrict(amp) {
        int row = idx[0];
        int col = idx[1];
        for (int inner = 0; inner < 2; inner++) {
            product[idx] += a(row, inner) * b(inner, col);
        }
    });

The compiler translates this into an OpenCL kernel of the following shape:

__kernel void
matrixMul(__global float* C, __global float* A, __global float* B,
          int wA, int wB) {
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k) {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wB + tx] = value;
}

● Append the arguments
● Set the index
● Emit the kernel function
● Implicit memory management
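Concretely, the device path emits a kernel of roughly the following shape (a hand-written sketch of the idea, not CLAMP's actual output; the names and the exact index mapping are invented for illustration):

// Each captured array_view is flattened to a raw pointer plus its
// extents, and index<2> is rebuilt from the global work-item IDs.
__kernel void pfe_trampoline(
    __global int* a_data, int a_ext0, int a_ext1,      // captured a
    __global int* b_data, int b_ext0, int b_ext1,      // captured b
    __global int* p_data, int p_ext0, int p_ext1)      // captured product
{
    int row = get_global_id(0);                        // idx[0]
    int col = get_global_id(1);                        // idx[1]
    for (int inner = 0; inner < a_ext1; inner++) {
        p_data[row * p_ext1 + col] +=
            a_data[row * a_ext1 + inner] * b_data[inner * b_ext1 + col];
    }
}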


Page 20: ISCA Final Presentation - Compilations

EXECUTION ON NON-HSA OPENCL PLATFORMS

[Diagram: C++ AMP source code → Clang/LLVM 3.3 → Device Code; C++ AMP source code → Clang/LLVM 3.3 → Host Code. At runtime, the host code runs on gmac over OpenCL; "Our work" labels the MulticoreWare components.]


Page 21: ISCA Final Presentation - Compilations

GMAC

● Unified virtual address space in software
● Can sometimes have high overhead
● In HSA (e.g., AMD Kaveri), GMAC is no longer needed

Gelado et al., ASPLOS 2010
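Conceptually (a toy illustration of the idea, not GMAC's real implementation), the runtime pairs every host allocation with a device mirror and copies behind the scenes to keep the two coherent; that bookkeeping and copying is the overhead that HSA's hardware-coherent memory removes.

// Toy sketch: state a software unified-address-space runtime must track
// per allocation (assumes the OpenCL headers for cl_mem).
struct MirroredBuffer {
    void*  host_ptr;     // host-side address handed to the program
    cl_mem device_buf;   // device-side mirror of the same data
    size_t size;         // bytes covered by the mapping
    bool   host_dirty;   // host wrote since the last host-to-device copy
    bool   device_dirty; // kernel wrote since the last device-to-host copy
};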


Page 22: ISCA Final Presentation - Compilations

CASE STUDY: BINOMIAL OPTION PRICING

[Bar chart: Lines of Code counted by cloc (0 to 350), split into host and kernel code, comparing C++AMP and OpenCL.]


Page 23: ISCA Final Presentation - Compilations

PERFORMANCE ON NON-HSA SYSTEMS: BINOMIAL OPTION PRICING

[Bar chart: Performance on an NVIDIA Tesla C2050; time in seconds (0 to 0.12) for Total GPU Time and Kernel-only, comparing OpenCL and C++AMP.]


Page 24: ISCA Final Presentation - Compilations

EXECUTION ON HSA

[Diagram: Compile time: C++ AMP source code → Clang/LLVM 3.3 → Device SPIR and Host SPIR. Runtime: both execute through the HSA Runtime.]


Page 25: ISCA Final Presentation - Compilations

WHAT DO WE NEED TO DO?

● Kernel function
  o Emit the kernel function with the required arguments
● On the host side
  o A function that recursively traverses the captured object and appends the arguments to the OpenCL stack (see the sketch after this list)
● On the device side
  o Reconstruct the object in the device code for future use
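A sketch of that host-side traversal (illustrative only; the Member metadata is hypothetical, since in reality the compiler knows the capture layout statically and generates this code per lambda type):

// Hypothetical per-member metadata describing a captured object.
struct Member {
    bool isAggregate, isBuffer;
    const struct Member* members; int count;  // if aggregate: nested members
    const void* arg;                          // if buffer: &cl_mem, else &scalar
    size_t size;                              // scalar size in bytes
};

// Walk the captured object depth-first, appending each leaf as an argument.
int appendArgs(cl_kernel k, int argIdx, const struct Member* m, int n) {
    for (int i = 0; i < n; ++i) {
        if (m[i].isAggregate)
            argIdx = appendArgs(k, argIdx, m[i].members, m[i].count);
        else if (m[i].isBuffer)
            clSetKernelArg(k, argIdx++, sizeof(cl_mem), m[i].arg);
        else
            clSetKernelArg(k, argIdx++, m[i].size, m[i].arg);
    }
    return argIdx;
}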


Page 26: ISCA Final Presentation - Compilations

WHY COMPILING C++AMP TO OPENCL IS NOT TRIVIAL

● C++AMP → LLVM IR → OpenCL C or SPIR
● Argument passing (lambda capture vs. function calls)
● Explicit vs. implicit memory transfer
● Heavy lifting is done by the compiler and runtime


Page 27: ISCA Final Presentation - Compilations

EXAMPLE

struct A { int a; };
struct B : A { int b; };
struct C { B b; int c; };

struct C c;
c.c = 100;
auto fn = [=] () { int qq = c.c; };
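For this capture, the compiler must flatten the nested object into kernel arguments and rebuild it on the device; roughly (an illustrative sketch, not the actual generated code):

// Host side: the capture of c flattens depth-first into three ints:
// c.b.a (inherited), c.b.b, and c.c, each appended with clSetKernelArg.
// Device side: OpenCL C has no inheritance, so the base class becomes
// the first member, and a generated kernel rebuilds the object.
struct A_dev { int a; };
struct B_dev { struct A_dev base; int b; };
struct C_dev { struct B_dev b; int c; };

__kernel void fn_trampoline(int c_b_a, int c_b_b, int c_c) {
    struct C_dev c = { { { c_b_a }, c_b_b }, c_c };
    int qq = c.c;    // the lambda body
    (void)qq;
}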


Page 28: ISCA Final Presentation - Compilations

TRANSLATION

parallel_for_each(product.extent,
    [=](index<2> idx) restrict(amp) {
        int row = idx[0];
        int col = idx[1];
        for (int inner = 0; inner < 2; inner++) {
            product[idx] += a(row, inner) * b(inner, col);
        }
    });

The compiler translates this into an OpenCL kernel of the following shape:

__kernel void
matrixMul(__global float* C, __global float* A, __global float* B,
          int wA, int wB) {
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k) {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wB + tx] = value;
}

● Compiler
  o Turns captured variables into OpenCL arguments
  o Populates the index<N> in the OpenCL kernel
● Runtime
  o Implicit memory management


Page 29: ISCA Final Presentation - Compilations

QUESTIONS?

© Copyright 2014 HSA Foundation. All Rights Reserved