ISCA Final Presentation - Compilations
-
Upload
hsa-foundation -
Category
Technology
-
view
907 -
download
2
Transcript of ISCA Final Presentation - Compilations
HSA COMPILATION — WEN-MEI HWU, CTO, MULTICOREWARE INC
WITH RAY I-JUI SUNG
KEY HSA FEATURES FOR COMPILATION
ALL-PROCESSORS-EQUAL
GPU and CPU have equal
flexibility to create and
dispatch work items
EQUAL ACCESS TO ENTIRE SYSTEM MEMORY
GPU and CPU have
uniform visibility into entire
memory space
Unified Coherent
Memory
GPUCPU
Single Dispatch Path
GPUCPU
© Copyright 2014 HSA Foundation. All Rights Reserved
A QUICK REVIEW OF OPENCL — CURRENT STATE OF PORTABLE HETEROGENEOUS
PARALLEL PROGRAMMING
DEVICE CODE IN OPENCL
SIMPLE MATRIX MULTIPLICATION
// OpenCL device kernel: C = A * B for row-major matrices.
// A is (hA x wA), B is (wA x wB), C is (hA x wB).
// Each work-item computes one element of C: tx selects the column,
// ty selects the row (explicit thread-index usage).
__kernel void
matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) {
int tx = get_global_id(0);   // column of C (and of B)
int ty = get_global_id(1);   // row of C (and of A)
float value = 0;
// Dot product of row ty of A with column tx of B.
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
// BUG FIX: C's row width is wB, not wA. The original wrote
// C[ty * wA + tx], which is only correct when wA == wB (square matrices).
C[ty * wB + tx] = value;
}
Explicit thread index usage.
Reasonably readable.
Portable across CPUs, GPUs, and FPGAs
© Copyright 2014 HSA Foundation. All Rights Reserved
HOST CODE IN OPENCL -
CONCEPTUAL
1. allocate and initialize memory on host side
2. Initialize OpenCL
3. allocate device memory and move the data
4. Load and build device code
5. Launch kernel
a. append arguments
6. move the data back from device
© Copyright 2014 HSA Foundation. All Rights Reserved
int main(int argc, char** argv){
// set seed for rand()
srand(2006);
/****************************************************/
/* Allocate and initialize memory on Host Side */
/****************************************************/
// allocate and initialize host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory pointers for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 3. Allocate device memory and move data
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);
// 4. Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("kernel.cl",
"// My comment\n",
&kernelLength);
shrCheckError(clMatrixMul != NULL, shrTRUE);
clProgram = clCreateProgramWithSource(clGPUContext,
1, (const char **)&clMatrixMul,
&kernelLength, &errcode);
shrCheckError(errcode, CL_SUCCESS);
errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 5. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
shrCheckError(errcode, CL_SUCCESS);
localWorkSize[0] = 16;
localWorkSize[1] = 16;
globalWorkSize[0] = 1024;
globalWorkSize[1] = 1024;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 6. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 7. clean up memory
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
free(clMatrixMul);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);}
almost 100 lines of code
– tedious and hard to maintain
It does not take advantage of HSA features.
It will likely need to be changed for OpenCL 2.0.
COMPARING SEVERAL HIGH-LEVEL
PROGRAMMING INTERFACES
C++AMP Thrust Bolt OpenACC SYCL
C++ Language
extension
proposed by
Microsoft
library
proposed
by CUDA
library
proposed
by AMD
Annotation
and
Pragmas
proposed
by PGI
C++
wrapper
for
OpenCL
All these proposals aim to reduce tedious boilerplate
code and provide transparent porting to future
systems (future proofing).
© Copyright 2014 HSA Foundation. All Rights Reserved
OPENACC — HSA ENABLES SIMPLER IMPLEMENTATION OR
BETTER OPTIMIZATION
© Copyright 2014 HSA Foundation. All Rights Reserved
OPENACC- SIMPLE MATRIX MULTIPLICATION EXAMPLE
/*
 * Matrix multiplication C = A * B with OpenACC offload annotations.
 * A is (hA x wA), B is (wA x wB), C is (hA x wB), all row-major.
 * The pragmas annotate data movement (copyin/copyout) and mark the
 * nested loops as a parallel region; little host-side code is needed.
 */
void MatrixMulti(float *C, const float *A, const float *B, int hA, int wA, int wB)
{
#pragma acc parallel loop copyin(A[0:hA*wA]) copyin(B[0:wA*wB]) copyout(C[0:hA*wB])
    for (int i = 0; i < hA; i++) {
#pragma acc loop
        for (int j = 0; j < wB; j++) {
            /* dot product of row i of A with column j of B */
            float sum = 0;
            for (int k = 0; k < wA; k++) {
                float a = A[i*wA+k];
                float b = B[k*wB+j];
                sum += a*b;
            }
            /* BUG FIX: the original wrote C[i*Nw+j]; `Nw` is undefined
             * anywhere in this function. C's row width is wB. */
            C[i*wB+j] = sum;
        }
    }
}
Little Host Code Overhead
Programmer annotation of
kernel computation
Programmer annotation of data movement
© Copyright 2014 HSA Foundation. All Rights Reserved
ADVANTAGE OF HSA FOR OPENACC
Flexibility in copyin and copyout implementation
Flexible code generation for nested acc parallel loops
E.g., inner loop bounds that depend on outer loop iterations
Compiler data affinity optimization (especially OpenACC kernel regions)
The compiler does not have to undo programmer managed data transfers
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP HSA ENABLES EFFICIENT COMPILATION OF AN
EVEN HIGHER LEVEL OF PROGRAMMING
INTERFACE
© Copyright 2014 HSA Foundation. All Rights Reserved
C++ AMP
● C++ Accelerated Massive Parallelism
● Designed for data level parallelism
● Extension of C++11 proposed by Microsoft
● An open specification with multiple implementations aiming at standardization
● MS Visual Studio 2013
● MulticoreWare CLAMP
● GPU data modeled as C++14-like containers for multidimensional arrays
● GPU kernels modeled as C++11 lambda
● Minimal extension to C++ for simplicity and future proofing
© Copyright 2014 HSA Foundation. All Rights Reserved
MATRIX MULTIPLICATION IN C++AMP
// C++ AMP matrix multiplication: product(ha x hc) = a(ha x hb) * b(hb x hc).
// array_view wraps existing host memory in 2-D containers; the AMP runtime
// performs whatever host<->device data movement the kernel use implies, so
// no explicit copyin/copyout appears in the source.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix,
    int ha, int hb, int hc) {
    array_view<int, 2> a(ha, hb, aMatrix);
    array_view<int, 2> b(hb, hc, bMatrix);
    array_view<int, 2> product(ha, hc, productMatrix);
    // One kernel instance per element of product; the lambda's captured
    // variables become the kernel's arguments.
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            // BUG FIX: the reduction must run over the shared dimension hb;
            // the original hard-coded "inner < 2", which is only correct
            // when hb == 2 even though the dimensions are parameters here.
            for (int inner = 0; inner < hb; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    // copy the device result back into productMatrix
    product.synchronize();}
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue =
clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// OpenCL device kernel shown next to the host setup code above:
// each work-item computes one element of C = A * B, with tx as the
// column index and ty as the row index.
// NOTE(review): C is indexed with wA below; since C's row width is wB,
// this looks correct only when wA == wB (square matrices) — verify.
__kernel void
matrixMul(__global float* C, __global float* A,
__global float* B, int wA, int wB) {
int tx = get_global_id(0);
int ty = get_global_id(1);
float value = 0;
// dot product of row ty of A with column tx of B
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
C[ty * wA + tx] = value;}
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// C++ AMP matrix multiplication example: product(3x3) = a(3x2) * b(2x3).
// GPU data are modeled as array_view data containers over host memory.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
// a: 3 rows x 2 cols viewing aMatrix
array_view<int, 2> a(3, 2, aMatrix);
// b: 2 rows x 3 cols viewing bMatrix
array_view<int, 2> b(2, 3, bMatrix);
// product: 3 rows x 3 cols viewing productMatrix
array_view<int, 2> product(3, 3, productMatrix);
// one kernel instance per element of product
parallel_for_each(
product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
// shared dimension is 2 for this fixed 3x2 * 2x3 example
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
// synchronize() copies the device result back to productMatrix
product.synchronize();}
GPU data
modeled as
data container
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// Same C++ AMP example, highlighting the kernel-as-lambda model:
// the [=] capture makes a, b, and product implicit kernel arguments,
// so the programmer writes no explicit copyin/copyout or argument setup.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
array_view<int, 2> a(3, 2, aMatrix);
array_view<int, 2> b(2, 3, bMatrix);
array_view<int, 2> product(3, 3, productMatrix);
parallel_for_each(
product.extent,
// restrict(amp) marks the lambda as device-executable code
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
// shared dimension is 2 for this fixed 3x2 * 2x3 example
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
product.synchronize();}
Kernels modeled as
lambdas; arguments are
implicitly modeled as
captured variables,
programmers do not need to
specify copyin and copyout
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// Same C++ AMP example, highlighting the execution interface:
// parallel_for_each marks an implicitly parallel region for GPU
// execution, launching one instance per index in product.extent.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
array_view<int, 2> a(3, 2, aMatrix);
array_view<int, 2> b(2, 3, bMatrix);
array_view<int, 2> product(3, 3, productMatrix);
parallel_for_each(
product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
// shared dimension is 2 for this fixed 3x2 * 2x3 example
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
// blocks until the device result is copied back to productMatrix
product.synchronize();
}
Execution
interface; marking
an implicitly
parallel region for
GPU execution
© Copyright 2014 HSA Foundation. All Rights Reserved
MCW C++AMP (CLAMP)
● Runs on Linux and Mac OS X
● Output code compatible with all major OpenCL stacks: AMD, Apple/Intel (OS X),
NVIDIA and even POCL
● Clang/LLVM-based, open source
o Translate C++AMP code to OpenCL C or OpenCL 1.2 SPIR
o With template helper library
● Runtime: OpenCL 1.1/HSA Runtime and GMAC for non-HSA systems
● One of the two C++ AMP implementations recognized by HSA foundation
© Copyright 2014 HSA Foundation. All Rights Reserved
MCW C++ AMP COMPILER
● Device Path
o generate OpenCL C code and SPIR
o emit kernel function
● Host Path
o preparation to launch the code
C++ AMP
source code
Clang/LLVM 3.3
Device
Code
Host
Code
© Copyright 2014 HSA Foundation. All Rights Reserved
TRANSLATION
// C++ AMP source: the parallel_for_each call below is the input that
// the compiler translates into the OpenCL kernel that follows it.
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
// Generated OpenCL kernel: captured variables became kernel arguments,
// and index<2> idx became the get_global_id(0)/get_global_id(1) pair.
// NOTE(review): C is indexed with wA; since C's row width is wB this is
// only correct when wA == wB (square matrices) — verify.
__kernel void
matrixMul(__global float* C, __global float*
A,
__global float* B, int wA, int wB){
int tx = get_global_id(0);
int ty = get_global_id(1);
float value = 0;
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
C[ty * wA + tx] = value;}
● Append the arguments
● Set the index
● emit kernel function
● implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved
EXECUTION ON NON-HSA OPENCL
PLATFORMS
C++ AMP
source code
Clang/LLVM
3.3
Device Code
C++ AMP
source code
Clang/LLVM
3.3
Host Code
gmac
OpenCL
Our work
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
GMAC
● unified virtual address space in
software
● Can have high overhead
sometimes
● In HSA (e.g., AMD Kaveri), GMAC
is no longer needed
Gelado, et al, ASPLOS 2010
© Copyright 2014 HSA Foundation. All Rights Reserved
CASE STUDY: BINOMIAL OPTION PRICING
Lines of Code
0
50
100
150
200
250
300
350
C++AMP OpenCL
Lines of Code by Cloc
Host
Kernel
© Copyright 2014 HSA Foundation. All Rights Reserved
PERFORMANCE ON NON-HSA SYSTEMS — BINOMIAL OPTION PRICING
0
0.02
0.04
0.06
0.08
0.1
0.12
Total GPU Time Kernel-only
Time in Seconds
Performance on an NV Tesla C2050
OpenCL
C++AMP
© Copyright 2014 HSA Foundation. All Rights Reserved
EXECUTION ON HSA
C++ AMP
source code
Clang/LLVM
3.3
Device SPIR
C++ AMP
source code
Clang/LLVM
3.3
Host SPIR
HSA Runtime
Compile Time
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
WHAT WE NEED TO DO?
● Kernel function
o emit the kernel function with required arguments
● On Host side
o a function that recursively traverses the object and append the arguments to OpenCL
stack.
● On Device side
o reconstruct it on the device code for future use.
© Copyright 2014 HSA Foundation. All Rights Reserved
WHY COMPILING C++AMP TO OPENCL IS
NOT TRIVIAL
● C++AMP → LLVM IR → OpenCL C or SPIR
● arguments passing (lambda capture vs function calls)
● explicit V.S. implicit memory transfer
● Heavy lifting is done by compiler and runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
EXAMPLE
// Lambda-capture marshalling example: the lambda captures `c` by value.
// `c` has type struct C, which contains a B, which inherits from A — so
// the compiler must recursively traverse this nested object to append
// its fields as OpenCL kernel arguments, then reconstruct it on the
// device side. This is why compiling C++AMP to OpenCL is not trivial.
struct A { int a; };struct B : A { int b; };struct C { B b; int c; };
struct C c;
c.c = 100;
auto fn = [=] () { int qq = c.c; };
© Copyright 2014 HSA Foundation. All Rights Reserved
TRANSLATION
// C++ AMP source (repeated translation slide): input to the compiler.
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
// Generated OpenCL kernel: the compiler turned the captured variables
// into OpenCL arguments and populated index<2> from get_global_id;
// the runtime provides the implicit memory management.
// NOTE(review): C is indexed with wA; since C's row width is wB this is
// only correct when wA == wB (square matrices) — verify.
__kernel void
matrixMul(__global float* C, __global float* A,
__global float* B, int wA, int wB){
int tx = get_global_id(0);
int ty = get_global_id(1);
float value = 0;
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
C[ty * wA + tx] = value;}
● Compiler
● Turn captured variables into
OpenCL arguments
● Populate the index<N> in OCL
kernel
● Runtime
● Implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved
QUESTIONS?
© Copyright 2014 HSA Foundation. All Rights Reserved