ISCA Final Presentation - Compilations
-
Upload
hsa-foundation -
Category
Technology
-
view
907 -
download
2
Transcript of ISCA Final Presentation - Compilations
HSA COMPILATION — WEN-MEI HWU, CTO, MULTICOREWARE INC
WITH RAY I-JUI SUNG
KEY HSA FEATURES FOR COMPILATION
ALL-PROCESSORS-EQUAL
GPU and CPU have equal
flexibility to create and
dispatch work items
EQUAL ACCESS TO ENTIRE SYSTEM MEMORY
GPU and CPU have
uniform visibility into entire
memory space
Unified Coherent
Memory
GPUCPU
Single Dispatch Path
GPUCPU
© Copyright 2014 HSA Foundation. All Rights Reserved
A QUICK REVIEW OF OPENCL — CURRENT STATE OF PORTABLE HETEROGENEOUS
PARALLEL PROGRAMMING
DEVICE CODE IN OPENCL
SIMPLE MATRIX MULTIPLICATION
// OpenCL device kernel: C = A * B for row-major matrices.
// A is (hA x wA), B is (wA x wB), C is (hA x wB).
// Each work-item computes one element of C: tx selects the column,
// ty selects the row (explicit thread-index usage).
__kernel void
matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) {
int tx = get_global_id(0);   // column of C (and of B)
int ty = get_global_id(1);   // row of C (and of A)
float value = 0;
// Dot product of row ty of A with column tx of B.
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
// BUG FIX: C's row width is wB, not wA. The original wrote
// C[ty * wA + tx], which is only correct when wA == wB (square matrices).
C[ty * wB + tx] = value;
}
Explicit thread index usage.
Reasonably readable.
Portable across CPUs, GPUs, and FPGAs
© Copyright 2014 HSA Foundation. All Rights Reserved
HOST CODE IN OPENCL -
CONCEPTUAL
1. allocate and initialize memory on host side
2. Initialize OpenCL
3. allocate device memory and move the data
4. Load and build device code
5. Launch kernel
a. append arguments
6. move the data back from device
© Copyright 2014 HSA Foundation. All Rights Reserved
int main(int argc, char** argv){
// set seed for rand()
srand(2006);
/****************************************************/
/* Allocate and initialize memory on Host Side */
/****************************************************/
// allocate and initialize host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory pointers for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 3. Allocate device memory and move data
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);
// 4. Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("kernel.cl",
"// My comment\n",
&kernelLength);
shrCheckError(clMatrixMul != NULL, shrTRUE);
clProgram = clCreateProgramWithSource(clGPUContext,
1, (const char **)&clMatrixMul,
&kernelLength, &errcode);
shrCheckError(errcode, CL_SUCCESS);
errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 5. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
shrCheckError(errcode, CL_SUCCESS);
localWorkSize[0] = 16;
localWorkSize[1] = 16;
globalWorkSize[0] = 1024;
globalWorkSize[1] = 1024;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 6. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 7. clean up memory
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
free(clMatrixMul);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);}
almost 100 lines of code
– tedious and hard to maintain
It does not take advantage of HSA features.
It will likely need to be changed for OpenCL 2.0.
COMPARING SEVERAL HIGH-LEVEL
PROGRAMMING INTERFACES
C++AMP Thrust Bolt OpenACC SYCL
C++ Language
extension
proposed by
Microsoft
library
proposed
by CUDA
library
proposed
by AMD
Annotation
and
Pragmas
proposed
by PGI
C++
wrapper
for
OpenCL
All these proposals aim to reduce tedious boilerplate
code and provide transparent porting to future
systems (future proofing).
© Copyright 2014 HSA Foundation. All Rights Reserved
OPENACC — HSA ENABLES SIMPLER IMPLEMENTATION OR
BETTER OPTIMIZATION
© Copyright 2014 HSA Foundation. All Rights Reserved
OPENACC- SIMPLE MATRIX MULTIPLICATION EXAMPLE
/*
 * Matrix multiplication C = A * B with OpenACC offload annotations.
 * A is (hA x wA), B is (wA x wB), C is (hA x wB), all row-major.
 * The pragmas annotate data movement (copyin/copyout) and mark the
 * nested loops as a parallel region; little host-side code is needed.
 */
void MatrixMulti(float *C, const float *A, const float *B, int hA, int wA, int wB)
{
#pragma acc parallel loop copyin(A[0:hA*wA]) copyin(B[0:wA*wB]) copyout(C[0:hA*wB])
    for (int i = 0; i < hA; i++) {
#pragma acc loop
        for (int j = 0; j < wB; j++) {
            /* dot product of row i of A with column j of B */
            float sum = 0;
            for (int k = 0; k < wA; k++) {
                float a = A[i*wA+k];
                float b = B[k*wB+j];
                sum += a*b;
            }
            /* BUG FIX: the original wrote C[i*Nw+j]; `Nw` is undefined
             * anywhere in this function. C's row width is wB. */
            C[i*wB+j] = sum;
        }
    }
}
Little Host Code Overhead
Programmer annotation of
kernel computation
Programmer annotation of data movement
© Copyright 2014 HSA Foundation. All Rights Reserved
ADVANTAGE OF HSA FOR OPENACC
Flexibility in copyin and copyout implementation
Flexible code generation for nested acc parallel loops
E.g., inner loop bounds that depend on outer loop iterations
Compiler data affinity optimization (especially OpenACC kernel regions)
The compiler does not have to undo programmer managed data transfers
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP HSA ENABLES EFFICIENT COMPILATION OF AN
EVEN HIGHER LEVEL OF PROGRAMMING
INTERFACE
© Copyright 2014 HSA Foundation. All Rights Reserved
C++ AMP
● C++ Accelerated Massive Parallelism
● Designed for data level parallelism
● Extension of C++11 proposed by Microsoft
● An open specification with multiple implementations aiming at standardization
● MS Visual Studio 2013
● MulticoreWare CLAMP
● GPU data modeled as C++14-like containers for multidimensional arrays
● GPU kernels modeled as C++11 lambda
● Minimal extension to C++ for simplicity and future proofing
© Copyright 2014 HSA Foundation. All Rights Reserved
MATRIX MULTIPLICATION IN C++AMP
// C++ AMP matrix multiplication: product(ha x hc) = a(ha x hb) * b(hb x hc).
// array_view wraps existing host memory in 2-D containers; the AMP runtime
// performs whatever host<->device data movement the kernel use implies, so
// no explicit copyin/copyout appears in the source.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix,
    int ha, int hb, int hc) {
    array_view<int, 2> a(ha, hb, aMatrix);
    array_view<int, 2> b(hb, hc, bMatrix);
    array_view<int, 2> product(ha, hc, productMatrix);
    // One kernel instance per element of product; the lambda's captured
    // variables become the kernel's arguments.
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            // BUG FIX: the reduction must run over the shared dimension hb;
            // the original hard-coded "inner < 2", which is only correct
            // when hb == 2 even though the dimensions are parameters here.
            for (int inner = 0; inner < hb; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    // copy the device result back into productMatrix
    product.synchronize();}
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue =
clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// OpenCL device kernel shown next to the host setup code above:
// each work-item computes one element of C = A * B, with tx as the
// column index and ty as the row index.
// NOTE(review): C is indexed with wA below; since C's row width is wB,
// this looks correct only when wA == wB (square matrices) — verify.
__kernel void
matrixMul(__global float* C, __global float* A,
__global float* B, int wA, int wB) {
int tx = get_global_id(0);
int ty = get_global_id(1);
float value = 0;
// dot product of row ty of A with column tx of B
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
C[ty * wA + tx] = value;}
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// C++ AMP matrix multiplication example: product(3x3) = a(3x2) * b(2x3).
// GPU data are modeled as array_view data containers over host memory.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
// a: 3 rows x 2 cols viewing aMatrix
array_view<int, 2> a(3, 2, aMatrix);
// b: 2 rows x 3 cols viewing bMatrix
array_view<int, 2> b(2, 3, bMatrix);
// product: 3 rows x 3 cols viewing productMatrix
array_view<int, 2> product(3, 3, productMatrix);
// one kernel instance per element of product
parallel_for_each(
product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
// shared dimension is 2 for this fixed 3x2 * 2x3 example
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
// synchronize() copies the device result back to productMatrix
product.synchronize();}
GPU data
modeled as
data container
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// Same C++ AMP example, highlighting the kernel-as-lambda model:
// the [=] capture makes a, b, and product implicit kernel arguments,
// so the programmer writes no explicit copyin/copyout or argument setup.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
array_view<int, 2> a(3, 2, aMatrix);
array_view<int, 2> b(2, 3, bMatrix);
array_view<int, 2> product(3, 3, productMatrix);
parallel_for_each(
product.extent,
// restrict(amp) marks the lambda as device-executable code
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
// shared dimension is 2 for this fixed 3x2 * 2x3 example
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
product.synchronize();}
Kernels modeled as
lambdas; arguments are
implicitly modeled as
captured variables,
programmers do not need to
specify copyin and copyout
© Copyright 2014 HSA Foundation. All Rights Reserved
C++AMP PROGRAMMING MODEL
// Same C++ AMP example, highlighting the execution interface:
// parallel_for_each marks an implicitly parallel region for GPU
// execution, launching one instance per index in product.extent.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
array_view<int, 2> a(3, 2, aMatrix);
array_view<int, 2> b(2, 3, bMatrix);
array_view<int, 2> product(3, 3, productMatrix);
parallel_for_each(
product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
// shared dimension is 2 for this fixed 3x2 * 2x3 example
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
// blocks until the device result is copied back to productMatrix
product.synchronize();
}
Execution
interface; marking
an implicitly
parallel region for
GPU execution
© Copyright 2014 HSA Foundation. All Rights Reserved
MCW C++AMP (CLAMP)
● Runs on Linux and Mac OS X
● Output code compatible with all major OpenCL stacks: AMD, Apple/Intel (OS X),
NVIDIA and even POCL
● Clang/LLVM-based, open source
o Translate C++AMP code to OpenCL C or OpenCL 1.2 SPIR
o With template helper library
● Runtime: OpenCL 1.1/HSA Runtime and GMAC for non-HSA systems
● One of the two C++ AMP implementations recognized by HSA foundation
© Copyright 2014 HSA Foundation. All Rights Reserved
MCW C++ AMP COMPILER
● Device Path
o generate OpenCL C code and SPIR
o emit kernel function
● Host Path
o preparation to launch the code
C++ AMP
source code
Clang/LLVM 3.3
Device
Code
Host
Code
© Copyright 2014 HSA Foundation. All Rights Reserved
TRANSLATION
// C++ AMP source: the parallel_for_each call below is the input that
// the compiler translates into the OpenCL kernel that follows it.
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
// Generated OpenCL kernel: captured variables became kernel arguments,
// and index<2> idx became the get_global_id(0)/get_global_id(1) pair.
// NOTE(review): C is indexed with wA; since C's row width is wB this is
// only correct when wA == wB (square matrices) — verify.
__kernel void
matrixMul(__global float* C, __global float*
A,
__global float* B, int wA, int wB){
int tx = get_global_id(0);
int ty = get_global_id(1);
float value = 0;
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
C[ty * wA + tx] = value;}
● Append the arguments
● Set the index
● emit kernel function
● implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved
EXECUTION ON NON-HSA OPENCL
PLATFORMS
C++ AMP
source code
Clang/LLVM
3.3
Device Code
C++ AMP
source code
Clang/LLVM
3.3
Host Code
gmac
OpenCL
Our work
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
GMAC
● unified virtual address space in
software
● Can have high overhead
sometimes
● In HSA (e.g., AMD Kaveri), GMAC
is no longer needed
Gelado, et al, ASPLOS 2010
© Copyright 2014 HSA Foundation. All Rights Reserved
CASE STUDY: BINOMIAL OPTION PRICING
Lines of Code
0
50
100
150
200
250
300
350
C++AMP OpenCL
Lines of Code by Cloc
Host
Kernel
© Copyright 2014 HSA Foundation. All Rights Reserved
PERFORMANCE ON NON-HSA SYSTEMS — BINOMIAL OPTION PRICING
0
0.02
0.04
0.06
0.08
0.1
0.12
Total GPU Time Kernel-only
Time in Seconds
Performance on an NV Tesla C2050
OpenCL
C++AMP
© Copyright 2014 HSA Foundation. All Rights Reserved
EXECUTION ON HSA
C++ AMP
source code
Clang/LLVM
3.3
Device SPIR
C++ AMP
source code
Clang/LLVM
3.3
Host SPIR
HSA Runtime
Compile Time
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
WHAT WE NEED TO DO?
● Kernel function
o emit the kernel function with required arguments
● On Host side
o a function that recursively traverses the object and append the arguments to OpenCL
stack.
● On Device side
o reconstruct it on the device code for future use.
© Copyright 2014 HSA Foundation. All Rights Reserved
WHY COMPILING C++AMP TO OPENCL IS
NOT TRIVIAL
● C++AMP → LLVM IR → OpenCL C or SPIR
● arguments passing (lambda capture vs function calls)
● explicit V.S. implicit memory transfer
● Heavy lifting is done by compiler and runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
EXAMPLE
// Lambda-capture marshalling example: the lambda captures `c` by value.
// `c` has type struct C, which contains a B, which inherits from A — so
// the compiler must recursively traverse this nested object to append
// its fields as OpenCL kernel arguments, then reconstruct it on the
// device side. This is why compiling C++AMP to OpenCL is not trivial.
struct A { int a; };struct B : A { int b; };struct C { B b; int c; };
struct C c;
c.c = 100;
auto fn = [=] () { int qq = c.c; };
© Copyright 2014 HSA Foundation. All Rights Reserved
TRANSLATION
// C++ AMP source (repeated translation slide): input to the compiler.
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
// Generated OpenCL kernel: the compiler turned the captured variables
// into OpenCL arguments and populated index<2> from get_global_id;
// the runtime provides the implicit memory management.
// NOTE(review): C is indexed with wA; since C's row width is wB this is
// only correct when wA == wB (square matrices) — verify.
__kernel void
matrixMul(__global float* C, __global float* A,
__global float* B, int wA, int wB){
int tx = get_global_id(0);
int ty = get_global_id(1);
float value = 0;
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
C[ty * wA + tx] = value;}
● Compiler
● Turn captured variables into
OpenCL arguments
● Populate the index<N> in OCL
kernel
● Runtime
● Implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved
QUESTIONS?
© Copyright 2014 HSA Foundation. All Rights Reserved