1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg
11 // GPU mapping strategy.
13 //===----------------------------------------------------------------------===//
15 #include "polly/CodeGen/IslNodeBuilder.h"
16 #include "polly/CodeGen/Utils.h"
17 #include "polly/DependenceInfo.h"
18 #include "polly/LinkAllPasses.h"
19 #include "polly/Options.h"
20 #include "polly/ScopInfo.h"
21 #include "llvm/Analysis/AliasAnalysis.h"
22 #include "llvm/Analysis/BasicAliasAnalysis.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/PostDominators.h"
25 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
27 #include "isl/union_map.h"
30 #include "ppcg/cuda.h"
32 #include "ppcg/gpu_print.h"
33 #include "ppcg/ppcg.h"
34 #include "ppcg/schedule.h"
37 #include "llvm/Support/Debug.h"
39 using namespace polly
;
42 #define DEBUG_TYPE "polly-codegen-ppcg"
44 static cl::opt
<bool> DumpSchedule("polly-acc-dump-schedule",
45 cl::desc("Dump the computed GPU Schedule"),
46 cl::Hidden
, cl::init(false), cl::ZeroOrMore
,
47 cl::cat(PollyCategory
));
50 DumpCode("polly-acc-dump-code",
51 cl::desc("Dump C code describing the GPU mapping"), cl::Hidden
,
52 cl::init(false), cl::ZeroOrMore
, cl::cat(PollyCategory
));
54 static cl::opt
<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
55 cl::desc("Dump the kernel LLVM-IR"),
56 cl::Hidden
, cl::init(false), cl::ZeroOrMore
,
57 cl::cat(PollyCategory
));
59 /// Create the ast expressions for a ScopStmt.
61 /// This function is a callback used to generate the ast expressions for each
62 /// of the scheduled ScopStmts.
63 static __isl_give isl_id_to_ast_expr
*pollyBuildAstExprForStmt(
64 void *Stmt
, isl_ast_build
*Build
,
65 isl_multi_pw_aff
*(*FunctionIndex
)(__isl_take isl_multi_pw_aff
*MPA
,
66 isl_id
*Id
, void *User
),
68 isl_ast_expr
*(*FunctionExpr
)(isl_ast_expr
*Expr
, isl_id
*Id
, void *User
),
71 // TODO: Implement the AST expression generation. For now we just return a
72 // nullptr to ensure that we do not free uninitialized pointers.
77 /// Generate code for a GPU specific isl AST.
79 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
80 /// generates code for general-purpose AST nodes, with special functionality
81 /// for generating GPU specific user nodes.
83 /// @see GPUNodeBuilder::createUser
84 class GPUNodeBuilder
: public IslNodeBuilder
{
86 GPUNodeBuilder(PollyIRBuilder
&Builder
, ScopAnnotator
&Annotator
, Pass
*P
,
87 const DataLayout
&DL
, LoopInfo
&LI
, ScalarEvolution
&SE
,
88 DominatorTree
&DT
, Scop
&S
, gpu_prog
*Prog
)
89 : IslNodeBuilder(Builder
, Annotator
, P
, DL
, LI
, SE
, DT
, S
), Prog(Prog
) {}
92 /// A module containing GPU code.
94 /// This pointer is only set in case we are currently generating GPU code.
95 std::unique_ptr
<Module
> GPUModule
;
97 /// The GPU program we generate code for.
100 /// Class to free isl_ids.
103 void operator()(__isl_take isl_id
*Id
) { isl_id_free(Id
); };
106 /// A set containing all isl_ids allocated in a GPU kernel.
108 /// By releasing this set all isl_ids will be freed.
109 std::set
<std::unique_ptr
<isl_id
, IslIdDeleter
>> KernelIDs
;
111 /// Create code for user-defined AST nodes.
113 /// These AST nodes can be of type:
115 /// - ScopStmt: A computational statement (TODO)
116 /// - Kernel: A GPU kernel call (TODO)
117 /// - Data-Transfer: A GPU <-> CPU data-transfer (TODO)
118 /// - In-kernel synchronization
119 /// - In-kernel memory copy statement
121 /// @param UserStmt The ast node to generate code for.
122 virtual void createUser(__isl_take isl_ast_node
*UserStmt
);
124 /// Create GPU kernel.
126 /// Code generate the kernel described by @p KernelStmt.
128 /// @param KernelStmt The ast node to generate kernel code for.
129 void createKernel(__isl_take isl_ast_node
*KernelStmt
);
131 /// Create kernel function.
133 /// Create a kernel function located in a newly created module that can serve
134 /// as target for device code generation. Set the Builder to point to the
135 /// start block of this newly created function.
137 /// @param Kernel The kernel to generate code for.
138 void createKernelFunction(ppcg_kernel
*Kernel
);
140 /// Create the declaration of a kernel function.
142 /// The kernel function takes as arguments:
144 /// - One i8 pointer for each external array reference used in the kernel.
147 /// - Other LLVM Value references (TODO)
149 /// @param Kernel The kernel to generate the function declaration for.
150 /// @returns The newly declared function.
151 Function
*createKernelFunctionDecl(ppcg_kernel
*Kernel
);
153 /// Insert intrinsic functions to obtain thread and block ids.
155 /// @param Kernel The kernel to generate the intrinsic functions for.
156 void insertKernelIntrinsics(ppcg_kernel
*Kernel
);
158 /// Create an in-kernel synchronization call.
159 void createKernelSync();
161 /// Finalize the generation of the kernel function.
163 /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
164 /// dump its IR to stdout.
165 void finalizeKernelFunction();
/// Check if one string is a prefix of another.
///
/// An empty @p Prefix is a prefix of every string.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
///
/// @returns True if @p String starts with @p Prefix, false otherwise.
static bool isPrefix(const std::string &String, const std::string &Prefix) {
  // Compare only the leading Prefix.size() characters. A find()-based test
  // would keep scanning the rest of String after the first mismatch.
  return String.compare(0, Prefix.size(), Prefix) == 0;
}
176 void GPUNodeBuilder::createUser(__isl_take isl_ast_node
*UserStmt
) {
177 isl_ast_expr
*Expr
= isl_ast_node_user_get_expr(UserStmt
);
178 isl_ast_expr
*StmtExpr
= isl_ast_expr_get_op_arg(Expr
, 0);
179 isl_id
*Id
= isl_ast_expr_get_id(StmtExpr
);
181 isl_ast_expr_free(StmtExpr
);
183 const char *Str
= isl_id_get_name(Id
);
184 if (!strcmp(Str
, "kernel")) {
185 createKernel(UserStmt
);
186 isl_ast_expr_free(Expr
);
190 if (isPrefix(Str
, "to_device") || isPrefix(Str
, "from_device")) {
191 // TODO: Insert memory copies
192 isl_ast_expr_free(Expr
);
193 isl_ast_node_free(UserStmt
);
197 isl_id
*Anno
= isl_ast_node_get_annotation(UserStmt
);
198 struct ppcg_kernel_stmt
*KernelStmt
=
199 (struct ppcg_kernel_stmt
*)isl_id_get_user(Anno
);
202 switch (KernelStmt
->type
) {
203 case ppcg_kernel_domain
:
204 // TODO Create kernel user stmt
205 isl_ast_expr_free(Expr
);
206 isl_ast_node_free(UserStmt
);
208 case ppcg_kernel_copy
:
209 // TODO: Create kernel copy stmt
210 isl_ast_expr_free(Expr
);
211 isl_ast_node_free(UserStmt
);
213 case ppcg_kernel_sync
:
215 isl_ast_expr_free(Expr
);
216 isl_ast_node_free(UserStmt
);
220 isl_ast_expr_free(Expr
);
221 isl_ast_node_free(UserStmt
);
225 void GPUNodeBuilder::createKernelSync() {
226 Module
*M
= Builder
.GetInsertBlock()->getParent()->getParent();
227 auto *Sync
= Intrinsic::getDeclaration(M
, Intrinsic::nvvm_barrier0
);
228 Builder
.CreateCall(Sync
, {});
231 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node
*KernelStmt
) {
232 isl_id
*Id
= isl_ast_node_get_annotation(KernelStmt
);
233 ppcg_kernel
*Kernel
= (ppcg_kernel
*)isl_id_get_user(Id
);
235 isl_ast_node_free(KernelStmt
);
237 assert(Kernel
->tree
&& "Device AST of kernel node is empty");
239 Instruction
&HostInsertPoint
= *Builder
.GetInsertPoint();
240 IslExprBuilder::IDToValueTy HostIDs
= IDToValue
;
242 createKernelFunction(Kernel
);
244 create(isl_ast_node_copy(Kernel
->tree
));
246 Builder
.SetInsertPoint(&HostInsertPoint
);
249 finalizeKernelFunction();
/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
///
/// @returns The data layout string to install on the GPU module.
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "e";

  // NOTE(review): the 32-bit pointer clause mirrors the layout string used by
  // the NVPTX backend itself. The only caller in this file passes true, so
  // confirm this form before relying on the 32-bit result.
  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}
266 Function
*GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel
*Kernel
) {
267 std::vector
<Type
*> Args
;
268 std::string Identifier
= "kernel_" + std::to_string(Kernel
->id
);
270 for (long i
= 0; i
< Prog
->n_array
; i
++) {
271 if (!ppcg_kernel_requires_array_argument(Kernel
, i
))
274 Args
.push_back(Builder
.getInt8PtrTy());
277 int NumHostIters
= isl_space_dim(Kernel
->space
, isl_dim_set
);
279 for (long i
= 0; i
< NumHostIters
; i
++)
280 Args
.push_back(Builder
.getInt64Ty());
282 int NumVars
= isl_space_dim(Kernel
->space
, isl_dim_param
);
284 for (long i
= 0; i
< NumVars
; i
++)
285 Args
.push_back(Builder
.getInt64Ty());
287 auto *FT
= FunctionType::get(Builder
.getVoidTy(), Args
, false);
288 auto *FN
= Function::Create(FT
, Function::ExternalLinkage
, Identifier
,
290 FN
->setCallingConv(CallingConv::PTX_Kernel
);
292 auto Arg
= FN
->arg_begin();
293 for (long i
= 0; i
< Kernel
->n_array
; i
++) {
294 if (!ppcg_kernel_requires_array_argument(Kernel
, i
))
297 Arg
->setName(Prog
->array
[i
].name
);
301 for (long i
= 0; i
< NumHostIters
; i
++) {
302 isl_id
*Id
= isl_space_get_dim_id(Kernel
->space
, isl_dim_set
, i
);
303 Arg
->setName(isl_id_get_name(Id
));
304 IDToValue
[Id
] = &*Arg
;
305 KernelIDs
.insert(std::unique_ptr
<isl_id
, IslIdDeleter
>(Id
));
309 for (long i
= 0; i
< NumVars
; i
++) {
310 isl_id
*Id
= isl_space_get_dim_id(Kernel
->space
, isl_dim_param
, i
);
311 Arg
->setName(isl_id_get_name(Id
));
312 IDToValue
[Id
] = &*Arg
;
313 KernelIDs
.insert(std::unique_ptr
<isl_id
, IslIdDeleter
>(Id
));
320 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel
*Kernel
) {
321 Intrinsic::ID IntrinsicsBID
[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x
,
322 Intrinsic::nvvm_read_ptx_sreg_ctaid_y
};
324 Intrinsic::ID IntrinsicsTID
[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x
,
325 Intrinsic::nvvm_read_ptx_sreg_tid_y
,
326 Intrinsic::nvvm_read_ptx_sreg_tid_z
};
328 auto addId
= [this](__isl_take isl_id
*Id
, Intrinsic::ID Intr
) mutable {
329 std::string Name
= isl_id_get_name(Id
);
330 Module
*M
= Builder
.GetInsertBlock()->getParent()->getParent();
331 Function
*IntrinsicFn
= Intrinsic::getDeclaration(M
, Intr
);
332 Value
*Val
= Builder
.CreateCall(IntrinsicFn
, {});
333 Val
= Builder
.CreateIntCast(Val
, Builder
.getInt64Ty(), false, Name
);
335 KernelIDs
.insert(std::unique_ptr
<isl_id
, IslIdDeleter
>(Id
));
338 for (int i
= 0; i
< Kernel
->n_grid
; ++i
) {
339 isl_id
*Id
= isl_id_list_get_id(Kernel
->block_ids
, i
);
340 addId(Id
, IntrinsicsBID
[i
]);
343 for (int i
= 0; i
< Kernel
->n_block
; ++i
) {
344 isl_id
*Id
= isl_id_list_get_id(Kernel
->thread_ids
, i
);
345 addId(Id
, IntrinsicsTID
[i
]);
349 void GPUNodeBuilder::createKernelFunction(ppcg_kernel
*Kernel
) {
351 std::string Identifier
= "kernel_" + std::to_string(Kernel
->id
);
352 GPUModule
.reset(new Module(Identifier
, Builder
.getContext()));
353 GPUModule
->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
354 GPUModule
->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
356 Function
*FN
= createKernelFunctionDecl(Kernel
);
358 BasicBlock
*PrevBlock
= Builder
.GetInsertBlock();
359 auto EntryBlock
= BasicBlock::Create(Builder
.getContext(), "entry", FN
);
361 DominatorTree
&DT
= P
->getAnalysis
<DominatorTreeWrapperPass
>().getDomTree();
362 DT
.addNewBlock(EntryBlock
, PrevBlock
);
364 Builder
.SetInsertPoint(EntryBlock
);
365 Builder
.CreateRetVoid();
366 Builder
.SetInsertPoint(EntryBlock
, EntryBlock
->begin());
368 insertKernelIntrinsics(Kernel
);
371 void GPUNodeBuilder::finalizeKernelFunction() {
374 outs() << *GPUModule
<< "\n";
381 class PPCGCodeGeneration
: public ScopPass
{
385 /// The scop that is currently processed.
391 const DataLayout
*DL
;
394 PPCGCodeGeneration() : ScopPass(ID
) {}
396 /// Construct compilation options for PPCG.
398 /// @returns The compilation options.
399 ppcg_options
*createPPCGOptions() {
401 (ppcg_debug_options
*)malloc(sizeof(ppcg_debug_options
));
402 auto Options
= (ppcg_options
*)malloc(sizeof(ppcg_options
));
404 DebugOptions
->dump_schedule_constraints
= false;
405 DebugOptions
->dump_schedule
= false;
406 DebugOptions
->dump_final_schedule
= false;
407 DebugOptions
->dump_sizes
= false;
409 Options
->debug
= DebugOptions
;
411 Options
->reschedule
= true;
412 Options
->scale_tile_loops
= false;
413 Options
->wrap
= false;
415 Options
->non_negative_parameters
= false;
416 Options
->ctx
= nullptr;
417 Options
->sizes
= nullptr;
419 Options
->tile_size
= 32;
421 Options
->use_private_memory
= false;
422 Options
->use_shared_memory
= false;
423 Options
->max_shared_memory
= 0;
425 Options
->target
= PPCG_TARGET_CUDA
;
426 Options
->openmp
= false;
427 Options
->linearize_device_arrays
= true;
428 Options
->live_range_reordering
= false;
430 Options
->opencl_compiler_options
= nullptr;
431 Options
->opencl_use_gpu
= false;
432 Options
->opencl_n_include_file
= 0;
433 Options
->opencl_include_files
= nullptr;
434 Options
->opencl_print_kernel_types
= false;
435 Options
->opencl_embed_kernel_code
= false;
437 Options
->save_schedule_file
= nullptr;
438 Options
->load_schedule_file
= nullptr;
443 /// Get a tagged access relation containing all accesses of type @p AccessTy.
445 /// Instead of a normal access of the form:
447 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
449 /// a tagged access has the form
451 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
453 /// where 'id' is an additional space that references the memory access that
454 /// triggered the access.
456 /// @param AccessTy The type of the memory accesses to collect.
458 /// @return The relation describing all tagged memory accesses.
459 isl_union_map
*getTaggedAccesses(enum MemoryAccess::AccessType AccessTy
) {
460 isl_union_map
*Accesses
= isl_union_map_empty(S
->getParamSpace());
462 for (auto &Stmt
: *S
)
463 for (auto &Acc
: Stmt
)
464 if (Acc
->getType() == AccessTy
) {
465 isl_map
*Relation
= Acc
->getAccessRelation();
466 Relation
= isl_map_intersect_domain(Relation
, Stmt
.getDomain());
468 isl_space
*Space
= isl_map_get_space(Relation
);
469 Space
= isl_space_range(Space
);
470 Space
= isl_space_from_range(Space
);
471 Space
= isl_space_set_tuple_id(Space
, isl_dim_in
, Acc
->getId());
472 isl_map
*Universe
= isl_map_universe(Space
);
473 Relation
= isl_map_domain_product(Relation
, Universe
);
474 Accesses
= isl_union_map_add_map(Accesses
, Relation
);
480 /// Get the set of all read accesses, tagged with the access id.
482 /// @see getTaggedAccesses
483 isl_union_map
*getTaggedReads() {
484 return getTaggedAccesses(MemoryAccess::READ
);
487 /// Get the set of all may (and must) accesses, tagged with the access id.
489 /// @see getTaggedAccesses
490 isl_union_map
*getTaggedMayWrites() {
491 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE
),
492 getTaggedAccesses(MemoryAccess::MUST_WRITE
));
495 /// Get the set of all must accesses, tagged with the access id.
497 /// @see getTaggedAccesses
498 isl_union_map
*getTaggedMustWrites() {
499 return getTaggedAccesses(MemoryAccess::MUST_WRITE
);
502 /// Collect parameter and array names as isl_ids.
504 /// To reason about the different parameters and arrays used, ppcg requires
505 /// a list of all isl_ids in use. As PPCG traditionally performs
506 /// source-to-source compilation each of these isl_ids is mapped to the
507 /// expression that represents it. As we do not have a corresponding
508 /// expression in Polly, we just map each id to a 'zero' expression to match
509 /// the data format that ppcg expects.
511 /// @returns Retun a map from collected ids to 'zero' ast expressions.
512 __isl_give isl_id_to_ast_expr
*getNames() {
513 auto *Names
= isl_id_to_ast_expr_alloc(
515 S
->getNumParams() + std::distance(S
->array_begin(), S
->array_end()));
516 auto *Zero
= isl_ast_expr_from_val(isl_val_zero(S
->getIslCtx()));
517 auto *Space
= S
->getParamSpace();
519 for (int I
= 0, E
= S
->getNumParams(); I
< E
; ++I
) {
520 isl_id
*Id
= isl_space_get_dim_id(Space
, isl_dim_param
, I
);
521 Names
= isl_id_to_ast_expr_set(Names
, Id
, isl_ast_expr_copy(Zero
));
524 for (auto &Array
: S
->arrays()) {
525 auto Id
= Array
.second
->getBasePtrId();
526 Names
= isl_id_to_ast_expr_set(Names
, Id
, isl_ast_expr_copy(Zero
));
529 isl_space_free(Space
);
530 isl_ast_expr_free(Zero
);
535 /// Create a new PPCG scop from the current scop.
537 /// The PPCG scop is initialized with data from the current polly::Scop. From
538 /// this initial data, the data-dependences in the PPCG scop are initialized.
539 /// We do not use Polly's dependence analysis for now, to ensure we match
540 /// the PPCG default behaviour more closely.
542 /// @returns A new ppcg scop.
543 ppcg_scop
*createPPCGScop() {
544 auto PPCGScop
= (ppcg_scop
*)malloc(sizeof(ppcg_scop
));
546 PPCGScop
->options
= createPPCGOptions();
551 PPCGScop
->context
= S
->getContext();
552 PPCGScop
->domain
= S
->getDomains();
553 PPCGScop
->call
= nullptr;
554 PPCGScop
->tagged_reads
= getTaggedReads();
555 PPCGScop
->reads
= S
->getReads();
556 PPCGScop
->live_in
= nullptr;
557 PPCGScop
->tagged_may_writes
= getTaggedMayWrites();
558 PPCGScop
->may_writes
= S
->getWrites();
559 PPCGScop
->tagged_must_writes
= getTaggedMustWrites();
560 PPCGScop
->must_writes
= S
->getMustWrites();
561 PPCGScop
->live_out
= nullptr;
562 PPCGScop
->tagged_must_kills
= isl_union_map_empty(S
->getParamSpace());
563 PPCGScop
->tagger
= nullptr;
565 PPCGScop
->independence
= nullptr;
566 PPCGScop
->dep_flow
= nullptr;
567 PPCGScop
->tagged_dep_flow
= nullptr;
568 PPCGScop
->dep_false
= nullptr;
569 PPCGScop
->dep_forced
= nullptr;
570 PPCGScop
->dep_order
= nullptr;
571 PPCGScop
->tagged_dep_order
= nullptr;
573 PPCGScop
->schedule
= S
->getScheduleTree();
574 PPCGScop
->names
= getNames();
576 PPCGScop
->pet
= nullptr;
578 compute_tagger(PPCGScop
);
579 compute_dependences(PPCGScop
);
584 /// Collect the array acesses in a statement.
586 /// @param Stmt The statement for which to collect the accesses.
588 /// @returns A list of array accesses.
589 gpu_stmt_access
*getStmtAccesses(ScopStmt
&Stmt
) {
590 gpu_stmt_access
*Accesses
= nullptr;
592 for (MemoryAccess
*Acc
: Stmt
) {
593 auto Access
= isl_alloc_type(S
->getIslCtx(), struct gpu_stmt_access
);
594 Access
->read
= Acc
->isRead();
595 Access
->write
= Acc
->isWrite();
596 Access
->access
= Acc
->getAccessRelation();
597 isl_space
*Space
= isl_map_get_space(Access
->access
);
598 Space
= isl_space_range(Space
);
599 Space
= isl_space_from_range(Space
);
600 Space
= isl_space_set_tuple_id(Space
, isl_dim_in
, Acc
->getId());
601 isl_map
*Universe
= isl_map_universe(Space
);
602 Access
->tagged_access
=
603 isl_map_domain_product(Acc
->getAccessRelation(), Universe
);
604 Access
->exact_write
= Acc
->isWrite();
605 Access
->ref_id
= Acc
->getId();
606 Access
->next
= Accesses
;
613 /// Collect the list of GPU statements.
615 /// Each statement has an id, a pointer to the underlying data structure,
616 /// as well as a list with all memory accesses.
618 /// TODO: Initialize the list of memory accesses.
620 /// @returns A linked-list of statements.
621 gpu_stmt
*getStatements() {
622 gpu_stmt
*Stmts
= isl_calloc_array(S
->getIslCtx(), struct gpu_stmt
,
623 std::distance(S
->begin(), S
->end()));
626 for (auto &Stmt
: *S
) {
627 gpu_stmt
*GPUStmt
= &Stmts
[i
];
629 GPUStmt
->id
= Stmt
.getDomainId();
631 // We use the pet stmt pointer to keep track of the Polly statements.
632 GPUStmt
->stmt
= (pet_stmt
*)&Stmt
;
633 GPUStmt
->accesses
= getStmtAccesses(Stmt
);
640 /// Derive the extent of an array.
642 /// The extent of an array is defined by the set of memory locations for
643 /// which a memory access in the iteration domain exists.
645 /// @param Array The array to derive the extent for.
647 /// @returns An isl_set describing the extent of the array.
648 __isl_give isl_set
*getExtent(ScopArrayInfo
*Array
) {
649 isl_union_map
*Accesses
= S
->getAccesses();
650 Accesses
= isl_union_map_intersect_domain(Accesses
, S
->getDomains());
651 isl_union_set
*AccessUSet
= isl_union_map_range(Accesses
);
653 isl_union_set_extract_set(AccessUSet
, Array
->getSpace());
654 isl_union_set_free(AccessUSet
);
659 /// Derive the bounds of an array.
661 /// For the first dimension we derive the bound of the array from the extent
662 /// of this dimension. For inner dimensions we obtain their size directly from
665 /// @param PPCGArray The array to compute bounds for.
666 /// @param Array The polly array from which to take the information.
667 void setArrayBounds(gpu_array_info
&PPCGArray
, ScopArrayInfo
*Array
) {
668 if (PPCGArray
.n_index
> 0) {
669 isl_set
*Dom
= isl_set_copy(PPCGArray
.extent
);
670 Dom
= isl_set_project_out(Dom
, isl_dim_set
, 1, PPCGArray
.n_index
- 1);
671 isl_pw_aff
*Bound
= isl_set_dim_max(isl_set_copy(Dom
), 0);
673 Dom
= isl_pw_aff_domain(isl_pw_aff_copy(Bound
));
674 isl_local_space
*LS
= isl_local_space_from_space(isl_set_get_space(Dom
));
675 isl_aff
*One
= isl_aff_zero_on_domain(LS
);
676 One
= isl_aff_add_constant_si(One
, 1);
677 Bound
= isl_pw_aff_add(Bound
, isl_pw_aff_alloc(Dom
, One
));
678 Bound
= isl_pw_aff_gist(Bound
, S
->getContext());
679 PPCGArray
.bound
[0] = Bound
;
682 for (unsigned i
= 1; i
< PPCGArray
.n_index
; ++i
) {
683 isl_pw_aff
*Bound
= Array
->getDimensionSizePw(i
);
684 auto LS
= isl_pw_aff_get_domain_space(Bound
);
685 auto Aff
= isl_multi_aff_zero(LS
);
686 Bound
= isl_pw_aff_pullback_multi_aff(Bound
, Aff
);
687 PPCGArray
.bound
[i
] = Bound
;
691 /// Create the arrays for @p PPCGProg.
693 /// @param PPCGProg The program to compute the arrays for.
694 void createArrays(gpu_prog
*PPCGProg
) {
696 for (auto &Element
: S
->arrays()) {
697 ScopArrayInfo
*Array
= Element
.second
.get();
699 std::string TypeName
;
700 raw_string_ostream
OS(TypeName
);
702 OS
<< *Array
->getElementType();
705 gpu_array_info
&PPCGArray
= PPCGProg
->array
[i
];
707 PPCGArray
.space
= Array
->getSpace();
708 PPCGArray
.type
= strdup(TypeName
.c_str());
709 PPCGArray
.size
= Array
->getElementType()->getPrimitiveSizeInBits() / 8;
710 PPCGArray
.name
= strdup(Array
->getName().c_str());
711 PPCGArray
.extent
= nullptr;
712 PPCGArray
.n_index
= Array
->getNumberOfDimensions();
714 isl_alloc_array(S
->getIslCtx(), isl_pw_aff
*, PPCGArray
.n_index
);
715 PPCGArray
.extent
= getExtent(Array
);
717 PPCGArray
.refs
= nullptr;
718 PPCGArray
.accessed
= true;
719 PPCGArray
.read_only_scalar
= false;
720 PPCGArray
.has_compound_element
= false;
721 PPCGArray
.local
= false;
722 PPCGArray
.declare_local
= false;
723 PPCGArray
.global
= false;
724 PPCGArray
.linearize
= false;
725 PPCGArray
.dep_order
= nullptr;
727 setArrayBounds(PPCGArray
, Array
);
730 collect_references(PPCGProg
, &PPCGArray
);
734 /// Create an identity map between the arrays in the scop.
736 /// @returns An identity map between the arrays in the scop.
737 isl_union_map
*getArrayIdentity() {
738 isl_union_map
*Maps
= isl_union_map_empty(S
->getParamSpace());
740 for (auto &Item
: S
->arrays()) {
741 ScopArrayInfo
*Array
= Item
.second
.get();
742 isl_space
*Space
= Array
->getSpace();
743 Space
= isl_space_map_from_set(Space
);
744 isl_map
*Identity
= isl_map_identity(Space
);
745 Maps
= isl_union_map_add_map(Maps
, Identity
);
751 /// Create a default-initialized PPCG GPU program.
753 /// @returns A new GPU program description.
754 gpu_prog
*createPPCGProg(ppcg_scop
*PPCGScop
) {
759 auto PPCGProg
= isl_calloc_type(S
->getIslCtx(), struct gpu_prog
);
761 PPCGProg
->ctx
= S
->getIslCtx();
762 PPCGProg
->scop
= PPCGScop
;
763 PPCGProg
->context
= isl_set_copy(PPCGScop
->context
);
764 PPCGProg
->read
= isl_union_map_copy(PPCGScop
->reads
);
765 PPCGProg
->may_write
= isl_union_map_copy(PPCGScop
->may_writes
);
766 PPCGProg
->must_write
= isl_union_map_copy(PPCGScop
->must_writes
);
767 PPCGProg
->tagged_must_kill
=
768 isl_union_map_copy(PPCGScop
->tagged_must_kills
);
769 PPCGProg
->to_inner
= getArrayIdentity();
770 PPCGProg
->to_outer
= getArrayIdentity();
771 PPCGProg
->may_persist
= compute_may_persist(PPCGProg
);
772 PPCGProg
->any_to_outer
= nullptr;
773 PPCGProg
->array_order
= nullptr;
774 PPCGProg
->n_stmts
= std::distance(S
->begin(), S
->end());
775 PPCGProg
->stmts
= getStatements();
776 PPCGProg
->n_array
= std::distance(S
->array_begin(), S
->array_end());
777 PPCGProg
->array
= isl_calloc_array(S
->getIslCtx(), struct gpu_array_info
,
780 createArrays(PPCGProg
);
785 struct PrintGPUUserData
{
786 struct cuda_info
*CudaInfo
;
787 struct gpu_prog
*PPCGProg
;
788 std::vector
<ppcg_kernel
*> Kernels
;
791 /// Print a user statement node in the host code.
793 /// We use ppcg's printing facilities to print the actual statement and
794 /// additionally build up a list of all kernels that are encountered in the
797 /// @param P The printer to print to
798 /// @param Options The printing options to use
799 /// @param Node The node to print
800 /// @param User A user pointer to carry additional data. This pointer is
801 /// expected to be of type PrintGPUUserData.
803 /// @returns A printer to which the output has been printed.
804 static __isl_give isl_printer
*
805 printHostUser(__isl_take isl_printer
*P
,
806 __isl_take isl_ast_print_options
*Options
,
807 __isl_take isl_ast_node
*Node
, void *User
) {
808 auto Data
= (struct PrintGPUUserData
*)User
;
809 auto Id
= isl_ast_node_get_annotation(Node
);
812 bool IsUser
= !strcmp(isl_id_get_name(Id
), "user");
814 // If this is a user statement, format it ourselves as ppcg would
815 // otherwise try to call pet functionality that is not available in
818 P
= isl_printer_start_line(P
);
819 P
= isl_printer_print_ast_node(P
, Node
);
820 P
= isl_printer_end_line(P
);
822 isl_ast_print_options_free(Options
);
826 auto Kernel
= (struct ppcg_kernel
*)isl_id_get_user(Id
);
828 Data
->Kernels
.push_back(Kernel
);
831 return print_host_user(P
, Options
, Node
, User
);
834 /// Print C code corresponding to the control flow in @p Kernel.
836 /// @param Kernel The kernel to print
837 void printKernel(ppcg_kernel
*Kernel
) {
838 auto *P
= isl_printer_to_str(S
->getIslCtx());
839 P
= isl_printer_set_output_format(P
, ISL_FORMAT_C
);
840 auto *Options
= isl_ast_print_options_alloc(S
->getIslCtx());
841 P
= isl_ast_node_print(Kernel
->tree
, P
, Options
);
842 char *String
= isl_printer_get_str(P
);
843 printf("%s\n", String
);
848 /// Print C code corresponding to the GPU code described by @p Tree.
850 /// @param Tree An AST describing GPU code
851 /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
852 void printGPUTree(isl_ast_node
*Tree
, gpu_prog
*PPCGProg
) {
853 auto *P
= isl_printer_to_str(S
->getIslCtx());
854 P
= isl_printer_set_output_format(P
, ISL_FORMAT_C
);
856 PrintGPUUserData Data
;
857 Data
.PPCGProg
= PPCGProg
;
859 auto *Options
= isl_ast_print_options_alloc(S
->getIslCtx());
861 isl_ast_print_options_set_print_user(Options
, printHostUser
, &Data
);
862 P
= isl_ast_node_print(Tree
, P
, Options
);
863 char *String
= isl_printer_get_str(P
);
865 printf("%s\n", String
);
869 for (auto Kernel
: Data
.Kernels
) {
870 printf("# kernel%d\n", Kernel
->id
);
875 // Generate a GPU program using PPCG.
877 // GPU mapping consists of multiple steps:
879 // 1) Compute new schedule for the program.
880 // 2) Map schedule to GPU (TODO)
881 // 3) Generate code for new schedule (TODO)
883 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer
884 // is mostly CPU specific. Instead, we use PPCG's GPU code generation
885 // strategy directly from this pass.
886 gpu_gen
*generateGPU(ppcg_scop
*PPCGScop
, gpu_prog
*PPCGProg
) {
888 auto PPCGGen
= isl_calloc_type(S
->getIslCtx(), struct gpu_gen
);
890 PPCGGen
->ctx
= S
->getIslCtx();
891 PPCGGen
->options
= PPCGScop
->options
;
892 PPCGGen
->print
= nullptr;
893 PPCGGen
->print_user
= nullptr;
894 PPCGGen
->build_ast_expr
= &pollyBuildAstExprForStmt
;
895 PPCGGen
->prog
= PPCGProg
;
896 PPCGGen
->tree
= nullptr;
897 PPCGGen
->types
.n
= 0;
898 PPCGGen
->types
.name
= nullptr;
899 PPCGGen
->sizes
= nullptr;
900 PPCGGen
->used_sizes
= nullptr;
901 PPCGGen
->kernel_id
= 0;
903 // Set scheduling strategy to same strategy PPCG is using.
904 isl_options_set_schedule_outer_coincidence(PPCGGen
->ctx
, true);
905 isl_options_set_schedule_maximize_band_depth(PPCGGen
->ctx
, true);
906 isl_options_set_schedule_whole_component(PPCGGen
->ctx
, false);
908 isl_schedule
*Schedule
= get_schedule(PPCGGen
);
910 int has_permutable
= has_any_permutable_node(Schedule
);
912 if (!has_permutable
|| has_permutable
< 0) {
913 Schedule
= isl_schedule_free(Schedule
);
915 Schedule
= map_to_device(PPCGGen
, Schedule
);
916 PPCGGen
->tree
= generate_code(PPCGGen
, isl_schedule_copy(Schedule
));
920 isl_printer
*P
= isl_printer_to_str(S
->getIslCtx());
921 P
= isl_printer_set_yaml_style(P
, ISL_YAML_STYLE_BLOCK
);
922 P
= isl_printer_print_str(P
, "Schedule\n");
923 P
= isl_printer_print_str(P
, "========\n");
925 P
= isl_printer_print_schedule(P
, Schedule
);
927 P
= isl_printer_print_str(P
, "No schedule found\n");
929 printf("%s\n", isl_printer_get_str(P
));
937 printGPUTree(PPCGGen
->tree
, PPCGProg
);
939 printf("No code generated\n");
942 isl_schedule_free(Schedule
);
947 /// Free gpu_gen structure.
949 /// @param PPCGGen The ppcg_gen object to free.
950 void freePPCGGen(gpu_gen
*PPCGGen
) {
951 isl_ast_node_free(PPCGGen
->tree
);
952 isl_union_map_free(PPCGGen
->sizes
);
953 isl_union_map_free(PPCGGen
->used_sizes
);
957 /// Free the options in the ppcg scop structure.
959 /// ppcg is not freeing these options for us. To avoid leaks we do this
962 /// @param PPCGScop The scop referencing the options to free.
963 void freeOptions(ppcg_scop
*PPCGScop
) {
964 free(PPCGScop
->options
->debug
);
965 PPCGScop
->options
->debug
= nullptr;
966 free(PPCGScop
->options
);
967 PPCGScop
->options
= nullptr;
970 /// Generate code for a given GPU AST described by @p Root.
972 /// @param Root An isl_ast_node pointing to the root of the GPU AST.
973 /// @param Prog The GPU Program to generate code for.
974 void generateCode(__isl_take isl_ast_node
*Root
, gpu_prog
*Prog
) {
975 ScopAnnotator Annotator
;
976 Annotator
.buildAliasScopes(*S
);
978 Region
*R
= &S
->getRegion();
980 simplifyRegion(R
, DT
, LI
, RI
);
982 BasicBlock
*EnteringBB
= R
->getEnteringBlock();
984 PollyIRBuilder Builder
= createPollyIRBuilder(EnteringBB
, Annotator
);
986 GPUNodeBuilder
NodeBuilder(Builder
, Annotator
, this, *DL
, *LI
, *SE
, *DT
, *S
,
989 // Only build the run-time condition and parameters _after_ having
990 // introduced the conditional branch. This is important as the conditional
991 // branch will guard the original scop from new induction variables that
992 // the SCEVExpander may introduce while code generating the parameters and
993 // which may introduce scalar dependences that prevent us from correctly
994 // code generating this scop.
995 BasicBlock
*StartBlock
=
996 executeScopConditionally(*S
, this, Builder
.getTrue());
999 // TODO: Verify run-time checks
1000 auto SplitBlock
= StartBlock
->getSinglePredecessor();
1001 Builder
.SetInsertPoint(SplitBlock
->getTerminator());
1002 NodeBuilder
.addParameters(S
->getContext());
1003 Builder
.SetInsertPoint(&*StartBlock
->begin());
1004 NodeBuilder
.create(Root
);
1005 NodeBuilder
.finalizeSCoP(*S
);
1008 bool runOnScop(Scop
&CurrentScop
) override
{
1010 LI
= &getAnalysis
<LoopInfoWrapperPass
>().getLoopInfo();
1011 DT
= &getAnalysis
<DominatorTreeWrapperPass
>().getDomTree();
1012 SE
= &getAnalysis
<ScalarEvolutionWrapperPass
>().getSE();
1013 DL
= &S
->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
1014 RI
= &getAnalysis
<RegionInfoPass
>().getRegionInfo();
1016 // We currently do not support scops with invariant loads.
1017 if (S
->hasInvariantAccesses())
1020 auto PPCGScop
= createPPCGScop();
1021 auto PPCGProg
= createPPCGProg(PPCGScop
);
1022 auto PPCGGen
= generateGPU(PPCGScop
, PPCGProg
);
1025 generateCode(isl_ast_node_copy(PPCGGen
->tree
), PPCGProg
);
1027 freeOptions(PPCGScop
);
1028 freePPCGGen(PPCGGen
);
1029 gpu_prog_free(PPCGProg
);
1030 ppcg_scop_free(PPCGScop
);
1035 void printScop(raw_ostream
&, Scop
&) const override
{}
1037 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
1038 AU
.addRequired
<DominatorTreeWrapperPass
>();
1039 AU
.addRequired
<RegionInfoPass
>();
1040 AU
.addRequired
<ScalarEvolutionWrapperPass
>();
1041 AU
.addRequired
<ScopDetection
>();
1042 AU
.addRequired
<ScopInfoRegionPass
>();
1043 AU
.addRequired
<LoopInfoWrapperPass
>();
1045 AU
.addPreserved
<AAResultsWrapperPass
>();
1046 AU
.addPreserved
<BasicAAWrapperPass
>();
1047 AU
.addPreserved
<LoopInfoWrapperPass
>();
1048 AU
.addPreserved
<DominatorTreeWrapperPass
>();
1049 AU
.addPreserved
<GlobalsAAWrapperPass
>();
1050 AU
.addPreserved
<PostDominatorTreeWrapperPass
>();
1051 AU
.addPreserved
<ScopDetection
>();
1052 AU
.addPreserved
<ScalarEvolutionWrapperPass
>();
1053 AU
.addPreserved
<SCEVAAWrapperPass
>();
1055 // FIXME: We do not yet add regions for the newly generated code to the
1057 AU
.addPreserved
<RegionInfoPass
>();
1058 AU
.addPreserved
<ScopInfoRegionPass
>();
1063 char PPCGCodeGeneration::ID
= 1;
1065 Pass
*polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
1067 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration
, "polly-codegen-ppcg",
1068 "Polly - Apply PPCG translation to SCOP", false, false)
1069 INITIALIZE_PASS_DEPENDENCY(DependenceInfo
);
1070 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass
);
1071 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass
);
1072 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass
);
1073 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass
);
1074 INITIALIZE_PASS_DEPENDENCY(ScopDetection
);
1075 INITIALIZE_PASS_END(PPCGCodeGeneration
, "polly-codegen-ppcg",
1076 "Polly - Apply PPCG translation to SCOP", false, false)