//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. inserting instructions at
/// position IP1 may change the meaning of IP2 or vice-versa. This is because
/// an InsertPoint stores the instruction before something is inserted. For
/// instance, if both point to the same instruction, two IRBuilders alternately
/// creating instructions will cause the instructions to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
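
// Worked example (follows directly from the three helpers above): a
// `schedule(dynamic, 4)` clause with no modifiers and no ordered clause yields
// BaseDynamicChunked, then ModifierUnordered is added (no ordered clause), and
// finally ModifierNonmonotonic (the base is neither static form and there is
// no ordered clause), i.e. the result is
//   OMPScheduleType::UnorderedDynamicChunked | OMPScheduleType::ModifierNonmonotonic.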

/// Emit an implicit cast to convert \p XRead to type of variable \p V
static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead,
                                     llvm::Value *V) {
  // TODO: Add this functionality to the `AtomicInfo` interface
  llvm::Type *XReadType = XRead->getType();
  llvm::Type *VType = V->getType();
  if (llvm::AllocaInst *vAlloca = dyn_cast<llvm::AllocaInst>(V))
    VType = vAlloca->getAllocatedType();

  if (XReadType->isStructTy() && VType->isStructTy())
    // No need to extract or convert. A direct
    // `store` will suffice.
    return XRead;

  if (XReadType->isStructTy())
    XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0);
  if (VType->isIntegerTy() && XReadType->isFloatingPointTy())
    XRead = Builder.CreateFPToSI(XRead, VType);
  else if (VType->isFloatingPointTy() && XReadType->isIntegerTy())
    XRead = Builder.CreateSIToFP(XRead, VType);
  else if (VType->isIntegerTy() && XReadType->isIntegerTy())
    XRead = Builder.CreateIntCast(XRead, VType, true);
  else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy())
    XRead = Builder.CreateFPCast(XRead, VType);
  return XRead;
}
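
// For example, an atomic read of a float into an i32 variable takes the
// integer-VType/floating-XReadType branch above and emits a single fptosi;
// integer-to-integer width mismatches go through CreateIntCast with sign
// extension.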

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}
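
// For example, a module carrying `#pragma omp requires unified_shared_memory`
// constructs this config with HasRequiresUnifiedSharedMemory = true, leaving
// RequiresFlags == OMP_REQ_UNIFIED_SHARED_MEMORY (0x008); the getters below
// simply test these bits.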

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
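
// The vector built above is stored field-by-field into the kernel_args
// structure that emitTargetKernel (below) passes to __tgt_target_kernel:
// version, pointer count, the base-pointer/pointer/size/map-type/map-name/
// mapper arrays, the iteration count, flags, 3D teams and threads, and the
// dynamic group-memory size.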

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }
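
    // The resulting annotation typically reads:
    //   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
    //   !0 = !{!1}
    //   !1 = !{i64 2, i64 -1, i64 -1, i1 true}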

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target, or we risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime-defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
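
// A typical emitted location identifier looks like:
//   @1 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 23, ptr @.str }
// where `i32 2` is OMP_IDENT_FLAG_KMPC and 23 is the source-location string
// size; the exact values depend on the flags and location passed in.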

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
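
// For example, a directive at line 4, column 1 of test.c inside main() is
// encoded as ";test.c;main;4;1;;", matching the ";unknown;unknown;0;0;;"
// fallback used below.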

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
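
// In a cancellable parallel region this emits, e.g.:
//   %r = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %tid)
// followed by the cancellation check; otherwise it is a plain
//   call void @__kmpc_barrier(ptr @loc, i32 %tid)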

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
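
// The emitted check is a simple flag test, e.g.:
//   %c = icmp eq i32 %cancel.flag, 0
//   br i1 %c, label %<bb>.cont, label %<bb>.cncl
// with the finalization code placed in the .cncl block.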

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
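
// The device runtime entry point called above has (roughly) the signature
//   void __kmpc_parallel_51(ptr ident, i32 gtid, i32 if_expr, i32 num_threads,
//                           i32 proc_bind, ptr fn, ptr wrapper_fn, ptr args,
//                           i64 nargs)
// matching the nine arguments assembled in Parallel51CallArgs.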

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
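
// For two captured pointer arguments the emitted call typically looks like:
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @ident, i32 2, ptr @foo..omp_par, ptr %a, ptr %b)
// where the ".omp_par" suffix comes from the CodeExtractor configuration used
// during outlining.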

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions are
  // passed as zero address space pointers. This flag ensures that extracted
  // function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
    return Err;

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;
1595 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1597 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1598
1599 // Ensure a single exit node for the outlined region by creating one.
1600 // We might have multiple incoming edges to the exit now due to finalizations,
1601 // e.g., cancel calls that cause the control flow to leave the region.
1602 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1603 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1604 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1605 Blocks.push_back(PRegOutlinedExitBB);
1606
1607 CodeExtractorAnalysisCache CEAC(*OuterFn);
1608 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1609 /* AggregateArgs */ false,
1610 /* BlockFrequencyInfo */ nullptr,
1611 /* BranchProbabilityInfo */ nullptr,
1612 /* AssumptionCache */ nullptr,
1613 /* AllowVarArgs */ true,
1614 /* AllowAlloca */ true,
1615 /* AllocationBlock */ OuterAllocaBlock,
1616 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1617
1618 // Find inputs to, outputs from the code region.
1619 BasicBlock *CommonExit = nullptr;
1620 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1621 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1622
1623 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1624 /*CollectGlobalInputs=*/true);
1625
1626 Inputs.remove_if([&](Value *I) {
1627 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1628 return GV->getValueType() == OpenMPIRBuilder::Ident;
1629
1630 return false;
1631 });
1632
1633 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1634
1635 FunctionCallee TIDRTLFn =
1636 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1637
1638 auto PrivHelper = [&](Value &V) -> Error {
1639 if (&V == TIDAddr || &V == ZeroAddr) {
1640 OI.ExcludeArgsFromAggregate.push_back(&V);
1641 return Error::success();
1642 }
1643
1644 SetVector<Use *> Uses;
1645 for (Use &U : V.uses())
1646 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1647 if (ParallelRegionBlockSet.count(UserI->getParent()))
1648 Uses.insert(&U);
1649
1650 // __kmpc_fork_call expects extra arguments as pointers. If the input
1651 // already has a pointer type, everything is fine. Otherwise, store the
1652 // value onto stack and load it back inside the to-be-outlined region. This
1653 // will ensure only the pointer will be passed to the function.
1654 // FIXME: if there are more than 15 trailing arguments, they must be
1655 // additionally packed in a struct.
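//
// For illustration only (value names are assumptions): a non-pointer
// input %x is forwarded as
//
// \code{c}
// %x.reloaded = alloca i32 ; at OuterAllocaIP
// store i32 %x, ptr %x.reloaded ; at the end of InsertBB
// %inner = load i32, ptr %x.reloaded ; at InnerAllocaIP, inside the region
// \endcode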
1656 Value *Inner = &V;
1657 if (!V.getType()->isPointerTy()) {
1658 IRBuilder<>::InsertPointGuard Guard(Builder);
1659 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1660
1661 Builder.restoreIP(OuterAllocaIP);
1662 Value *Ptr =
1663 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1664
1665 // Store to stack at end of the block that currently branches to the entry
1666 // block of the to-be-outlined region.
1667 Builder.SetInsertPoint(InsertBB,
1668 InsertBB->getTerminator()->getIterator());
1669 Builder.CreateStore(&V, Ptr);
1670
1671 // Load back next to allocations in the to-be-outlined region.
1672 Builder.restoreIP(InnerAllocaIP);
1673 Inner = Builder.CreateLoad(V.getType(), Ptr);
1674 }
1675
1676 Value *ReplacementValue = nullptr;
1677 CallInst *CI = dyn_cast<CallInst>(&V);
1678 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1679 ReplacementValue = PrivTID;
1680 } else {
1681 InsertPointOrErrorTy AfterIP =
1682 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1683 if (!AfterIP)
1684 return AfterIP.takeError();
1685 Builder.restoreIP(*AfterIP);
1686 InnerAllocaIP = {
1687 InnerAllocaIP.getBlock(),
1688 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1689
1690 assert(ReplacementValue &&
1691 "Expected copy/create callback to set replacement value!");
1692 if (ReplacementValue == &V)
1693 return Error::success();
1694 }
1695
1696 for (Use *UPtr : Uses)
1697 UPtr->set(ReplacementValue);
1698
1699 return Error::success();
1700 };
1701
1702 // Reset the inner alloca insertion as it will be used for loading the values
1703 // wrapped into pointers before passing them into the to-be-outlined region.
1704 // Configure it to insert immediately after the fake use of the zero address
1705 // so that the reloaded values are available in the generated body and the
1706 // OpenMP-related values (thread ID and zero address pointers) remain leading
1707 // in the argument list.
1708 InnerAllocaIP = IRBuilder<>::InsertPoint(
1709 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1710
1711 // Reset the outer alloca insertion point to the entry of the relevant block
1712 // in case it was invalidated.
1713 OuterAllocaIP = IRBuilder<>::InsertPoint(
1714 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1715
1716 for (Value *Input : Inputs) {
1717 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1718 if (Error Err = PrivHelper(*Input))
1719 return Err;
1720 }
1721 LLVM_DEBUG({
1722 for (Value *Output : Outputs)
1723 dbgs() << "Captured output: " << *Output << "\n";
1724 });
1725 assert(Outputs.empty() &&
1726 "OpenMP outlining should not produce live-out values!");
1727
1728 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1729 LLVM_DEBUG({
1730 for (auto *BB : Blocks)
1731 dbgs() << " PBR: " << BB->getName() << "\n";
1732 });
1733
1734 // Adjust the finalization stack, verify the adjustment, and call the
1735 // finalize function a last time to finalize values between the pre-fini
1736 // block and the exit block if we left the parallel region "the normal way".
1737 auto FiniInfo = FinalizationStack.pop_back_val();
1738 (void)FiniInfo;
1739 assert(FiniInfo.DK == OMPD_parallel &&
1740 "Unexpected finalization stack state!");
1741
1742 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1743
1744 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1745 if (Error Err = FiniCB(PreFiniIP))
1746 return Err;
1747
1748 // Register the outlined info.
1749 addOutlineInfo(std::move(OI));
1750
1751 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1752 UI->eraseFromParent();
1753
1754 return AfterIP;
1755}
1756
1757 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1758 // Build call void __kmpc_flush(ident_t *loc)
1759 uint32_t SrcLocStrSize;
1760 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1761 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1762
1763 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1764}
1765
1766 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1767 if (!updateToLocation(Loc))
1768 return;
1769 emitFlush(Loc);
1770}
1771
1772 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1773 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1774 // global_tid);
1775 uint32_t SrcLocStrSize;
1776 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1777 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1778 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1779
1780 // Ignore return result until untied tasks are supported.
1781 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1782 Args);
1783}
1784
1785 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1786 if (!updateToLocation(Loc))
1787 return;
1788 emitTaskwaitImpl(Loc);
1789}
1790
1791 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1792 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1793 uint32_t SrcLocStrSize;
1794 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1795 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1796 Constant *I32Null = ConstantInt::getNullValue(Int32);
1797 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1798
1799 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1800 Args);
1801}
1802
1803 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1804 if (!updateToLocation(Loc))
1805 return;
1806 emitTaskyieldImpl(Loc);
1807}
1808
1809 // Processes the dependencies in Dependencies and does the following
1810 // - Allocates space on the stack for an array of DependInfo objects
1811 // - Populates each DependInfo object with the relevant information about
1812 // the corresponding dependence.
1813// - All code is inserted in the entry block of the current function.
1814 static Value *emitTaskDependencies(
1815 OpenMPIRBuilder &OMPBuilder,
1816 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1817 // Early return if we have no dependencies to process
1818 if (Dependencies.empty())
1819 return nullptr;
1820
1821 // Given a vector of DependData objects, in this function we create an
1822 // array on the stack that holds kmp_dep_info objects corresponding
1823 // to each dependency. This is then passed to the OpenMP runtime.
1824 // For example, if there are 'n' dependencies then the following pseudo
1825 // code is generated. Assume the first dependence is on a variable 'a'.
1826 //
1827 // \code{c}
1828 // DepArray = alloc(n x sizeof(kmp_depend_info));
1829 // idx = 0;
1830 // DepArray[idx].base_addr = ptrtoint(&a);
1831 // DepArray[idx].len = 8;
1832 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1833 // ++idx;
1834 // DepArray[idx].base_addr = ...;
1835 // \endcode
1836
1837 IRBuilderBase &Builder = OMPBuilder.Builder;
1838 Type *DependInfo = OMPBuilder.DependInfo;
1839 Module &M = OMPBuilder.M;
1840
1841 Value *DepArray = nullptr;
1842 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1843 Builder.SetInsertPoint(
1844 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1845
1846 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1847 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1848
1849 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1850 Value *Base =
1851 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1852 // Store the pointer to the variable
1853 Value *Addr = Builder.CreateStructGEP(
1854 DependInfo, Base,
1855 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1856 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1857 Builder.CreateStore(DepValPtr, Addr);
1858 // Store the size of the variable
1859 Value *Size = Builder.CreateStructGEP(
1860 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1861 Builder.CreateStore(
1862 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1863 Size);
1864 // Store the dependency kind
1865 Value *Flags = Builder.CreateStructGEP(
1866 DependInfo, Base,
1867 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1868 Builder.CreateStore(
1869 ConstantInt::get(Builder.getInt8Ty(),
1870 static_cast<unsigned int>(Dep.DepKind)),
1871 Flags);
1872 }
1873 Builder.restoreIP(OldIP);
1874 return DepArray;
1875}
1876
1877 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1878 const LocationDescription &Loc, InsertPointTy AllocaIP,
1879 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1880 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1881 Value *Priority) {
1882
1883 if (!updateToLocation(Loc))
1884 return InsertPointTy();
1885
1886 uint32_t SrcLocStrSize;
1887 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1888 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1889 // The current basic block is split into four basic blocks. After outlining,
1890 // they will be mapped as follows:
1891 // ```
1892 // def current_fn() {
1893 // current_basic_block:
1894 // br label %task.exit
1895 // task.exit:
1896 // ; instructions after task
1897 // }
1898 // def outlined_fn() {
1899 // task.alloca:
1900 // br label %task.body
1901 // task.body:
1902 // ret void
1903 // }
1904 // ```
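// For illustration only, a condensed sketch of the runtime interaction the
// PostOutlineCB below emits (names are assumptions):
//
// \code{c}
// task = __kmpc_omp_task_alloc(loc, gtid, flags, sizeof_task,
// sizeof_shareds, &outlined_fn);
// memcpy(task->shareds, captured_args, sizeof_shareds);
// __kmpc_omp_task(loc, gtid, task); // or the if0/with_deps variants below
// \endcode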
1905 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1906 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1907 BasicBlock *TaskAllocaBB =
1908 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1909
1910 InsertPointTy TaskAllocaIP =
1911 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1912 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1913 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1914 return Err;
1915
1916 OutlineInfo OI;
1917 OI.EntryBB = TaskAllocaBB;
1918 OI.OuterAllocaBB = AllocaIP.getBlock();
1919 OI.ExitBB = TaskExitBB;
1920
1921 // Add the thread ID argument.
1922 SmallVector<Instruction *, 4> ToBeDeleted;
1923 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1924 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1925
1926 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1927 Mergeable, Priority, EventHandle, TaskAllocaBB,
1928 ToBeDeleted](Function &OutlinedFn) mutable {
1929 // Replace the stale call instruction with the appropriate RTL function call.
1930 assert(OutlinedFn.getNumUses() == 1 &&
1931 "there must be a single user for the outlined function");
1932 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1933
1934 // HasShareds is true if any variables are captured in the outlined region,
1935 // false otherwise.
1936 bool HasShareds = StaleCI->arg_size() > 1;
1937 Builder.SetInsertPoint(StaleCI);
1938
1939 // Gather the arguments for emitting the runtime call for
1940 // @__kmpc_omp_task_alloc
1941 Function *TaskAllocFn =
1942 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1943
1944 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1945 // call.
1946 Value *ThreadID = getOrCreateThreadID(Ident);
1947
1948 // Argument - `flags`
1949 // Task is tied iff (Flags & 1) == 1.
1950 // Task is untied iff (Flags & 1) == 0.
1951 // Task is final iff (Flags & 2) == 2.
1952 // Task is not final iff (Flags & 2) == 0.
1953 // Task is mergeable iff (Flags & 4) == 4.
1954 // Task is not mergeable iff (Flags & 4) == 0.
1955 // Task has priority iff (Flags & 32) == 32.
1956 // Task has no priority iff (Flags & 32) == 0.
1957 // TODO: Handle the other flags.
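//
// For illustration only: a tied, final task with a priority clause would
// carry
//
// \code{c}
// flags = 1 /*tied*/ | 2 /*final*/ | 32 /*priority*/; // == 35
// \endcode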
1958 Value *Flags = Builder.getInt32(Tied);
1959 if (Final) {
1960 Value *FinalFlag =
1961 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1962 Flags = Builder.CreateOr(FinalFlag, Flags);
1963 }
1964
1965 if (Mergeable)
1966 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
1967 if (Priority)
1968 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
1969
1970 // Argument - `sizeof_kmp_task_t` (TaskSize)
1971 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1972 // including private vars accessed in task.
1973 // TODO: add kmp_task_t_with_privates (privates)
1974 Value *TaskSize = Builder.getInt64(
1975 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1976
1977 // Argument - `sizeof_shareds` (SharedsSize)
1978 // SharedsSize refers to the shareds array size in the kmp_task_t data
1979 // structure.
1980 Value *SharedsSize = Builder.getInt64(0);
1981 if (HasShareds) {
1982 AllocaInst *ArgStructAlloca =
1983 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1984 assert(ArgStructAlloca &&
1985 "Unable to find the alloca instruction corresponding to arguments "
1986 "for extracted function");
1987 StructType *ArgStructType =
1988 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1989 assert(ArgStructType && "Unable to find struct type corresponding to "
1990 "arguments for extracted function");
1991 SharedsSize =
1992 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
1993 }
1994 // Emit the @__kmpc_omp_task_alloc runtime call
1995 // The runtime call returns a pointer to an area where the task captured
1996 // variables must be copied before the task is run (TaskData)
1997 CallInst *TaskData = Builder.CreateCall(
1998 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1999 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2000 /*task_func=*/&OutlinedFn});
2001
2002 // Emit detach clause initialization.
2003 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2004 // task_descriptor);
2005 if (EventHandle) {
2006 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2007 OMPRTL___kmpc_task_allow_completion_event);
2008 llvm::Value *EventVal =
2009 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2010 llvm::Value *EventHandleAddr =
2011 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2012 Builder.getPtrTy(0));
2013 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2014 Builder.CreateStore(EventVal, EventHandleAddr);
2015 }
2016 // Copy the arguments for outlined function
2017 if (HasShareds) {
2018 Value *Shareds = StaleCI->getArgOperand(1);
2019 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2020 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2021 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2022 SharedsSize);
2023 }
2024
2025 if (Priority) {
2026 //
2027 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2028 // we populate the priority information into the "kmp_task_t" here
2029 //
2030 // The struct "kmp_task_t" definition is available in kmp.h
2031 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2032 // data2 is used for priority
2033 //
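// For illustration only, a sketch of the layout assumed here (field names
// follow kmp.h and are not defined in this file):
//
// \code{c}
// typedef struct kmp_task {
// void *shareds; // block of captured/shared variables
// kmp_routine_entry_t routine; // outlined task entry point
// kmp_int32 part_id;
// kmp_cmplrdata_t data1, data2; // data2 carries the priority below
// } kmp_task_t;
// \endcode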
2034 Type *Int32Ty = Builder.getInt32Ty();
2035 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2036 // kmp_task_t* => { ptr }
2037 Type *TaskPtr = StructType::get(VoidPtr);
2038 Value *TaskGEP =
2039 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2040 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2041 Type *TaskStructType = StructType::get(
2042 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2043 Value *PriorityData = Builder.CreateInBoundsGEP(
2044 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2045 // kmp_cmplrdata_t => { ptr, ptr }
2046 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2047 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2048 PriorityData, {Zero, Zero});
2049 Builder.CreateStore(Priority, CmplrData);
2050 }
2051
2052 Value *DepArray = nullptr;
2053 if (Dependencies.size()) {
2054 InsertPointTy OldIP = Builder.saveIP();
2055 Builder.SetInsertPoint(
2056 &OldIP.getBlock()->getParent()->getEntryBlock().back());
2057
2058 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2059 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2060
2061 unsigned P = 0;
2062 for (const DependData &Dep : Dependencies) {
2063 Value *Base =
2064 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
2065 // Store the pointer to the variable
2066 Value *Addr = Builder.CreateStructGEP(
2067 DependInfo, Base,
2068 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2069 Value *DepValPtr =
2070 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2071 Builder.CreateStore(DepValPtr, Addr);
2072 // Store the size of the variable
2073 Value *Size = Builder.CreateStructGEP(
2074 DependInfo, Base,
2075 static_cast<unsigned int>(RTLDependInfoFields::Len));
2076 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
2077 Dep.DepValueType)),
2078 Size);
2079 // Store the dependency kind
2080 Value *Flags = Builder.CreateStructGEP(
2081 DependInfo, Base,
2082 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2083 Builder.CreateStore(
2084 ConstantInt::get(Builder.getInt8Ty(),
2085 static_cast<unsigned int>(Dep.DepKind)),
2086 Flags);
2087 ++P;
2088 }
2089
2090 Builder.restoreIP(OldIP);
2091 }
2092
2093 // In the presence of the `if` clause, the following IR is generated:
2094 // ...
2095 // %data = call @__kmpc_omp_task_alloc(...)
2096 // br i1 %if_condition, label %then, label %else
2097 // then:
2098 // call @__kmpc_omp_task(...)
2099 // br label %exit
2100 // else:
2101 // ;; Wait for resolution of dependencies, if any, before
2102 // ;; beginning the task
2103 // call @__kmpc_omp_wait_deps(...)
2104 // call @__kmpc_omp_task_begin_if0(...)
2105 // call @outlined_fn(...)
2106 // call @__kmpc_omp_task_complete_if0(...)
2107 // br label %exit
2108 // exit:
2109 // ...
2110 if (IfCondition) {
2111 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2112 // terminator.
2113 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2114 Instruction *IfTerminator =
2115 Builder.GetInsertPoint()->getParent()->getTerminator();
2116 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2117 Builder.SetInsertPoint(IfTerminator);
2118 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2119 &ElseTI);
2120 Builder.SetInsertPoint(ElseTI);
2121
2122 if (Dependencies.size()) {
2123 Function *TaskWaitFn =
2124 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2125 Builder.CreateCall(
2126 TaskWaitFn,
2127 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2128 ConstantInt::get(Builder.getInt32Ty(), 0),
2129 ConstantPointerNull::get(Builder.getPtrTy())});
2130 }
2131 Function *TaskBeginFn =
2132 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2133 Function *TaskCompleteFn =
2134 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2135 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2136 CallInst *CI = nullptr;
2137 if (HasShareds)
2138 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2139 else
2140 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2141 CI->setDebugLoc(StaleCI->getDebugLoc());
2142 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2143 Builder.SetInsertPoint(ThenTI);
2144 }
2145
2146 if (Dependencies.size()) {
2147 Function *TaskFn =
2148 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2149 Builder.CreateCall(
2150 TaskFn,
2151 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2152 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2153 ConstantPointerNull::get(Builder.getPtrTy())});
2154
2155 } else {
2156 // Emit the @__kmpc_omp_task runtime call to spawn the task
2157 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2158 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2159 }
2160
2161 StaleCI->eraseFromParent();
2162
2163 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2164 if (HasShareds) {
2165 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2166 OutlinedFn.getArg(1)->replaceUsesWithIf(
2167 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2168 }
2169
2170 for (Instruction *I : llvm::reverse(ToBeDeleted))
2171 I->eraseFromParent();
2172 };
2173
2174 addOutlineInfo(std::move(OI));
2175 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2176
2177 return Builder.saveIP();
2178}
2179
2180 OpenMPIRBuilder::InsertPointOrErrorTy
2181 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2182 InsertPointTy AllocaIP,
2183 BodyGenCallbackTy BodyGenCB) {
2184 if (!updateToLocation(Loc))
2185 return InsertPointTy();
2186
2187 uint32_t SrcLocStrSize;
2188 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2189 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2190 Value *ThreadID = getOrCreateThreadID(Ident);
2191
2192 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2193 Function *TaskgroupFn =
2194 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2195 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2196
2197 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2198 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2199 return Err;
2200
2201 Builder.SetInsertPoint(TaskgroupExitBB);
2202 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2203 Function *EndTaskgroupFn =
2204 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2205 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2206
2207 return Builder.saveIP();
2208}
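
// For illustration only, the shape of the IR produced above (a sketch, not
// the verbatim output):
//
// \code{c}
// __kmpc_taskgroup(loc, gtid);
// ... // body from BodyGenCB; child tasks are registered here
// __kmpc_end_taskgroup(loc, gtid); // waits for all descendant tasks
// \endcode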
2209
2210 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2211 const LocationDescription &Loc, InsertPointTy AllocaIP,
2212 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2213 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2214 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2215
2216 if (!updateToLocation(Loc))
2217 return Loc.IP;
2218
2219 auto FiniCBWrapper = [&](InsertPointTy IP) {
2220 if (IP.getBlock()->end() != IP.getPoint())
2221 return FiniCB(IP);
2222 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2223 // will fail because that function requires the Finalization Basic Block to
2224 // have a terminator, which EmitOMPRegionBody has already removed.
2225 // IP is currently at the cancellation block.
2226 // We need to backtrack to the condition block to fetch
2227 // the exit block and create a branch from the cancellation
2228 // block to the exit block.
2229 IRBuilder<>::InsertPointGuard IPG(Builder);
2230 Builder.restoreIP(IP);
2231 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2232 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2233 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2234 Instruction *I = Builder.CreateBr(ExitBB);
2235 IP = InsertPointTy(I->getParent(), I->getIterator());
2236 return FiniCB(IP);
2237 };
2238
2239 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2240
2241 // Each section is emitted as a switch case
2242 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2243 // -> OMP.createSection() which generates the IR for each section
2244 // Iterate through all sections and emit a switch construct:
2245 // switch (IV) {
2246 // case 0:
2247 // <SectionStmt[0]>;
2248 // break;
2249 // ...
2250 // case <NumSection> - 1:
2251 // <SectionStmt[<NumSection> - 1]>;
2252 // break;
2253 // }
2254 // ...
2255 // section_loop.after:
2256 // <FiniCB>;
2257 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2258 Builder.restoreIP(CodeGenIP);
2259 BasicBlock *Continue =
2260 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2261 Function *CurFn = Continue->getParent();
2262 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2263
2264 unsigned CaseNumber = 0;
2265 for (auto SectionCB : SectionCBs) {
2266 BasicBlock *CaseBB = BasicBlock::Create(
2267 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2268 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2269 Builder.SetInsertPoint(CaseBB);
2270 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2271 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2272 CaseEndBr->getIterator()}))
2273 return Err;
2274 CaseNumber++;
2275 }
2276 // Remove the existing terminator from the body BB since there can be no
2277 // terminators after the switch/case.
2278 return Error::success();
2279 };
2280 // Loop body ends here
2281 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2282 Type *I32Ty = Type::getInt32Ty(M.getContext());
2283 Value *LB = ConstantInt::get(I32Ty, 0);
2284 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2285 Value *ST = ConstantInt::get(I32Ty, 1);
2286 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2287 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2288 if (!LoopInfo)
2289 return LoopInfo.takeError();
2290
2291 InsertPointOrErrorTy WsloopIP =
2292 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2293 if (!WsloopIP)
2294 return WsloopIP.takeError();
2295 InsertPointTy AfterIP = *WsloopIP;
2296
2297 // Apply the finalization callback in LoopAfterBB
2298 auto FiniInfo = FinalizationStack.pop_back_val();
2299 assert(FiniInfo.DK == OMPD_sections &&
2300 "Unexpected finalization stack state!");
2301 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2302 Builder.restoreIP(AfterIP);
2303 BasicBlock *FiniBB =
2304 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2305 if (Error Err = CB(Builder.saveIP()))
2306 return Err;
2307 AfterIP = {FiniBB, FiniBB->begin()};
2308 }
2309
2310 return AfterIP;
2311}
2312
2313 OpenMPIRBuilder::InsertPointOrErrorTy
2314 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2315 BodyGenCallbackTy BodyGenCB,
2316 FinalizeCallbackTy FiniCB) {
2317 if (!updateToLocation(Loc))
2318 return Loc.IP;
2319
2320 auto FiniCBWrapper = [&](InsertPointTy IP) {
2321 if (IP.getBlock()->end() != IP.getPoint())
2322 return FiniCB(IP);
2323 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2324 // will fail because that function requires the Finalization Basic Block to
2325 // have a terminator, which EmitOMPRegionBody has already removed.
2326 // IP is currently at the cancellation block.
2327 // We need to backtrack to the condition block to fetch
2328 // the exit block and create a branch from the cancellation
2329 // block to the exit block.
2330 IRBuilder<>::InsertPointGuard IPG(Builder);
2331 Builder.restoreIP(IP);
2332 auto *CaseBB = Loc.IP.getBlock();
2333 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2334 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2335 Instruction *I = Builder.CreateBr(ExitBB);
2336 IP = InsertPointTy(I->getParent(), I->getIterator());
2337 return FiniCB(IP);
2338 };
2339
2340 Directive OMPD = Directive::OMPD_sections;
2341 // Since we are using Finalization Callback here, HasFinalize
2342 // and IsCancellable have to be true
2343 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2344 /*Conditional*/ false, /*hasFinalize*/ true,
2345 /*IsCancellable*/ true);
2346}
2347
2348 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2349 BasicBlock::iterator IT(I);
2350 IT++;
2351 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2352}
2353
2354Value *OpenMPIRBuilder::getGPUThreadID() {
2355 return Builder.CreateCall(
2356 getOrCreateRuntimeFunction(M,
2357 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2358 {});
2359}
2360
2361Value *OpenMPIRBuilder::getGPUWarpSize() {
2362 return Builder.CreateCall(
2363 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2364}
2365
2366Value *OpenMPIRBuilder::getNVPTXWarpID() {
2367 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2368 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2369}
2370
2371Value *OpenMPIRBuilder::getNVPTXLaneID() {
2372 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2373 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2374 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2375 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2376 "nvptx_lane_id");
2377}
2378
2379Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2380 Type *ToType) {
2381 Type *FromType = From->getType();
2382 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2383 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2384 assert(FromSize > 0 && "From size must be greater than zero");
2385 assert(ToSize > 0 && "To size must be greater than zero");
2386 if (FromType == ToType)
2387 return From;
2388 if (FromSize == ToSize)
2389 return Builder.CreateBitCast(From, ToType);
2390 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2391 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2392 InsertPointTy SaveIP = Builder.saveIP();
2393 Builder.restoreIP(AllocaIP);
2394 Value *CastItem = Builder.CreateAlloca(ToType);
2395 Builder.restoreIP(SaveIP);
2396
2397 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2398 CastItem, Builder.getPtrTy(0));
2399 Builder.CreateStore(From, ValCastItem);
2400 return Builder.CreateLoad(ToType, CastItem);
2401}
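
// For illustration only, a sketch of the cases handled above (the concrete
// types are assumptions):
//
// \code{c}
// float -> i32 : bitcast (equal store sizes)
// i16 -> i32 : signed integer cast (sext)
// {i32, i16} -> i64 : store through an i64 stack slot, then reload
// \endcode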
2402
2403Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2404 Value *Element,
2405 Type *ElementType,
2406 Value *Offset) {
2407 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2408 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2409
2410 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2411 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2412 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2413 Value *WarpSize =
2414 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2415 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2416 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2417 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2418 Value *WarpSizeCast =
2419 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2420 Value *ShuffleCall =
2421 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2422 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2423}
2424
2425void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2426 Value *DstAddr, Type *ElemType,
2427 Value *Offset, Type *ReductionArrayTy) {
2428 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2429 // Create the loop over the big sized data.
2430 // ptr = (void*)Elem;
2431 // ptrEnd = (void*) Elem + 1;
2432 // Step = 8;
2433 // while (ptr + Step < ptrEnd)
2434 // shuffle((int64_t)*ptr);
2435 // Step = 4;
2436 // while (ptr + Step < ptrEnd)
2437 // shuffle((int32_t)*ptr);
2438 // ...
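//
// For illustration only: a 6-byte element is moved as one 4-byte shuffle
// followed by one 2-byte shuffle, since IntSize walks 8, 4, 2, 1 and
// Size = Size % IntSize after each level.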
2439 Type *IndexTy = Builder.getIndexTy(
2440 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2441 Value *ElemPtr = DstAddr;
2442 Value *Ptr = SrcAddr;
2443 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2444 if (Size < IntSize)
2445 continue;
2446 Type *IntType = Builder.getIntNTy(IntSize * 8);
2447 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2448 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2449 Value *SrcAddrGEP =
2450 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2451 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2452 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2453
2454 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2455 if ((Size / IntSize) > 1) {
2456 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2457 SrcAddrGEP, Builder.getPtrTy());
2458 BasicBlock *PreCondBB =
2459 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2460 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2461 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2462 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2463 emitBlock(PreCondBB, CurFunc);
2464 PHINode *PhiSrc =
2465 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2466 PhiSrc->addIncoming(Ptr, CurrentBB);
2467 PHINode *PhiDest =
2468 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2469 PhiDest->addIncoming(ElemPtr, CurrentBB);
2470 Ptr = PhiSrc;
2471 ElemPtr = PhiDest;
2472 Value *PtrDiff = Builder.CreatePtrDiff(
2473 Builder.getInt8Ty(), PtrEnd,
2474 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2475 Builder.CreateCondBr(
2476 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2477 ExitBB);
2478 emitBlock(ThenBB, CurFunc);
2479 Value *Res = createRuntimeShuffleFunction(
2480 AllocaIP,
2481 Builder.CreateAlignedLoad(
2482 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2483 IntType, Offset);
2484 Builder.CreateAlignedStore(Res, ElemPtr,
2485 M.getDataLayout().getPrefTypeAlign(ElemType));
2486 Value *LocalPtr =
2487 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2488 Value *LocalElemPtr =
2489 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2490 PhiSrc->addIncoming(LocalPtr, ThenBB);
2491 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2492 emitBranch(PreCondBB);
2493 emitBlock(ExitBB, CurFunc);
2494 } else {
2495 Value *Res = createRuntimeShuffleFunction(
2496 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2497 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2498 Res->getType()->getScalarSizeInBits())
2499 Res = Builder.CreateTrunc(Res, ElemType);
2500 Builder.CreateStore(Res, ElemPtr);
2501 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2502 ElemPtr =
2503 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2504 }
2505 Size = Size % IntSize;
2506 }
2507}
2508
2509void OpenMPIRBuilder::emitReductionListCopy(
2510 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2511 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2512 CopyOptionsTy CopyOptions) {
2513 Type *IndexTy = Builder.getIndexTy(
2515 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2516
2517 // Iterate, element-by-element, through the source Reduce list and
2518 // make a copy.
2519 for (auto En : enumerate(ReductionInfos)) {
2520 const ReductionInfo &RI = En.value();
2521 Value *SrcElementAddr = nullptr;
2522 Value *DestElementAddr = nullptr;
2523 Value *DestElementPtrAddr = nullptr;
2524 // Should we shuffle in an element from a remote lane?
2525 bool ShuffleInElement = false;
2526 // Set to true to update the pointer in the dest Reduce list to a
2527 // newly created element.
2528 bool UpdateDestListPtr = false;
2529
2530 // Step 1.1: Get the address for the src element in the Reduce list.
2531 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2532 ReductionArrayTy, SrcBase,
2533 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2534 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2535
2536 // Step 1.2: Create a temporary to store the element in the destination
2537 // Reduce list.
2538 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2539 ReductionArrayTy, DestBase,
2540 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2541 switch (Action) {
2542 case CopyAction::RemoteLaneToThread: {
2543 InsertPointTy CurIP = Builder.saveIP();
2544 Builder.restoreIP(AllocaIP);
2545 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2546 ".omp.reduction.element");
2547 DestAlloca->setAlignment(
2548 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2549 DestElementAddr = DestAlloca;
2550 DestElementAddr =
2551 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2552 DestElementAddr->getName() + ".ascast");
2553 Builder.restoreIP(CurIP);
2554 ShuffleInElement = true;
2555 UpdateDestListPtr = true;
2556 break;
2557 }
2558 case CopyAction::ThreadCopy: {
2559 DestElementAddr =
2560 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2561 break;
2562 }
2563 }
2564
2565 // Now that all active lanes have read the element in the
2566 // Reduce list, shuffle over the value from the remote lane.
2567 if (ShuffleInElement) {
2568 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2569 RemoteLaneOffset, ReductionArrayTy);
2570 } else {
2571 switch (RI.EvaluationKind) {
2572 case EvalKind::Scalar: {
2573 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2574 // Store the source element value to the dest element address.
2575 Builder.CreateStore(Elem, DestElementAddr);
2576 break;
2577 }
2578 case EvalKind::Complex: {
2579 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2580 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2581 Value *SrcReal = Builder.CreateLoad(
2582 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2583 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2584 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2585 Value *SrcImg = Builder.CreateLoad(
2586 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2587
2588 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2589 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2590 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2591 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2592 Builder.CreateStore(SrcReal, DestRealPtr);
2593 Builder.CreateStore(SrcImg, DestImgPtr);
2594 break;
2595 }
2596 case EvalKind::Aggregate: {
2597 Value *SizeVal = Builder.getInt64(
2598 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2599 Builder.CreateMemCpy(
2600 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2601 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2602 SizeVal, false);
2603 break;
2604 }
2605 };
2606 }
2607
2608 // Step 3.1: Modify reference in dest Reduce list as needed.
2609 // Modifying the reference in Reduce list to point to the newly
2610 // created element. The element is live in the current function
2611 // scope and that of functions it invokes (i.e., reduce_function).
2612 // RemoteReduceData[i] = (void*)&RemoteElem
2613 if (UpdateDestListPtr) {
2614 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2615 DestElementAddr, Builder.getPtrTy(),
2616 DestElementAddr->getName() + ".ascast");
2617 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2618 }
2619 }
2620}
2621
2622Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2623 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2624 AttributeList FuncAttrs) {
2625 InsertPointTy SavedIP = Builder.saveIP();
2626 LLVMContext &Ctx = M.getContext();
2627 FunctionType *FuncTy = FunctionType::get(
2628 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2629 /* IsVarArg */ false);
2630 Function *WcFunc =
2631 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2632 "_omp_reduction_inter_warp_copy_func", &M);
2633 WcFunc->setAttributes(FuncAttrs);
2634 WcFunc->addParamAttr(0, Attribute::NoUndef);
2635 WcFunc->addParamAttr(1, Attribute::NoUndef);
2636 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2637 Builder.SetInsertPoint(EntryBB);
2638
2639 // ReduceList: thread local Reduce list.
2640 // At the stage of the computation when this function is called, partially
2641 // aggregated values reside in the first lane of every active warp.
2642 Argument *ReduceListArg = WcFunc->getArg(0);
2643 // NumWarps: number of warps active in the parallel region. This could
2644 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2645 Argument *NumWarpsArg = WcFunc->getArg(1);
2646
2647 // This array is used as a medium to transfer, one reduce element at a time,
2648 // the data from the first lane of every warp to lanes in the first warp
2649 // in order to perform the final step of a reduction in a parallel region
2650 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2651 // for reduced latency, as well as to have a distinct copy for concurrently
2652 // executing target regions. The array is declared with common linkage so
2653 // as to be shared across compilation units.
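//
// For illustration only, a condensed sketch of the copy protocol
// implemented below (assuming a warp size of 32):
//
// \code{c}
// __kmpc_barrier(...);
// if (lane_id == 0) medium[warp_id] = elem; // warp masters publish
// __kmpc_barrier(...);
// if (tid < num_warps) *target = medium[tid]; // warp 0 collects
// \endcode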
2654 StringRef TransferMediumName =
2655 "__openmp_nvptx_data_transfer_temporary_storage";
2656 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2657 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2658 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2659 if (!TransferMedium) {
2660 TransferMedium = new GlobalVariable(
2661 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2662 UndefValue::get(ArrayTy), TransferMediumName,
2663 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2664 /*AddressSpace=*/3);
2665 }
2666
2667 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2668 Value *GPUThreadID = getGPUThreadID();
2669 // nvptx_lane_id = nvptx_id % warpsize
2670 Value *LaneID = getNVPTXLaneID();
2671 // nvptx_warp_id = nvptx_id / warpsize
2672 Value *WarpID = getNVPTXWarpID();
2673
2674 InsertPointTy AllocaIP =
2675 InsertPointTy(Builder.GetInsertBlock(),
2676 Builder.GetInsertBlock()->getFirstInsertionPt());
2677 Type *Arg0Type = ReduceListArg->getType();
2678 Type *Arg1Type = NumWarpsArg->getType();
2679 Builder.restoreIP(AllocaIP);
2680 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2681 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2682 AllocaInst *NumWarpsAlloca =
2683 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2684 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2685 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2686 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2687 NumWarpsAlloca, Builder.getPtrTy(0),
2688 NumWarpsAlloca->getName() + ".ascast");
2689 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2690 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2691 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2692 InsertPointTy CodeGenIP =
2693 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2694 Builder.restoreIP(CodeGenIP);
2695
2696 Value *ReduceList =
2697 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2698
2699 for (auto En : enumerate(ReductionInfos)) {
2700 //
2701 // Warp master copies reduce element to transfer medium in __shared__
2702 // memory.
2703 //
2704 const ReductionInfo &RI = En.value();
2705 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2706 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2707 Type *CType = Builder.getIntNTy(TySize * 8);
2708
2709 unsigned NumIters = RealTySize / TySize;
2710 if (NumIters == 0)
2711 continue;
2712 Value *Cnt = nullptr;
2713 Value *CntAddr = nullptr;
2714 BasicBlock *PrecondBB = nullptr;
2715 BasicBlock *ExitBB = nullptr;
2716 if (NumIters > 1) {
2717 CodeGenIP = Builder.saveIP();
2718 Builder.restoreIP(AllocaIP);
2719 CntAddr =
2720 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2721
2722 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2723 CntAddr->getName() + ".ascast");
2724 Builder.restoreIP(CodeGenIP);
2725 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2726 CntAddr,
2727 /*Volatile=*/false);
2728 PrecondBB = BasicBlock::Create(Ctx, "precond");
2729 ExitBB = BasicBlock::Create(Ctx, "exit");
2730 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2731 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2732 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2733 /*Volatile=*/false);
2734 Value *Cmp = Builder.CreateICmpULT(
2735 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2736 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2737 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2738 }
2739
2740 // kmpc_barrier.
2741 InsertPointOrErrorTy BarrierIP1 =
2742 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2743 omp::Directive::OMPD_unknown,
2744 /* ForceSimpleCall */ false,
2745 /* CheckCancelFlag */ true);
2746 if (!BarrierIP1)
2747 return BarrierIP1.takeError();
2748 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2749 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2750 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2751
2752 // if (lane_id == 0)
2753 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2754 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2755 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2756
2757 // Reduce element = LocalReduceList[i]
2758 auto *RedListArrayTy =
2759 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2760 Type *IndexTy = Builder.getIndexTy(
2761 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2762 Value *ElemPtrPtr =
2763 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2764 {ConstantInt::get(IndexTy, 0),
2765 ConstantInt::get(IndexTy, En.index())});
2766 // elemptr = ((CopyType*)(elemptrptr)) + I
2767 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2768 if (NumIters > 1)
2769 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2770
2771 // Get pointer to location in transfer medium.
2772 // MediumPtr = &medium[warp_id]
2773 Value *MediumPtr = Builder.CreateInBoundsGEP(
2774 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2775 // elem = *elemptr
2776 //*MediumPtr = elem
2777 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2778 // Store the source element value to the dest element address.
2779 Builder.CreateStore(Elem, MediumPtr,
2780 /*IsVolatile*/ true);
2781 Builder.CreateBr(MergeBB);
2782
2783 // else
2784 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2785 Builder.CreateBr(MergeBB);
2786
2787 // endif
2788 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2789 InsertPointOrErrorTy BarrierIP2 =
2790 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2791 omp::Directive::OMPD_unknown,
2792 /* ForceSimpleCall */ false,
2793 /* CheckCancelFlag */ true);
2794 if (!BarrierIP2)
2795 return BarrierIP2.takeError();
2796
2797 // Warp 0 copies reduce element from transfer medium
2798 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2799 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2800 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2801
2802 Value *NumWarpsVal =
2803 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2804 // Up to 32 threads in warp 0 are active.
2805 Value *IsActiveThread =
2806 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2807 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2808
2809 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2810
2811 // SrcMediumPtr = &medium[tid]
2812 // SrcMediumVal = *SrcMediumPtr
2813 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2814 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2815 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2816 Value *TargetElemPtrPtr =
2817 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2818 {ConstantInt::get(IndexTy, 0),
2819 ConstantInt::get(IndexTy, En.index())});
2820 Value *TargetElemPtrVal =
2821 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2822 Value *TargetElemPtr = TargetElemPtrVal;
2823 if (NumIters > 1)
2824 TargetElemPtr =
2825 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2826
2827 // *TargetElemPtr = SrcMediumVal;
2828 Value *SrcMediumValue =
2829 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2830 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2831 Builder.CreateBr(W0MergeBB);
2832
2833 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2834 Builder.CreateBr(W0MergeBB);
2835
2836 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2837
2838 if (NumIters > 1) {
2839 Cnt = Builder.CreateNSWAdd(
2840 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2841 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2842
2843 auto *CurFn = Builder.GetInsertBlock()->getParent();
2844 emitBranch(PrecondBB);
2845 emitBlock(ExitBB, CurFn);
2846 }
2847 RealTySize %= TySize;
2848 }
2849 }
2850
2851 Builder.CreateRetVoid();
2852 Builder.restoreIP(SavedIP);
2853
2854 return WcFunc;
2855}
2856
2857Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2858 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2859 AttributeList FuncAttrs) {
2860 LLVMContext &Ctx = M.getContext();
2861 FunctionType *FuncTy =
2862 FunctionType::get(Builder.getVoidTy(),
2863 {Builder.getPtrTy(), Builder.getInt16Ty(),
2864 Builder.getInt16Ty(), Builder.getInt16Ty()},
2865 /* IsVarArg */ false);
2866 Function *SarFunc =
2867 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2868 "_omp_reduction_shuffle_and_reduce_func", &M);
2869 SarFunc->setAttributes(FuncAttrs);
2870 SarFunc->addParamAttr(0, Attribute::NoUndef);
2871 SarFunc->addParamAttr(1, Attribute::NoUndef);
2872 SarFunc->addParamAttr(2, Attribute::NoUndef);
2873 SarFunc->addParamAttr(3, Attribute::NoUndef);
2874 SarFunc->addParamAttr(1, Attribute::SExt);
2875 SarFunc->addParamAttr(2, Attribute::SExt);
2876 SarFunc->addParamAttr(3, Attribute::SExt);
2877 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2878 Builder.SetInsertPoint(EntryBB);
2879
2880 // Thread local Reduce list used to host the values of data to be reduced.
2881 Argument *ReduceListArg = SarFunc->getArg(0);
2882 // Current lane id; could be logical.
2883 Argument *LaneIDArg = SarFunc->getArg(1);
2884 // Offset of the remote source lane relative to the current lane.
2885 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2886 // Algorithm version. This is expected to be known at compile time.
2887 Argument *AlgoVerArg = SarFunc->getArg(3);
2888
2889 Type *ReduceListArgType = ReduceListArg->getType();
2890 Type *LaneIDArgType = LaneIDArg->getType();
2891 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2892 Value *ReduceListAlloca = Builder.CreateAlloca(
2893 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2894 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2895 LaneIDArg->getName() + ".addr");
2896 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2897 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2898 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2899 AlgoVerArg->getName() + ".addr");
2900 ArrayType *RedListArrayTy =
2901 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2902
2903 // Create a local thread-private variable to host the Reduce list
2904 // from a remote lane.
2905 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2906 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2907
2908 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2909 ReduceListAlloca, ReduceListArgType,
2910 ReduceListAlloca->getName() + ".ascast");
2911 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2912 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2913 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2914 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2915 RemoteLaneOffsetAlloca->getName() + ".ascast");
2916 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2918 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2919 RemoteReductionListAlloca, Builder.getPtrTy(),
2920 RemoteReductionListAlloca->getName() + ".ascast");
2921
2922 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2923 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2924 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2925 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2926
2927 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2928 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2929 Value *RemoteLaneOffset =
2930 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2931 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2932
2933 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2934
2935 // This loop iterates through the list of reduce elements and copies,
2936 // element by element, from a remote lane in the warp to RemoteReduceList,
2937 // hosted on the thread's stack.
2938 emitReductionListCopy(
2939 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2940 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2941
2942 // The actions to be performed on the Remote Reduce list are dependent
2943 // on the algorithm version.
2944 //
2945 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2946 // LaneId % 2 == 0 && Offset > 0):
2947 // do the reduction value aggregation
2948 //
2949 // The thread local variable Reduce list is mutated in place to host the
2950 // reduced data, which is the aggregated value produced from local and
2951 // remote lanes.
2952 //
2953 // Note that AlgoVer is expected to be a constant integer known at compile
2954 // time.
2955 // When AlgoVer==0, the first conjunction evaluates to true, making
2956 // the entire predicate true during compile time.
2957 // When AlgoVer==1, the second conjunction has only the second part to be
2958 // evaluated during runtime. The other conjunctions evaluate to false
2959 // during compile time.
2960 // When AlgoVer==2, the third conjunction has only the second part to be
2961 // evaluated during runtime. The other conjunctions evaluate to false
2962 // during compile time.
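//
// For illustration only: with AlgoVer==1 and Offset==4 among 8 active
// lanes, lanes 0-3 (LaneId < Offset) reduce the shuffled-in value, while
// lanes 4-7 instead copy the remote list over their own (the CondCopy
// branch further below).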
2963 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2964 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2965 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2966 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2967 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2968 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2969 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2970 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2971 Value *RemoteOffsetComp =
2972 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2973 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2974 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2975 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2976
2977 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2978 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2979 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2980
2981 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2982 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2983 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2984 ReduceList, Builder.getPtrTy());
2985 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2986 RemoteListAddrCast, Builder.getPtrTy());
2987 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2988 ->addFnAttr(Attribute::NoUnwind);
2989 Builder.CreateBr(MergeBB);
2990
2991 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2992 Builder.CreateBr(MergeBB);
2993
2994 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2995
2996 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2997 // Reduce list.
2998 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2999 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3000 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3001
3002 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3003 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3004 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3005 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3006
3007 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3008 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3009 ReductionInfos, RemoteListAddrCast, ReduceList);
3010 Builder.CreateBr(CpyMergeBB);
3011
3012 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3013 Builder.CreateBr(CpyMergeBB);
3014
3015 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3016
3017 Builder.CreateRetVoid();
3018
3019 return SarFunc;
3020}
3021
3022Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3023 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3024 AttributeList FuncAttrs) {
3025 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3026 LLVMContext &Ctx = M.getContext();
3027 FunctionType *FuncTy = FunctionType::get(
3028 Builder.getVoidTy(),
3029 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3030 /* IsVarArg */ false);
3031 Function *LtGCFunc =
3032 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3033 "_omp_reduction_list_to_global_copy_func", &M);
3034 LtGCFunc->setAttributes(FuncAttrs);
3035 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3036 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3037 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3038
3039 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3040 Builder.SetInsertPoint(EntryBlock);
3041
3042 // Buffer: global reduction buffer.
3043 Argument *BufferArg = LtGCFunc->getArg(0);
3044 // Idx: index of the buffer.
3045 Argument *IdxArg = LtGCFunc->getArg(1);
3046 // ReduceList: thread local Reduce list.
3047 Argument *ReduceListArg = LtGCFunc->getArg(2);
3048
3049 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3050 BufferArg->getName() + ".addr");
3051 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3052 IdxArg->getName() + ".addr");
3053 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3054 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3055 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3056 BufferArgAlloca, Builder.getPtrTy(),
3057 BufferArgAlloca->getName() + ".ascast");
3058 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3059 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3060 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3061 ReduceListArgAlloca, Builder.getPtrTy(),
3062 ReduceListArgAlloca->getName() + ".ascast");
3063
3064 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3065 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3066 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3067
3068 Value *LocalReduceList =
3069 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3070 Value *BufferArgVal =
3071 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3072 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3073 Type *IndexTy = Builder.getIndexTy(
3074 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3075 for (auto En : enumerate(ReductionInfos)) {
3076 const ReductionInfo &RI = En.value();
3077 auto *RedListArrayTy =
3078 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3079 // Reduce element = LocalReduceList[i]
3080 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3081 RedListArrayTy, LocalReduceList,
3082 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3083 // elemptr = ((CopyType*)(elemptrptr)) + I
3084 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3085
3086 // Global = Buffer.VD[Idx];
3087 Value *BufferVD =
3088 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3089 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3090 ReductionsBufferTy, BufferVD, 0, En.index());
3091
3092 switch (RI.EvaluationKind) {
3093 case EvalKind::Scalar: {
3094 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3095 Builder.CreateStore(TargetElement, GlobVal);
3096 break;
3097 }
3098 case EvalKind::Complex: {
3099 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3100 RI.ElementType, ElemPtr, 0, 0, ".realp");
3101 Value *SrcReal = Builder.CreateLoad(
3102 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3103 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3104 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3105 Value *SrcImg = Builder.CreateLoad(
3106 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3107
3108 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3109 RI.ElementType, GlobVal, 0, 0, ".realp");
3110 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3111 RI.ElementType, GlobVal, 0, 1, ".imagp");
3112 Builder.CreateStore(SrcReal, DestRealPtr);
3113 Builder.CreateStore(SrcImg, DestImgPtr);
3114 break;
3115 }
3116 case EvalKind::Aggregate: {
3117 Value *SizeVal =
3118 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3119 Builder.CreateMemCpy(
3120 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3121 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3122 break;
3123 }
3124 }
3125 }
3126
3127 Builder.CreateRetVoid();
3128 Builder.restoreIP(OldIP);
3129 return LtGCFunc;
3130}
3131
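// The next helper feeds slot `idx` of the global buffer and the thread-local
// list to the previously created reduction function. Conceptually
// (illustrative, not the literal IR):
//
//   void _omp_reduction_list_to_global_reduce_func(void *buffer, int idx,
//                                                  void *reduce_list) {
//     void *global_list[n];  // element i points at buffer[idx].field_i
//     reduce_function(global_list, reduce_list);  // global = global op local
//   }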
3132Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3133 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3134 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3135 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3136 LLVMContext &Ctx = M.getContext();
3137 auto *FuncTy = FunctionType::get(
3138 Builder.getVoidTy(),
3139 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3140 /* IsVarArg */ false);
3141 Function *LtGRFunc =
3142 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3143 "_omp_reduction_list_to_global_reduce_func", &M);
3144 LtGRFunc->setAttributes(FuncAttrs);
3145 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3146 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3147 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3148
3149 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3150 Builder.SetInsertPoint(EntryBlock);
3151
3152 // Buffer: global reduction buffer.
3153 Argument *BufferArg = LtGRFunc->getArg(0);
3154 // Idx: index of the buffer.
3155 Argument *IdxArg = LtGRFunc->getArg(1);
3156 // ReduceList: thread local Reduce list.
3157 Argument *ReduceListArg = LtGRFunc->getArg(2);
3158
3159 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3160 BufferArg->getName() + ".addr");
3161 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3162 IdxArg->getName() + ".addr");
3163 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3164 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3165 auto *RedListArrayTy =
3166 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3167
3168 // 1. Build a list of reduction variables.
3169 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3170 Value *LocalReduceList =
3171 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3172
3173 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3174 BufferArgAlloca, Builder.getPtrTy(),
3175 BufferArgAlloca->getName() + ".ascast");
3176 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3177 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3178 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3179 ReduceListArgAlloca, Builder.getPtrTy(),
3180 ReduceListArgAlloca->getName() + ".ascast");
3181 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3182 LocalReduceList, Builder.getPtrTy(),
3183 LocalReduceList->getName() + ".ascast");
3184
3185 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3186 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3187 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3188
3189 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3190 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3191 Type *IndexTy = Builder.getIndexTy(
3192 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3193 for (auto En : enumerate(ReductionInfos)) {
3194 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3195 RedListArrayTy, LocalReduceListAddrCast,
3196 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3197 Value *BufferVD =
3198 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3199 // Global = Buffer.VD[Idx];
3200 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3201 ReductionsBufferTy, BufferVD, 0, En.index());
3202 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3203 }
3204
3205 // Call reduce_function(GlobalReduceList, ReduceList)
3206 Value *ReduceList =
3207 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3208 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3209 ->addFnAttr(Attribute::NoUnwind);
3210 Builder.CreateRetVoid();
3211 Builder.restoreIP(OldIP);
3212 return LtGRFunc;
3213}
3214
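// The helper below is the inverse of _omp_reduction_list_to_global_copy_func:
// it copies slot `idx` of the global buffer back into the thread-local reduce
// list, i.e. (sketch) *(ElemTy_i *)reduce_list[i] = buffer[idx].field_i for
// each reduction i, with the same scalar/complex/aggregate handling.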
3215Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3216 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3217 AttributeList FuncAttrs) {
3218 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3219 LLVMContext &Ctx = M.getContext();
3220 auto *FuncTy = FunctionType::get(
3221 Builder.getVoidTy(),
3222 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3223 /* IsVarArg */ false);
3224 Function *LtGCFunc =
3225 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3226 "_omp_reduction_global_to_list_copy_func", &M);
3227 LtGCFunc->setAttributes(FuncAttrs);
3228 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3229 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3230 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3231
3232 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3233 Builder.SetInsertPoint(EntryBlock);
3234
3235 // Buffer: global reduction buffer.
3236 Argument *BufferArg = LtGCFunc->getArg(0);
3237 // Idx: index of the buffer.
3238 Argument *IdxArg = LtGCFunc->getArg(1);
3239 // ReduceList: thread local Reduce list.
3240 Argument *ReduceListArg = LtGCFunc->getArg(2);
3241
3242 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3243 BufferArg->getName() + ".addr");
3244 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3245 IdxArg->getName() + ".addr");
3246 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3247 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3248 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3249 BufferArgAlloca, Builder.getPtrTy(),
3250 BufferArgAlloca->getName() + ".ascast");
3251 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3252 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3253 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3254 ReduceListArgAlloca, Builder.getPtrTy(),
3255 ReduceListArgAlloca->getName() + ".ascast");
3256 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3257 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3258 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3259
3260 Value *LocalReduceList =
3261 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3262 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3263 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3264 Type *IndexTy = Builder.getIndexTy(
3265 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3266 for (auto En : enumerate(ReductionInfos)) {
3267 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3268 auto *RedListArrayTy =
3269 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3270 // Reduce element = LocalReduceList[i]
3271 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3272 RedListArrayTy, LocalReduceList,
3273 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3274 // elemptr = ((CopyType*)(elemptrptr)) + I
3275 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3276 // Global = Buffer.VD[Idx];
3277 Value *BufferVD =
3278 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3279 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3280 ReductionsBufferTy, BufferVD, 0, En.index());
3281
3282 switch (RI.EvaluationKind) {
3283 case EvalKind::Scalar: {
3284 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3285 Builder.CreateStore(TargetElement, ElemPtr);
3286 break;
3287 }
3288 case EvalKind::Complex: {
3289 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3290 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3291 Value *SrcReal = Builder.CreateLoad(
3292 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3293 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3294 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3295 Value *SrcImg = Builder.CreateLoad(
3296 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3297
3298 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3299 RI.ElementType, ElemPtr, 0, 0, ".realp");
3300 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3301 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3302 Builder.CreateStore(SrcReal, DestRealPtr);
3303 Builder.CreateStore(SrcImg, DestImgPtr);
3304 break;
3305 }
3306 case EvalKind::Aggregate: {
3307 Value *SizeVal =
3308 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3309 Builder.CreateMemCpy(
3310 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3311 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3312 SizeVal, false);
3313 break;
3314 }
3315 }
3316 }
3317
3318 Builder.CreateRetVoid();
3319 Builder.restoreIP(OldIP);
3320 return LtGCFunc;
3321}
3322
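// Mirror image of _omp_reduction_list_to_global_reduce_func: the pointer list
// into buffer[idx] is built the same way, but the callback operands are
// swapped, i.e. (sketch) reduce_function(reduce_list, global_list), so the
// thread-local values are updated from the global buffer element.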
3323Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3324 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3325 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3326 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3327 LLVMContext &Ctx = M.getContext();
3328 auto *FuncTy = FunctionType::get(
3329 Builder.getVoidTy(),
3330 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3331 /* IsVarArg */ false);
3332 Function *LtGRFunc =
3333 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3334 "_omp_reduction_global_to_list_reduce_func", &M);
3335 LtGRFunc->setAttributes(FuncAttrs);
3336 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3337 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3338 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3339
3340 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3341 Builder.SetInsertPoint(EntryBlock);
3342
3343 // Buffer: global reduction buffer.
3344 Argument *BufferArg = LtGRFunc->getArg(0);
3345 // Idx: index of the buffer.
3346 Argument *IdxArg = LtGRFunc->getArg(1);
3347 // ReduceList: thread local Reduce list.
3348 Argument *ReduceListArg = LtGRFunc->getArg(2);
3349
3350 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3351 BufferArg->getName() + ".addr");
3352 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3353 IdxArg->getName() + ".addr");
3354 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3355 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3356 ArrayType *RedListArrayTy =
3357 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3358
3359 // 1. Build a list of reduction variables.
3360 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3361 Value *LocalReduceList =
3362 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3363
3364 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3365 BufferArgAlloca, Builder.getPtrTy(),
3366 BufferArgAlloca->getName() + ".ascast");
3367 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3368 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3369 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3370 ReduceListArgAlloca, Builder.getPtrTy(),
3371 ReduceListArgAlloca->getName() + ".ascast");
3372 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3373 LocalReduceList, Builder.getPtrTy(),
3374 LocalReduceList->getName() + ".ascast");
3375
3376 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3377 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3378 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3379
3380 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3381 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3382 Type *IndexTy = Builder.getIndexTy(
3383 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3384 for (auto En : enumerate(ReductionInfos)) {
3385 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3386 RedListArrayTy, ReductionList,
3387 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3388 // Global = Buffer.VD[Idx];
3389 Value *BufferVD =
3390 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3391 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3392 ReductionsBufferTy, BufferVD, 0, En.index());
3393 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3394 }
3395
3396 // Call reduce_function(ReduceList, GlobalReduceList)
3397 Value *ReduceList =
3398 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3399 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3400 ->addFnAttr(Attribute::NoUnwind);
3401 Builder.CreateRetVoid();
3402 Builder.restoreIP(OldIP);
3403 return LtGRFunc;
3404}
3405
3406std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3407 std::string Suffix =
3408 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3409 return (Name + Suffix).str();
3410}
3411
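// The function created below has the C-level shape (sketch; the suffix comes
// from createPlatformSpecificName and varies by target):
//
//   void <parent>_omp_reduction_reduction_func(void **lhs, void **rhs);
//
// For ReductionGenCBKind values other than Clang it performs, elementwise,
// *lhs[i] = reduce(*lhs[i], *rhs[i]); for the Clang kind the callback emits
// the combining code itself and the pointers are patched in afterwards.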
3412Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3413 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3414 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3415 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3416 {Builder.getPtrTy(), Builder.getPtrTy()},
3417 /* IsVarArg */ false);
3418 std::string Name = getReductionFuncName(ReducerName);
3419 Function *ReductionFunc =
3420 Function::Create(FuncTy, GlobalValue::InternalLinkage, Name, &M);
3421 ReductionFunc->setAttributes(FuncAttrs);
3422 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3423 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3424 BasicBlock *EntryBB =
3425 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3426 Builder.SetInsertPoint(EntryBB);
3427
3428 // Need to alloca memory here and deal with the pointers before getting
3429 // LHS/RHS pointers out
3430 Value *LHSArrayPtr = nullptr;
3431 Value *RHSArrayPtr = nullptr;
3432 Argument *Arg0 = ReductionFunc->getArg(0);
3433 Argument *Arg1 = ReductionFunc->getArg(1);
3434 Type *Arg0Type = Arg0->getType();
3435 Type *Arg1Type = Arg1->getType();
3436
3437 Value *LHSAlloca =
3438 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3439 Value *RHSAlloca =
3440 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3441 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3442 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3443 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3444 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3445 Builder.CreateStore(Arg0, LHSAddrCast);
3446 Builder.CreateStore(Arg1, RHSAddrCast);
3447 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3448 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3449
3450 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3451 Type *IndexTy = Builder.getIndexTy(
3452 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3453 SmallVector<Value *> LHSPtrs, RHSPtrs;
3454 for (auto En : enumerate(ReductionInfos)) {
3455 const ReductionInfo &RI = En.value();
3456 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3457 RedArrayTy, RHSArrayPtr,
3458 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3459 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3460 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3461 RHSI8Ptr, RI.PrivateVariable->getType(),
3462 RHSI8Ptr->getName() + ".ascast");
3463
3464 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3465 RedArrayTy, LHSArrayPtr,
3466 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3467 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3468 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3469 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3470
3471 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3472 LHSPtrs.emplace_back(LHSPtr);
3473 RHSPtrs.emplace_back(RHSPtr);
3474 } else {
3475 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3476 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3477 Value *Reduced;
3478 InsertPointOrErrorTy AfterIP =
3479 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3480 if (!AfterIP)
3481 return AfterIP.takeError();
3482 if (!Builder.GetInsertBlock())
3483 return ReductionFunc;
3484 Builder.CreateStore(Reduced, LHSPtr);
3485 }
3486 }
3487
3488 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3489 for (auto En : enumerate(ReductionInfos)) {
3490 unsigned Index = En.index();
3491 const ReductionInfo &RI = En.value();
3492 Value *LHSFixupPtr, *RHSFixupPtr;
3493 Builder.restoreIP(RI.ReductionGenClang(
3494 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3495
3496 // Fix the callback code generated to use the correct Values for the LHS
3497 // and RHS
3498 LHSFixupPtr->replaceUsesWithIf(
3499 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3500 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3501 ReductionFunc;
3502 });
3503 RHSFixupPtr->replaceUsesWithIf(
3504 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3505 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3506 ReductionFunc;
3507 });
3508 }
3509
3510 Builder.CreateRetVoid();
3511 return ReductionFunc;
3512}
3513
3514static void
3515checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3516 bool IsGPU) {
3517 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3518 (void)RI;
3519 assert(RI.Variable && "expected non-null variable");
3520 assert(RI.PrivateVariable && "expected non-null private variable");
3521 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3522 "expected non-null reduction generator callback");
3523 if (!IsGPU) {
3524 assert(
3525 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3526 "expected variables and their private equivalents to have the same "
3527 "type");
3528 }
3529 assert(RI.Variable->getType()->isPointerTy() &&
3530 "expected variables to be pointers");
3531 }
3532}
3533
3534OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3535 const LocationDescription &Loc, InsertPointTy AllocaIP,
3536 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3537 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3538 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3539 unsigned ReductionBufNum, Value *SrcLocInfo) {
3540 if (!updateToLocation(Loc))
3541 return InsertPointTy();
3542 Builder.restoreIP(CodeGenIP);
3543 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3544 LLVMContext &Ctx = M.getContext();
3545
3546 // Source location for the ident struct
3547 if (!SrcLocInfo) {
3548 uint32_t SrcLocStrSize;
3549 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3550 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3551 }
3552
3553 if (ReductionInfos.size() == 0)
3554 return Builder.saveIP();
3555
3556 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3557 AttributeList FuncAttrs;
3558 AttrBuilder AttrBldr(Ctx);
3559 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3560 AttrBldr.addAttribute(Attr);
3561 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3562 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3563
3564 CodeGenIP = Builder.saveIP();
3565 Expected<Function *> ReductionResult =
3566 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3567 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3568 if (!ReductionResult)
3569 return ReductionResult.takeError();
3570 Function *ReductionFunc = *ReductionResult;
3571 Builder.restoreIP(CodeGenIP);
3572
3573 // Set the grid value in the config needed for lowering later on
3574 if (GridValue.has_value())
3575 Config.setGridValue(GridValue.value());
3576 else
3577 Config.setGridValue(getGridValue(T, ReductionFunc));
3578
3579 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3580 // RedList, shuffle_reduce_func, interwarp_copy_func);
3581 // or
3582 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3583 Value *Res;
3584
3585 // 1. Build a list of reduction variables.
3586 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3587 auto Size = ReductionInfos.size();
3588 Type *PtrTy = PointerType::getUnqual(Ctx);
3589 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3590 CodeGenIP = Builder.saveIP();
3591 Builder.restoreIP(AllocaIP);
3592 Value *ReductionListAlloca =
3593 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3594 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3595 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3596 Builder.restoreIP(CodeGenIP);
3597 Type *IndexTy = Builder.getIndexTy(
3598 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3599 for (auto En : enumerate(ReductionInfos)) {
3600 const ReductionInfo &RI = En.value();
3601 Value *ElemPtr = Builder.CreateInBoundsGEP(
3602 RedArrayTy, ReductionList,
3603 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3604 Value *CastElem =
3605 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3606 Builder.CreateStore(CastElem, ElemPtr);
3607 }
3608 CodeGenIP = Builder.saveIP();
3609 Function *SarFunc =
3610 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3611 Expected<Function *> CopyResult =
3612 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3613 if (!CopyResult)
3614 return CopyResult.takeError();
3615 Function *WcFunc = *CopyResult;
3616 Builder.restoreIP(CodeGenIP);
3617
3618 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3619
3620 unsigned MaxDataSize = 0;
3621 SmallVector<Type *> ReductionTypeArgs;
3622 for (auto En : enumerate(ReductionInfos)) {
3623 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3624 if (Size > MaxDataSize)
3625 MaxDataSize = Size;
3626 ReductionTypeArgs.emplace_back(En.value().ElementType);
3627 }
3628 Value *ReductionDataSize =
3629 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3630 if (!IsTeamsReduction) {
3631 Value *SarFuncCast =
3632 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3633 Value *WcFuncCast =
3634 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3635 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3636 WcFuncCast};
3637 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3638 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3639 Res = Builder.CreateCall(Pv2Ptr, Args);
3640 } else {
3641 CodeGenIP = Builder.saveIP();
3642 StructType *ReductionsBufferTy = StructType::create(
3643 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3644 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3645 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3646 Function *LtGCFunc = emitListToGlobalCopyFunction(
3647 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3648 Function *LtGRFunc = emitListToGlobalReduceFunction(
3649 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3650 Function *GtLCFunc = emitGlobalToListCopyFunction(
3651 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3652 Function *GtLRFunc = emitGlobalToListReduceFunction(
3653 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3654 Builder.restoreIP(CodeGenIP);
3655
3656 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3657 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3658
3659 Value *Args3[] = {SrcLocInfo,
3660 KernelTeamsReductionPtr,
3661 Builder.getInt32(ReductionBufNum),
3662 ReductionDataSize,
3663 RL,
3664 SarFunc,
3665 WcFunc,
3666 LtGCFunc,
3667 LtGRFunc,
3668 GtLCFunc,
3669 GtLRFunc};
3670
3671 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3672 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3673 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3674 }
3675
3676 // 5. Build if (res == 1)
3677 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3678 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3679 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3680 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3681
3682 // 6. Build then branch: where we have reduced values in the master
3683 // thread in each team.
3684 // __kmpc_end_reduce{_nowait}(<gtid>);
3685 // break;
3686 emitBlock(ThenBB, CurFunc);
3687
3688 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3689 for (auto En : enumerate(ReductionInfos)) {
3690 const ReductionInfo &RI = En.value();
3691 Value *LHS = RI.Variable;
3692 Value *RHS =
3693 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3694
3695 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3696 Value *LHSPtr, *RHSPtr;
3697 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3698 &LHSPtr, &RHSPtr, CurFunc));
3699
3700 // Fix the callback code generated to use the correct Values for the LHS
3701 // and RHS
3702 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3703 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3704 ReductionFunc;
3705 });
3706 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3707 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3708 ReductionFunc;
3709 });
3710 } else {
3711 assert(false && "Unhandled ReductionGenCBKind");
3712 }
3713 }
3714 emitBlock(ExitBB, CurFunc);
3715
3715
3716 Config.setEmitLLVMUsed();
3717
3718 return Builder.saveIP();
3719}
3720
3721static Function *getFreshReductionFunc(Module &M) {
3722 Type *VoidTy = Type::getVoidTy(M.getContext());
3723 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3724 auto *FuncTy =
3725 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3726 return Function::Create(FuncTy, GlobalValue::InternalLinkage,
3727 ".omp.reduction.func", &M);
3728}
3729
3730OpenMPIRBuilder::InsertPointOrErrorTy
3731OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3732 InsertPointTy AllocaIP,
3733 ArrayRef<ReductionInfo> ReductionInfos,
3734 ArrayRef<bool> IsByRef, bool IsNoWait) {
3735 assert(ReductionInfos.size() == IsByRef.size());
3736 for (const ReductionInfo &RI : ReductionInfos) {
3737 (void)RI;
3738 assert(RI.Variable && "expected non-null variable");
3739 assert(RI.PrivateVariable && "expected non-null private variable");
3740 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3741 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3742 "expected variables and their private equivalents to have the same "
3743 "type");
3744 assert(RI.Variable->getType()->isPointerTy() &&
3745 "expected variables to be pointers");
3746 }
3747
3748 if (!updateToLocation(Loc))
3749 return InsertPointTy();
3750
3751 BasicBlock *InsertBlock = Loc.IP.getBlock();
3752 BasicBlock *ContinuationBlock =
3753 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3754 InsertBlock->getTerminator()->eraseFromParent();
3755
3756 // Create and populate array of type-erased pointers to private reduction
3757 // values.
3758 unsigned NumReductions = ReductionInfos.size();
3759 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3760 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3761 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3762
3763 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3764
3765 for (auto En : enumerate(ReductionInfos)) {
3766 unsigned Index = En.index();
3767 const ReductionInfo &RI = En.value();
3768 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3769 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3770 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3771 }
3772
3773 // Emit a call to the runtime function that orchestrates the reduction.
3774 // Declare the reduction function in the process.
3775 Function *Func = Builder.GetInsertBlock()->getParent();
3776 Module *Module = Func->getParent();
3777 uint32_t SrcLocStrSize;
3778 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3779 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3780 return RI.AtomicReductionGen;
3781 });
3782 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3783 CanGenerateAtomic
3784 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3785 : IdentFlag(0));
3786 Value *ThreadId = getOrCreateThreadID(Ident);
3787 Constant *NumVariables = Builder.getInt32(NumReductions);
3788 const DataLayout &DL = Module->getDataLayout();
3789 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3790 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3791 Function *ReductionFunc = getFreshReductionFunc(*Module);
3792 Value *Lock = getOMPCriticalRegionLock(".reduction");
3793 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3794 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3795 : RuntimeFunction::OMPRTL___kmpc_reduce);
3796 CallInst *ReduceCall =
3797 Builder.CreateCall(ReduceFunc,
3798 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3799 ReductionFunc, Lock},
3800 "reduce");
3801
3802 // Create final reduction entry blocks for the atomic and non-atomic case.
3803 // Emit IR that dispatches control flow to one of the blocks based on the
3804 // reduction supporting the atomic mode.
3805 BasicBlock *NonAtomicRedBlock =
3806 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3807 BasicBlock *AtomicRedBlock =
3808 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3809 SwitchInst *Switch =
3810 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3811 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3812 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3813
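// __kmpc_reduce{_nowait} returns 1 if this thread should combine the partial
// values non-atomically (followed by __kmpc_end_reduce{_nowait}), 2 if it
// should use the atomic combiners, and anything else (typically 0) if there
// is nothing left to do; the switch above dispatches accordingly.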
3814 // Populate the non-atomic reduction using the elementwise reduction function.
3815 // This loads the elements from the global and private variables and reduces
3816 // them before storing back the result to the global variable.
3817 Builder.SetInsertPoint(NonAtomicRedBlock);
3818 for (auto En : enumerate(ReductionInfos)) {
3819 const ReductionInfo &RI = En.value();
3820 Type *ValueType = RI.ElementType;
3821 // We have one less load for by-ref case because that load is now inside of
3822 // the reduction region
3823 Value *RedValue = RI.Variable;
3824 if (!IsByRef[En.index()]) {
3825 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3826 "red.value." + Twine(En.index()));
3827 }
3828 Value *PrivateRedValue =
3829 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3830 "red.private.value." + Twine(En.index()));
3831 Value *Reduced;
3832 InsertPointOrErrorTy AfterIP =
3833 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3834 if (!AfterIP)
3835 return AfterIP.takeError();
3836 Builder.restoreIP(*AfterIP);
3837
3838 if (!Builder.GetInsertBlock())
3839 return InsertPointTy();
3840 // for by-ref case, the load is inside of the reduction region
3841 if (!IsByRef[En.index()])
3842 Builder.CreateStore(Reduced, RI.Variable);
3843 }
3844 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3845 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3846 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3847 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3848 Builder.CreateBr(ContinuationBlock);
3849
3850 // Populate the atomic reduction using the atomic elementwise reduction
3851 // function. There are no loads/stores here because they will be happening
3852 // inside the atomic elementwise reduction.
3853 Builder.SetInsertPoint(AtomicRedBlock);
3854 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3855 for (const ReductionInfo &RI : ReductionInfos) {
3856 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3857 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3858 if (!AfterIP)
3859 return AfterIP.takeError();
3860 Builder.restoreIP(*AfterIP);
3861 if (!Builder.GetInsertBlock())
3862 return InsertPointTy();
3863 }
3864 Builder.CreateBr(ContinuationBlock);
3865 } else {
3866 Builder.CreateUnreachable();
3867 }
3868
3869 // Populate the outlined reduction function using the elementwise reduction
3870 // function. Partial values are extracted from the type-erased array of
3871 // pointers to private variables.
3872 BasicBlock *ReductionFuncBlock =
3873 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3874 Builder.SetInsertPoint(ReductionFuncBlock);
3875 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3876 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3877
3878 for (auto En : enumerate(ReductionInfos)) {
3879 const ReductionInfo &RI = En.value();
3880 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3881 RedArrayTy, LHSArrayPtr, 0, En.index());
3882 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3883 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3884 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3885 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3886 RedArrayTy, RHSArrayPtr, 0, En.index());
3887 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3888 Value *RHSPtr =
3889 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3890 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3891 Value *Reduced;
3892 InsertPointOrErrorTy AfterIP =
3893 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3894 if (!AfterIP)
3895 return AfterIP.takeError();
3896 Builder.restoreIP(*AfterIP);
3897 if (!Builder.GetInsertBlock())
3898 return InsertPointTy();
3899 // store is inside of the reduction region when using by-ref
3900 if (!IsByRef[En.index()])
3901 Builder.CreateStore(Reduced, LHSPtr);
3902 }
3903 Builder.CreateRetVoid();
3904
3905 Builder.SetInsertPoint(ContinuationBlock);
3906 return Builder.saveIP();
3907}
3908
3909OpenMPIRBuilder::InsertPointOrErrorTy
3910OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3911 BodyGenCallbackTy BodyGenCB,
3912 FinalizeCallbackTy FiniCB) {
3913 if (!updateToLocation(Loc))
3914 return Loc.IP;
3915
3916 Directive OMPD = Directive::OMPD_master;
3917 uint32_t SrcLocStrSize;
3918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3919 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3920 Value *ThreadId = getOrCreateThreadID(Ident);
3921 Value *Args[] = {Ident, ThreadId};
3922
3923 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3924 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3925
3926 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3927 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3928
3929 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3930 /*Conditional*/ true, /*hasFinalize*/ true);
3931}
3932
3933OpenMPIRBuilder::InsertPointOrErrorTy
3934OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3935 BodyGenCallbackTy BodyGenCB,
3936 FinalizeCallbackTy FiniCB, Value *Filter) {
3937 if (!updateToLocation(Loc))
3938 return Loc.IP;
3939
3940 Directive OMPD = Directive::OMPD_masked;
3941 uint32_t SrcLocStrSize;
3942 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3943 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3944 Value *ThreadId = getOrCreateThreadID(Ident);
3945 Value *Args[] = {Ident, ThreadId, Filter};
3946 Value *ArgsEnd[] = {Ident, ThreadId};
3947
3948 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3949 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3950
3951 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3952 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3953
3954 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3955 /*Conditional*/ true, /*hasFinalize*/ true);
3956}
3957
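// The skeleton created below is a self-contained sub-CFG of the form
//
//   preheader -> header -> cond --> body -> inc -> (back to header)
//                            \--> exit -> after
//
// with the induction variable as a PHI in the header that starts at 0 and is
// incremented by 1 in the inc (latch) block.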
3958CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3959 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3960 BasicBlock *PostInsertBefore, const Twine &Name) {
3961 Module *M = F->getParent();
3962 LLVMContext &Ctx = M->getContext();
3963 Type *IndVarTy = TripCount->getType();
3964
3965 // Create the basic block structure.
3966 BasicBlock *Preheader =
3967 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3968 BasicBlock *Header =
3969 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3970 BasicBlock *Cond =
3971 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3972 BasicBlock *Body =
3973 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3974 BasicBlock *Latch =
3975 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3976 BasicBlock *Exit =
3977 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3978 BasicBlock *After =
3979 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3980
3981 // Use specified DebugLoc for new instructions.
3982 Builder.SetCurrentDebugLocation(DL);
3983
3984 Builder.SetInsertPoint(Preheader);
3985 Builder.CreateBr(Header);
3986
3987 Builder.SetInsertPoint(Header);
3988 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3989 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3990 Builder.CreateBr(Cond);
3991
3992 Builder.SetInsertPoint(Cond);
3993 Value *Cmp =
3994 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3995 Builder.CreateCondBr(Cmp, Body, Exit);
3996
3997 Builder.SetInsertPoint(Body);
3998 Builder.CreateBr(Latch);
3999
4000 Builder.SetInsertPoint(Latch);
4001 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4002 "omp_" + Name + ".next", /*HasNUW=*/true);
4003 Builder.CreateBr(Header);
4004 IndVarPHI->addIncoming(Next, Latch);
4005
4006 Builder.SetInsertPoint(Exit);
4007 Builder.CreateBr(After);
4008
4009 // Remember and return the canonical control flow.
4010 LoopInfos.emplace_front();
4011 CanonicalLoopInfo *CL = &LoopInfos.front();
4012
4013 CL->Header = Header;
4014 CL->Cond = Cond;
4015 CL->Latch = Latch;
4016 CL->Exit = Exit;
4017
4018#ifndef NDEBUG
4019 CL->assertOK();
4020#endif
4021 return CL;
4022}
4023
4024Expected<CanonicalLoopInfo *>
4025OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4026 LoopBodyGenCallbackTy BodyGenCB,
4027 Value *TripCount, const Twine &Name) {
4028 BasicBlock *BB = Loc.IP.getBlock();
4029 BasicBlock *NextBB = BB->getNextNode();
4030
4031 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4032 NextBB, NextBB, Name);
4033 BasicBlock *After = CL->getAfter();
4034
4035 // If location is not set, don't connect the loop.
4036 if (updateToLocation(Loc)) {
4037 // Split the loop at the insertion point: Branch to the preheader and move
4038 // every following instruction to after the loop (the After BB). Also, the
4039 // new successor is the loop's after block.
4040 spliceBB(Builder, After, /*CreateBranch=*/false);
4041 Builder.CreateBr(CL->getPreheader());
4042 }
4043
4044 // Emit the body content. We do it after connecting the loop to the CFG to
4045 // avoid that the callback encounters degenerate BBs.
4046 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4047 return Err;
4048
4049#ifndef NDEBUG
4050 CL->assertOK();
4051#endif
4052 return CL;
4053}
4054
4055Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4056 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4057 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4058 InsertPointTy ComputeIP, const Twine &Name) {
4059
4060 // Consider the following difficulties (assuming 8-bit signed integers):
4061 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4062 // DO I = 1, 100, 50
4063 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4064 // DO I = 100, 0, -128
4065
4066 // Start, Stop and Step must be of the same integer type.
4067 auto *IndVarTy = cast<IntegerType>(Start->getType());
4068 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4069 assert(IndVarTy == Step->getType() && "Step type mismatch");
4070
4071 LocationDescription ComputeLoc =
4072 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4073 updateToLocation(ComputeLoc);
4074
4075 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4076 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4077
4078 // Like Step, but always positive.
4079 Value *Incr = Step;
4080
4081 // Distance between Start and Stop; always positive.
4082 Value *Span;
4083
4084 // Condition checking whether there are no iterations executed at all, e.g.
4085 // because UB < LB.
4086 Value *ZeroCmp;
4087
4088 if (IsSigned) {
4089 // Ensure that increment is positive. If not, negate and invert LB and UB.
4090 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4091 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4092 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4093 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4094 Span = Builder.CreateSub(UB, LB, "", false, true);
4095 ZeroCmp = Builder.CreateICmp(
4096 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4097 } else {
4098 Span = Builder.CreateSub(Stop, Start, "", true);
4099 ZeroCmp = Builder.CreateICmp(
4100 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4101 }
4102
4103 Value *CountIfLooping;
4104 if (InclusiveStop) {
4105 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4106 } else {
4107 // Avoid incrementing past stop since it could overflow.
4108 Value *CountIfTwo = Builder.CreateAdd(
4109 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4110 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4111 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4112 }
4113 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4114 "omp_" + Name + ".tripcount");
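// Worked example (unsigned, InclusiveStop=false): Start=0, Stop=10, Step=3
// yields Span=10 and, since Span > Incr, TripCount=(10-1)/3+1 = 4, matching
// the executed induction values {0, 3, 6, 9}.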
4115
4116 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4117 Builder.restoreIP(CodeGenIP);
4118 Value *Span = Builder.CreateMul(IV, Step);
4119 Value *IndVar = Builder.CreateAdd(Span, Start);
4120 return BodyGenCB(Builder.saveIP(), IndVar);
4121 };
4122 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4123 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4124}
4125
4126// Returns an LLVM function to call for initializing loop bounds using OpenMP
4127// static scheduling depending on `type`. Only i32 and i64 are supported by the
4128// runtime. Always interpret integers as unsigned similarly to
4129// CanonicalLoopInfo.
4130static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4131 OpenMPIRBuilder &OMPBuilder) {
4132 unsigned Bitwidth = Ty->getIntegerBitWidth();
4133 if (Bitwidth == 32)
4134 return OMPBuilder.getOrCreateRuntimeFunction(
4135 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4136 if (Bitwidth == 64)
4137 return OMPBuilder.getOrCreateRuntimeFunction(
4138 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4139 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4140}
4141
4142OpenMPIRBuilder::InsertPointOrErrorTy
4143OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4144 InsertPointTy AllocaIP,
4145 bool NeedsBarrier) {
4146 assert(CLI->isValid() && "Requires a valid canonical loop");
4147 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4148 "Require dedicated allocate IP");
4149
4150 // Set up the source location value for OpenMP runtime.
4151 Builder.restoreIP(CLI->getPreheaderIP());
4152 Builder.SetCurrentDebugLocation(DL);
4153
4154 uint32_t SrcLocStrSize;
4155 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4156 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4157
4158 // Declare useful OpenMP runtime functions.
4159 Value *IV = CLI->getIndVar();
4160 Type *IVTy = IV->getType();
4161 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4162 FunctionCallee StaticFini =
4163 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4164
4165 // Allocate space for computed loop bounds as expected by the "init" function.
4166 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4167
4168 Type *I32Type = Type::getInt32Ty(M.getContext());
4169 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4170 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4171 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4172 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4173
4174 // At the end of the preheader, prepare for calling the "init" function by
4175 // storing the current loop bounds into the allocated space. A canonical loop
4176 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4177 // and produces an inclusive upper bound.
4178 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4179 Constant *Zero = ConstantInt::get(IVTy, 0);
4180 Constant *One = ConstantInt::get(IVTy, 1);
4181 Builder.CreateStore(Zero, PLowerBound);
4182 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4183 Builder.CreateStore(UpperBound, PUpperBound);
4184 Builder.CreateStore(One, PStride);
4185
4186 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4187
4188 Constant *SchedulingType = ConstantInt::get(
4189 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4190
4191 // Call the "init" function and update the trip count of the loop with the
4192 // value it produced.
4193 Builder.CreateCall(StaticInit,
4194 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4195 PUpperBound, PStride, One, Zero});
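// __kmpc_for_static_init_{4u,8u} overwrites *PLowerBound/*PUpperBound with
// this thread's contiguous, inclusive block of iterations. For instance, with
// a trip count of 100 and 4 threads the runtime would typically hand thread t
// the block [25*t, 25*t+24]; the code below re-normalizes that block into a
// fresh 0-based canonical loop.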
4196 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4197 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4198 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4199 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4200 CLI->setTripCount(TripCount);
4201
4202 // Update all uses of the induction variable except the one in the condition
4203 // block that compares it with the actual upper bound, and the increment in
4204 // the latch block.
4205
4206 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4207 Builder.SetInsertPoint(CLI->getBody(),
4208 CLI->getBody()->getFirstInsertionPt());
4209 Builder.SetCurrentDebugLocation(DL);
4210 return Builder.CreateAdd(OldIV, LowerBound);
4211 });
4212
4213 // In the "exit" block, call the "fini" function.
4214 Builder.SetInsertPoint(CLI->getExit(),
4215 CLI->getExit()->getTerminator()->getIterator());
4216 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4217
4218 // Add the barrier if requested.
4219 if (NeedsBarrier) {
4220 InsertPointOrErrorTy BarrierIP =
4221 createBarrier(LocationDescription(Builder.saveIP(), DL),
4222 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4223 /* CheckCancelFlag */ false);
4224 if (!BarrierIP)
4225 return BarrierIP.takeError();
4226 }
4227
4228 InsertPointTy AfterIP = CLI->getAfterIP();
4229 CLI->invalidate();
4230
4231 return AfterIP;
4232}
4233
4234OpenMPIRBuilder::InsertPointOrErrorTy
4235OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4236 CanonicalLoopInfo *CLI,
4237 InsertPointTy AllocaIP,
4238 bool NeedsBarrier,
4239 Value *ChunkSize) {
4240 assert(CLI->isValid() && "Requires a valid canonical loop");
4241 assert(ChunkSize && "Chunk size is required");
4242
4243 LLVMContext &Ctx = CLI->getFunction()->getContext();
4244 Value *IV = CLI->getIndVar();
4245 Value *OrigTripCount = CLI->getTripCount();
4246 Type *IVTy = IV->getType();
4247 assert(IVTy->getIntegerBitWidth() <= 64 &&
4248 "Max supported tripcount bitwidth is 64 bits");
4249 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4250 : Type::getInt64Ty(Ctx);
4251 Type *I32Type = Type::getInt32Ty(M.getContext());
4252 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4253 Constant *One = ConstantInt::get(InternalIVTy, 1);
4254
4255 // Declare useful OpenMP runtime functions.
4256 FunctionCallee StaticInit =
4257 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4258 FunctionCallee StaticFini =
4259 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4260
4261 // Allocate space for computed loop bounds as expected by the "init" function.
4262 Builder.restoreIP(AllocaIP);
4263 Builder.SetCurrentDebugLocation(DL);
4264 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4265 Value *PLowerBound =
4266 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4267 Value *PUpperBound =
4268 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4269 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4270
4271 // Set up the source location value for the OpenMP runtime.
4272 Builder.restoreIP(CLI->getPreheaderIP());
4273 Builder.SetCurrentDebugLocation(DL);
4274
4275 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4276 Value *CastedChunkSize =
4277 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4278 Value *CastedTripCount =
4279 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4280
4281 Constant *SchedulingType = ConstantInt::get(
4282 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4283 Builder.CreateStore(Zero, PLowerBound);
4284 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4285 Builder.CreateStore(OrigUpperBound, PUpperBound);
4286 Builder.CreateStore(One, PStride);
4287
4288 // Call the "init" function and update the trip count of the loop with the
4289 // value it produced.
4290 uint32_t SrcLocStrSize;
4291 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4292 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4293 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4294 Builder.CreateCall(StaticInit,
4295 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4296 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4297 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4298 /*pstride=*/PStride, /*incr=*/One,
4299 /*chunk=*/CastedChunkSize});
4300
4301 // Load values written by the "init" function.
4302 Value *FirstChunkStart =
4303 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4304 Value *FirstChunkStop =
4305 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4306 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4307 Value *ChunkRange =
4308 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4309 Value *NextChunkStride =
4310 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
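// For a static-chunked schedule the "init" call yields this thread's first
// chunk [lb, ub] (inclusive) plus the stride to its next chunk. Illustrative
// example: trip count 10, chunk 4, 2 threads -> thread 0 starts chunks at 0
// and 8 (stride 8), thread 1 at 4; the dispatch loop built below enumerates
// those chunk start points.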
4311
4312 // Create outer "dispatch" loop for enumerating the chunks.
4313 BasicBlock *DispatchEnter = splitBB(Builder, true);
4314 Value *DispatchCounter;
4315
4316 // It is safe to assume this didn't return an error because the callback
4317 // passed into createCanonicalLoop is the only possible error source, and it
4318 // always returns success.
4319 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4320 {Builder.saveIP(), DL},
4321 [&](InsertPointTy BodyIP, Value *Counter) {
4322 DispatchCounter = Counter;
4323 return Error::success();
4324 },
4325 FirstChunkStart, CastedTripCount, NextChunkStride,
4326 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4327 "dispatch"));
4328
4329 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4330 // not have to preserve the canonical invariant.
4331 BasicBlock *DispatchBody = DispatchCLI->getBody();
4332 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4333 BasicBlock *DispatchExit = DispatchCLI->getExit();
4334 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4335 DispatchCLI->invalidate();
4336
4337 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4338 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4339 redirectTo(CLI->getExit(), DispatchLatch, DL);
4340 redirectTo(DispatchBody, DispatchEnter, DL);
4341
4342 // Prepare the prolog of the chunk loop.
4343 Builder.restoreIP(CLI->getPreheaderIP());
4344 Builder.SetCurrentDebugLocation(DL);
4345
4346 // Compute the number of iterations of the chunk loop.
4347 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4348 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4349 Value *IsLastChunk =
4350 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4351 Value *CountUntilOrigTripCount =
4352 Builder.CreateSub(CastedTripCount, DispatchCounter);
4353 Value *ChunkTripCount = Builder.CreateSelect(
4354 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4355 Value *BackcastedChunkTC =
4356 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4357 CLI->setTripCount(BackcastedChunkTC);
4358
4359 // Update all uses of the induction variable except the one in the condition
4360 // block that compares it with the actual upper bound, and the increment in
4361 // the latch block.
4362 Value *BackcastedDispatchCounter =
4363 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4364 CLI->mapIndVar([&](Instruction *) -> Value * {
4365 Builder.restoreIP(CLI->getBodyIP());
4366 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4367 });
4368
4369 // In the "exit" block, call the "fini" function.
4370 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4371 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4372
4373 // Add the barrier if requested.
4374 if (NeedsBarrier) {
4375 InsertPointOrErrorTy AfterIP =
4376 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4377 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4378 if (!AfterIP)
4379 return AfterIP.takeError();
4380 }
4381
4382#ifndef NDEBUG
4383 // Even though we currently do not support applying additional methods to it,
4384 // the chunk loop should remain a canonical loop.
4385 CLI->assertOK();
4386#endif
4387
4388 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4389}
4390
4391// Returns an LLVM function to call for executing an OpenMP static worksharing
4392// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4393// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4394static FunctionCallee
4395getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4396 WorksharingLoopType LoopType) {
4397 unsigned Bitwidth = Ty->getIntegerBitWidth();
4398 Module &M = OMPBuilder->M;
4399 switch (LoopType) {
4400 case WorksharingLoopType::ForStaticLoop:
4401 if (Bitwidth == 32)
4402 return OMPBuilder->getOrCreateRuntimeFunction(
4403 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4404 if (Bitwidth == 64)
4405 return OMPBuilder->getOrCreateRuntimeFunction(
4406 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4407 break;
4408 case WorksharingLoopType::DistributeStaticLoop:
4409 if (Bitwidth == 32)
4410 return OMPBuilder->getOrCreateRuntimeFunction(
4411 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4412 if (Bitwidth == 64)
4413 return OMPBuilder->getOrCreateRuntimeFunction(
4414 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4415 break;
4416 case WorksharingLoopType::DistributeForStaticLoop:
4417 if (Bitwidth == 32)
4418 return OMPBuilder->getOrCreateRuntimeFunction(
4419 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4420 if (Bitwidth == 64)
4421 return OMPBuilder->getOrCreateRuntimeFunction(
4422 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4423 break;
4424 }
4425 if (Bitwidth != 32 && Bitwidth != 64) {
4426 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4427 }
4428 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4429}
4430
4431// Inserts a call to the proper OpenMP Device RTL function which handles
4432// loop worksharing.
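// For example, for the combined distribute-for case the emitted call is
// roughly (illustrative argument names):
//
//   __kmpc_distribute_for_static_loop_4u(ident, body_fn, body_args,
//                                        tripcount, num_threads,
//                                        /*dist_chunk=*/0, /*chunk=*/0);
//
// where the device runtime assigns iterations to teams/threads and invokes
// body_fn for each assigned iteration.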
4433static void createTargetLoopWorkshareCall(
4434 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4435 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4436 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4437 Type *TripCountTy = TripCount->getType();
4438 Module &M = OMPBuilder->M;
4439 IRBuilder<> &Builder = OMPBuilder->Builder;
4440 FunctionCallee RTLFn =
4441 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4442 SmallVector<Value *, 8> RealArgs;
4443 RealArgs.push_back(Ident);
4444 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4445 RealArgs.push_back(LoopBodyArg);
4446 RealArgs.push_back(TripCount);
4447 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4448 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4449 Builder.CreateCall(RTLFn, RealArgs);
4450 return;
4451 }
4452 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4453 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4454 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4455 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4456
4457 RealArgs.push_back(
4458 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4459 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4460 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4461 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4462 }
4463
4464 Builder.CreateCall(RTLFn, RealArgs);
4465}
4466
4467static void
4468workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4469 CanonicalLoopInfo *CLI, Value *Ident,
4470 Function &OutlinedFn, Type *ParallelTaskPtr,
4471 const SmallVector<Instruction *, 4> &ToBeDeleted,
4472 WorksharingLoopType LoopType) {
4473 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4474 BasicBlock *Preheader = CLI->getPreheader();
4475 Value *TripCount = CLI->getTripCount();
4476
4477 // After loop body outlining, the loop body contains only the setup of the
4478 // loop body argument structure and the call to the outlined loop body
4479 // function. First, we need to move the setup of the loop body args into the
4480 // loop preheader.
4481 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4482 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4483
4484 // The next step is to remove the whole loop. We do not need it anymore.
4485 // That's why we make an unconditional branch from the loop preheader to the
4486 // loop exit block.
4487 Builder.restoreIP({Preheader, Preheader->end()});
4488 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4489 Preheader->getTerminator()->eraseFromParent();
4490 Builder.CreateBr(CLI->getExit());
4491
4492 // Delete dead loop blocks
4493 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4494 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4495 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4496 CleanUpInfo.EntryBB = CLI->getHeader();
4497 CleanUpInfo.ExitBB = CLI->getExit();
4498 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4499 DeleteDeadBlocks(BlocksToBeRemoved);
4500
4501 // Find the instruction which corresponds to loop body argument structure
4502 // and remove the call to loop body function instruction.
4503 Value *LoopBodyArg;
4504 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4505 assert(OutlinedFnUser &&
4506 "Expected unique undroppable user of outlined function");
4507 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4508 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4509 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4510 "Expected outlined function call to be located in loop preheader");
4511 // Check in case no argument structure has been passed.
4512 if (OutlinedFnCallInstruction->arg_size() > 1)
4513 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4514 else
4515 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4516 OutlinedFnCallInstruction->eraseFromParent();
4517
4518 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4519 LoopBodyArg, ParallelTaskPtr, TripCount,
4520 OutlinedFn);
4521
4522 for (auto &ToBeDeletedItem : ToBeDeleted)
4523 ToBeDeletedItem->eraseFromParent();
4524 CLI->invalidate();
4525}
4526
4527OpenMPIRBuilder::InsertPointTy
4528OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4529 InsertPointTy AllocaIP,
4530 WorksharingLoopType LoopType) {
4531 uint32_t SrcLocStrSize;
4532 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4533 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4534
4535 OutlineInfo OI;
4536 OI.OuterAllocaBB = CLI->getPreheader();
4537 Function *OuterFn = CLI->getPreheader()->getParent();
4538
4539 // Instructions which need to be deleted at the end of code generation
4540 SmallVector<Instruction *, 4> ToBeDeleted;
4541
4542 OI.OuterAllocaBB = AllocaIP.getBlock();
4543
4544 // Mark the body loop as region which needs to be extracted
4545 OI.EntryBB = CLI->getBody();
4546 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4547 "omp.prelatch", true);
4548
4549 // Prepare loop body for extraction
4550 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4551
4552 // Insert new loop counter variable which will be used only in loop
4553 // body.
4554 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4555 Instruction *NewLoopCntLoad =
4556 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4557 // New loop counter instructions are redundant in the loop preheader when
4558 // code generation for the workshare loop is finished. That's why we mark
4559 // them as ready for deletion.
4560 ToBeDeleted.push_back(NewLoopCntLoad);
4561 ToBeDeleted.push_back(NewLoopCnt);
4562
4563 // Analyse the loop body region. Find all input variables which are used
4564 // inside the loop body region.
4565 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4566 SmallVector<BasicBlock *, 32> Blocks;
4567 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4568 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4569 ParallelRegionBlockSet.end());
4570
4571 CodeExtractorAnalysisCache CEAC(*OuterFn);
4572 CodeExtractor Extractor(Blocks,
4573 /* DominatorTree */ nullptr,
4574 /* AggregateArgs */ true,
4575 /* BlockFrequencyInfo */ nullptr,
4576 /* BranchProbabilityInfo */ nullptr,
4577 /* AssumptionCache */ nullptr,
4578 /* AllowVarArgs */ true,
4579 /* AllowAlloca */ true,
4580 /* AllocationBlock */ CLI->getPreheader(),
4581 /* Suffix */ ".omp_wsloop",
4582 /* AggrArgsIn0AddrSpace */ true);
4583
4584 BasicBlock *CommonExit = nullptr;
4585 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4586
4587 // Find allocas outside the loop body region which are used inside loop
4588 // body
4589 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4590
4591 // We need to model the loop body region as the function f(cnt, loop_arg).
4592 // That's why we replace the loop induction variable with the new counter,
4593 // which will be one of the loop body function's arguments.
4594 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4595 CLI->getIndVar()->user_end());
4596 for (auto Use : Users) {
4597 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4598 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4599 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4600 }
4601 }
4602 }
4603 // Make sure that the loop counter variable is not merged into the loop body
4604 // function's argument structure and is passed as a separate variable.
4605 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4606
4607 // The PostOutline callback is invoked when the loop body function has been
4608 // outlined and the loop body replaced by a call to the outlined function.
4609 // We need to add a call to the OpenMP device RTL inside the loop preheader.
4610 // The OpenMP device RTL function will handle the loop control logic.
4611 //
4612 OI.PostOutlineCB = [=, ToBeDeletedVec =
4613 std::move(ToBeDeleted)](Function &OutlinedFn) {
4614 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4615 ToBeDeletedVec, LoopType);
4616 };
4617 addOutlineInfo(std::move(OI));
4618 return CLI->getAfterIP();
4619}
4620
4621OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
4622 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4623 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4624 bool HasSimdModifier, bool HasMonotonicModifier,
4625 bool HasNonmonotonicModifier, bool HasOrderedClause,
4626 WorksharingLoopType LoopType) {
4627 if (Config.isTargetDevice())
4628 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4629 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4630 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4631 HasNonmonotonicModifier, HasOrderedClause);
4632
4633 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4634 OMPScheduleType::ModifierOrdered;
4635 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4636 case OMPScheduleType::BaseStatic:
4637 assert(!ChunkSize && "No chunk size with static (non-chunked) schedule");
4638 if (IsOrdered)
4639 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4640 NeedsBarrier, ChunkSize);
4641 // FIXME: Monotonicity ignored?
4642 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4643
4644 case OMPScheduleType::BaseStaticChunked:
4645 if (IsOrdered)
4646 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4647 NeedsBarrier, ChunkSize);
4648 // FIXME: Monotonicity ignored?
4649 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4650 ChunkSize);
4651
4652 case OMPScheduleType::BaseRuntime:
4653 case OMPScheduleType::BaseAuto:
4654 case OMPScheduleType::BaseGreedy:
4655 case OMPScheduleType::BaseBalanced:
4656 case OMPScheduleType::BaseSteal:
4657 case OMPScheduleType::BaseGuidedSimd:
4658 case OMPScheduleType::BaseRuntimeSimd:
4659 assert(!ChunkSize &&
4660 "schedule type does not support user-defined chunk sizes");
4661 [[fallthrough]];
4662 case OMPScheduleType::BaseDynamicChunked:
4663 case OMPScheduleType::BaseGuidedChunked:
4664 case OMPScheduleType::BaseGuidedIterativeChunked:
4665 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4666 case OMPScheduleType::BaseStaticBalancedChunked:
4667 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4668 NeedsBarrier, ChunkSize);
4669
4670 default:
4671 llvm_unreachable("Unknown/unimplemented schedule kind");
4672 }
4673}
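
// Illustrative usage sketch (an addition for exposition, not upstream code):
// a frontend that has already built a CanonicalLoopInfo would lower
// `#pragma omp for schedule(dynamic)` through applyWorkshareLoop above roughly
// as follows; OMPBuilder, CLI, AllocaIP and DL are assumed surrounding state.
//
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//       OMPBuilder.applyWorkshareLoop(
//           DL, CLI, AllocaIP, /*NeedsBarrier=*/true,
//           omp::OMP_SCHEDULE_Dynamic, /*ChunkSize=*/nullptr,
//           /*HasSimdModifier=*/false, /*HasMonotonicModifier=*/false,
//           /*HasNonmonotonicModifier=*/false, /*HasOrderedClause=*/false,
//           WorksharingLoopType::ForStaticLoop);
//   if (!AfterIP)
//     return AfterIP.takeError();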
4674
4675/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4676/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4677/// the runtime. Always interpret integers as unsigned similarly to
4678/// CanonicalLoopInfo.
4679static FunctionCallee
4680getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4681 unsigned Bitwidth = Ty->getIntegerBitWidth();
4682 if (Bitwidth == 32)
4683 return OMPBuilder.getOrCreateRuntimeFunction(
4684 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4685 if (Bitwidth == 64)
4686 return OMPBuilder.getOrCreateRuntimeFunction(
4687 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4688 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4689}
4690
4691 /// Returns an LLVM function to call for fetching the next chunk of the loop
4692 /// using OpenMP dynamic scheduling depending on `type`. Only i32 and i64 are
4693 /// supported by the runtime. Always interpret integers as unsigned similarly
4694 /// to CanonicalLoopInfo.
4695static FunctionCallee
4696getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4697 unsigned Bitwidth = Ty->getIntegerBitWidth();
4698 if (Bitwidth == 32)
4699 return OMPBuilder.getOrCreateRuntimeFunction(
4700 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4701 if (Bitwidth == 64)
4702 return OMPBuilder.getOrCreateRuntimeFunction(
4703 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4704 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4705}
4706
4707 /// Returns an LLVM function to call for finalizing the dynamic loop
4708 /// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4709 /// interpret integers as unsigned similarly to CanonicalLoopInfo.
4710static FunctionCallee
4711getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4712 unsigned Bitwidth = Ty->getIntegerBitWidth();
4713 if (Bitwidth == 32)
4714 return OMPBuilder.getOrCreateRuntimeFunction(
4715 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4716 if (Bitwidth == 64)
4717 return OMPBuilder.getOrCreateRuntimeFunction(
4718 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4719 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4720}
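
// Taken together, the three helpers above select the runtime's dispatch
// protocol entry points. As a sketch of the usual libomp semantics (32-bit
// unsigned variants shown; an exposition aid, not upstream code), the IR
// emitted by applyDynamicWorkshareLoop below behaves like:
//
//   __kmpc_dispatch_init_4u(loc, tid, sched, /*lb=*/1, /*ub=*/tripcount,
//                           /*stride=*/1, chunk);
//   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &stride)) {
//     for (uint32_t iv = lb - 1; iv < ub; ++iv) // runtime bounds are 1-based
//       body(iv);                               // and inclusive
//     // with the ordered modifier, each chunk is finished with
//     // __kmpc_dispatch_fini_4u(loc, tid);
//   }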
4721
4722OpenMPIRBuilder::InsertPointOrErrorTy
4723OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4724 InsertPointTy AllocaIP,
4725 OMPScheduleType SchedType,
4726 bool NeedsBarrier, Value *Chunk) {
4727 assert(CLI->isValid() && "Requires a valid canonical loop");
4728 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4729 "Require dedicated allocate IP");
4731 "Require valid schedule type");
4732
4733 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4734 OMPScheduleType::ModifierOrdered;
4735
4736 // Set up the source location value for OpenMP runtime.
4737 Builder.SetCurrentDebugLocation(DL);
4738
4739 uint32_t SrcLocStrSize;
4740 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4741 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4742
4743 // Declare useful OpenMP runtime functions.
4744 Value *IV = CLI->getIndVar();
4745 Type *IVTy = IV->getType();
4746 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4747 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4748
4749 // Allocate space for computed loop bounds as expected by the "init" function.
4750 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4751 Type *I32Type = Type::getInt32Ty(M.getContext());
4752 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4753 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4754 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4755 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4756
4757 // At the end of the preheader, prepare for calling the "init" function by
4758 // storing the current loop bounds into the allocated space. A canonical loop
4759 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4760 // and produces an inclusive upper bound.
4761 BasicBlock *PreHeader = CLI->getPreheader();
4762 Builder.SetInsertPoint(PreHeader->getTerminator());
4763 Constant *One = ConstantInt::get(IVTy, 1);
4764 Builder.CreateStore(One, PLowerBound);
4765 Value *UpperBound = CLI->getTripCount();
4766 Builder.CreateStore(UpperBound, PUpperBound);
4767 Builder.CreateStore(One, PStride);
4768
4769 BasicBlock *Header = CLI->getHeader();
4770 BasicBlock *Exit = CLI->getExit();
4771 BasicBlock *Cond = CLI->getCond();
4772 BasicBlock *Latch = CLI->getLatch();
4773 InsertPointTy AfterIP = CLI->getAfterIP();
4774
4775 // The CLI will be "broken" in the code below, as the loop is no longer
4776 // a valid canonical loop.
4777
4778 if (!Chunk)
4779 Chunk = One;
4780
4781 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4782
4783 Constant *SchedulingType =
4784 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4785
4786 // Call the "init" function.
4787 Builder.CreateCall(DynamicInit,
4788 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4789 UpperBound, /* step */ One, Chunk});
4790
4791 // An outer loop around the existing one.
4792 BasicBlock *OuterCond = BasicBlock::Create(
4793 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4794 PreHeader->getParent());
4795 // This needs to be 32-bit always, so it can't reuse the IVTy constants above.
4796 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4797 Value *Res =
4798 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4799 PLowerBound, PUpperBound, PStride});
4800 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4801 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4802 Value *LowerBound =
4803 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4804 Builder.CreateCondBr(MoreWork, Header, Exit);
4805
4806 // Change PHI-node in loop header to use outer cond rather than preheader,
4807 // and set IV to the LowerBound.
4808 Instruction *Phi = &Header->front();
4809 auto *PI = cast<PHINode>(Phi);
4810 PI->setIncomingBlock(0, OuterCond);
4811 PI->setIncomingValue(0, LowerBound);
4812
4813 // Then set the pre-header to jump to the OuterCond
4814 Instruction *Term = PreHeader->getTerminator();
4815 auto *Br = cast<BranchInst>(Term);
4816 Br->setSuccessor(0, OuterCond);
4817
4818 // Modify the inner condition:
4819 // * Use the UpperBound returned from the DynamicNext call.
4820 // * Jump to the outer loop when done with one of the inner loops.
4821 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4822 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4823 Instruction *Comp = &*Builder.GetInsertPoint();
4824 auto *CI = cast<CmpInst>(Comp);
4825 CI->setOperand(1, UpperBound);
4826 // Redirect the inner exit to branch to outer condition.
4827 Instruction *Branch = &Cond->back();
4828 auto *BI = cast<BranchInst>(Branch);
4829 assert(BI->getSuccessor(1) == Exit);
4830 BI->setSuccessor(1, OuterCond);
4831
4832 // Call the "fini" function if "ordered" is present in wsloop directive.
4833 if (Ordered) {
4834 Builder.SetInsertPoint(&Latch->back());
4835 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4836 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4837 }
4838
4839 // Add the barrier if requested.
4840 if (NeedsBarrier) {
4841 Builder.SetInsertPoint(&Exit->back());
4842 InsertPointOrErrorTy BarrierIP =
4843 createBarrier(LocationDescription(Builder.saveIP(), DL),
4844 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4845 /* CheckCancelFlag */ false);
4846 if (!BarrierIP)
4847 return BarrierIP.takeError();
4848 }
4849
4850 CLI->invalidate();
4851 return AfterIP;
4852}
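
// For reference, the control flow produced above looks roughly like this
// (block names simplified; a sketch for exposition, not verbatim output):
//
//   preheader:  __kmpc_dispatch_init_*(...)        ; br %outer.cond
//   outer.cond: %more = __kmpc_dispatch_next_*(..., %p.lastiter,
//                          %p.lowerbound, %p.upperbound, %p.stride)
//               %lb = load(%p.lowerbound) - 1      ; br %more, %header, %exit
//   header:     %iv = phi [%lb, %outer.cond], [%iv.next, %latch]
//   cond:       br (%iv < load(%p.upperbound)), %body, %outer.cond
//   latch:      %iv.next = %iv + 1                 ; fini call here if ordered
//   exit:       optional __kmpc_barrier when NeedsBarrier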
4853
4854/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4855/// after this \p OldTarget will be orphaned.
4856static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4857 BasicBlock *NewTarget, DebugLoc DL) {
4858 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4859 redirectTo(Pred, NewTarget, DL);
4860}
4861
4862/// Determine which blocks in \p BBs are reachable from outside and remove the
4863/// ones that are not reachable from the function.
4864static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4865 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4866 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4867 for (Use &U : BB->uses()) {
4868 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4869 if (!UseInst)
4870 continue;
4871 if (BBsToErase.count(UseInst->getParent()))
4872 continue;
4873 return true;
4874 }
4875 return false;
4876 };
4877
4878 while (BBsToErase.remove_if(HasRemainingUses)) {
4879 // Try again if anything was removed.
4880 }
4881
4882 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4883 DeleteDeadBlocks(BBVec);
4884}
4885
4886CanonicalLoopInfo *
4887OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4888 InsertPointTy ComputeIP) {
4889 assert(Loops.size() >= 1 && "At least one loop required");
4890 size_t NumLoops = Loops.size();
4891
4892 // Nothing to do if there is already just one loop.
4893 if (NumLoops == 1)
4894 return Loops.front();
4895
4896 CanonicalLoopInfo *Outermost = Loops.front();
4897 CanonicalLoopInfo *Innermost = Loops.back();
4898 BasicBlock *OrigPreheader = Outermost->getPreheader();
4899 BasicBlock *OrigAfter = Outermost->getAfter();
4900 Function *F = OrigPreheader->getParent();
4901
4902 // Loop control blocks that may become orphaned later.
4903 SmallVector<BasicBlock *, 12> OldControlBBs;
4904 OldControlBBs.reserve(6 * Loops.size());
4905 for (CanonicalLoopInfo *Loop : Loops)
4906 Loop->collectControlBlocks(OldControlBBs);
4907
4908 // Setup the IRBuilder for inserting the trip count computation.
4909 Builder.SetCurrentDebugLocation(DL);
4910 if (ComputeIP.isSet())
4911 Builder.restoreIP(ComputeIP);
4912 else
4913 Builder.restoreIP(Outermost->getPreheaderIP());
4914
4915 // Derive the collapsed loop's trip count.
4916 // TODO: Find common/largest indvar type.
4917 Value *CollapsedTripCount = nullptr;
4918 for (CanonicalLoopInfo *L : Loops) {
4919 assert(L->isValid() &&
4920 "All loops to collapse must be valid canonical loops");
4921 Value *OrigTripCount = L->getTripCount();
4922 if (!CollapsedTripCount) {
4923 CollapsedTripCount = OrigTripCount;
4924 continue;
4925 }
4926
4927 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4928 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4929 {}, /*HasNUW=*/true);
4930 }
4931
4932 // Create the collapsed loop control flow.
4933 CanonicalLoopInfo *Result =
4934 createLoopSkeleton(DL, CollapsedTripCount, F,
4935 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4936
4937 // Build the collapsed loop body code.
4938 // Start with deriving the input loop induction variables from the collapsed
4939 // one, using a divmod scheme. To preserve the original loops' order, the
4940 // innermost loop uses the least significant bits.
4941 Builder.restoreIP(Result->getBodyIP());
4942
4943 Value *Leftover = Result->getIndVar();
4944 SmallVector<Value *> NewIndVars;
4945 NewIndVars.resize(NumLoops);
4946 for (int i = NumLoops - 1; i >= 1; --i) {
4947 Value *OrigTripCount = Loops[i]->getTripCount();
4948
4949 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4950 NewIndVars[i] = NewIndVar;
4951
4952 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4953 }
4954 // Outermost loop gets all the remaining bits.
4955 NewIndVars[0] = Leftover;
4956
4957 // Construct the loop body control flow.
4958 // We progressively construct the branch structure following the direction of
4959 // the control flow: from the leading in-between code, through the loop nest
4960 // body and the trailing in-between code, to rejoining the collapsed loop's
4961 // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
4962 // next edge. If ContinueBlock is set, continue with that block. If
4963 // ContinuePred is set, use its predecessors as sources.
4964 BasicBlock *ContinueBlock = Result->getBody();
4965 BasicBlock *ContinuePred = nullptr;
4966 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4967 BasicBlock *NextSrc) {
4968 if (ContinueBlock)
4969 redirectTo(ContinueBlock, Dest, DL);
4970 else
4971 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4972
4973 ContinueBlock = nullptr;
4974 ContinuePred = NextSrc;
4975 };
4976
4977 // The code before the nested loop of each level.
4978 // Because we are sinking it into the nest, it will be executed more often
4979 // than in the original loop. More sophisticated schemes could keep track of
4980 // what the in-between code is and instantiate it only once per thread.
4981 for (size_t i = 0; i < NumLoops - 1; ++i)
4982 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4983
4984 // Connect the loop nest body.
4985 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4986
4987 // The code after the nested loop at each level.
4988 for (size_t i = NumLoops - 1; i > 0; --i)
4989 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4990
4991 // Connect the finished loop to the collapsed loop latch.
4992 ContinueWith(Result->getLatch(), nullptr);
4993
4994 // Replace the input loops with the new collapsed loop.
4995 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4996 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4997
4998 // Replace the input loop indvars with the derived ones.
4999 for (size_t i = 0; i < NumLoops; ++i)
5000 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5001
5002 // Remove unused parts of the input loops.
5003 removeUnusedBlocksFromParent(OldControlBBs);
5004
5005 for (CanonicalLoopInfo *L : Loops)
5006 L->invalidate();
5007
5008#ifndef NDEBUG
5009 Result->assertOK();
5010#endif
5011 return Result;
5012}
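
// Worked example (for exposition): collapsing a 2-deep nest with trip counts
// M (outer) and N (inner) produces one loop with trip count M*N. The divmod
// scheme above recovers the original induction variables from the collapsed
// one:
//
//   j = idx % N;  // innermost loop: least significant part
//   i = idx / N;  // outermost loop: remaining bits
//
// Since idx = i*N + j, the collapsed loop visits (i, j) in the original
// lexicographic order.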
5013
5014std::vector<CanonicalLoopInfo *>
5015OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5016 ArrayRef<Value *> TileSizes) {
5017 assert(TileSizes.size() == Loops.size() &&
5018 "Must pass as many tile sizes as there are loops");
5019 int NumLoops = Loops.size();
5020 assert(NumLoops >= 1 && "At least one loop to tile required");
5021
5022 CanonicalLoopInfo *OutermostLoop = Loops.front();
5023 CanonicalLoopInfo *InnermostLoop = Loops.back();
5024 Function *F = OutermostLoop->getBody()->getParent();
5025 BasicBlock *InnerEnter = InnermostLoop->getBody();
5026 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5027
5028 // Loop control blocks that may become orphaned later.
5029 SmallVector<BasicBlock *, 12> OldControlBBs;
5030 OldControlBBs.reserve(6 * Loops.size());
5031 for (CanonicalLoopInfo *Loop : Loops)
5032 Loop->collectControlBlocks(OldControlBBs);
5033
5034 // Collect original trip counts and induction variables to be accessible by
5035 // index. Also, the structure of the original loops is not preserved during
5036 // the construction of the tiled loops, so do it before we scavenge the BBs of
5037 // any original CanonicalLoopInfo.
5038 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5039 for (CanonicalLoopInfo *L : Loops) {
5040 assert(L->isValid() && "All input loops must be valid canonical loops");
5041 OrigTripCounts.push_back(L->getTripCount());
5042 OrigIndVars.push_back(L->getIndVar());
5043 }
5044
5045 // Collect the code between loop headers. These may contain SSA definitions
5046 // that are used in the loop nest body. To be usable within the innermost
5047 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5048 // these instructions may be executed more often than before the tiling.
5049 // TODO: It would be sufficient to only sink them into body of the
5050 // corresponding tile loop.
5051 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5052 for (int i = 0; i < NumLoops - 1; ++i) {
5053 CanonicalLoopInfo *Surrounding = Loops[i];
5054 CanonicalLoopInfo *Nested = Loops[i + 1];
5055
5056 BasicBlock *EnterBB = Surrounding->getBody();
5057 BasicBlock *ExitBB = Nested->getHeader();
5058 InbetweenCode.emplace_back(EnterBB, ExitBB);
5059 }
5060
5061 // Compute the trip counts of the floor loops.
5062 Builder.SetCurrentDebugLocation(DL);
5063 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5064 SmallVector<Value *, 4> FloorCount, FloorRems;
5065 for (int i = 0; i < NumLoops; ++i) {
5066 Value *TileSize = TileSizes[i];
5067 Value *OrigTripCount = OrigTripCounts[i];
5068 Type *IVType = OrigTripCount->getType();
5069
5070 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5071 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5072
5073 // 0 if the tilesize divides the tripcount, 1 otherwise.
5074 // 1 means we need an additional iteration for a partial tile.
5075 //
5076 // Unfortunately we cannot just use the roundup-formula
5077 // (tripcount + tilesize - 1)/tilesize
5078 // because the summation might overflow. We do not want to introduce
5079 // undefined behavior when the untiled loop nest did not have any.
5080 Value *FloorTripOverflow =
5081 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5082
5083 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5084 FloorTripCount =
5085 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5086 "omp_floor" + Twine(i) + ".tripcount", true);
5087
5088 // Remember some values for later use.
5089 FloorCount.push_back(FloorTripCount);
5090 FloorRems.push_back(FloorTripRem);
5091 }
5092
5093 // Generate the new loop nest, from the outermost to the innermost.
5094 std::vector<CanonicalLoopInfo *> Result;
5095 Result.reserve(NumLoops * 2);
5096
5097 // The basic block of the surrounding loop that enters the generated loop
5098 // nest.
5099 BasicBlock *Enter = OutermostLoop->getPreheader();
5100
5101 // The basic block of the surrounding loop where the inner code should
5102 // continue.
5103 BasicBlock *Continue = OutermostLoop->getAfter();
5104
5105 // Where the next loop basic block should be inserted.
5106 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5107
5108 auto EmbeddNewLoop =
5109 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5110 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5111 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5112 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5113 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5114 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5115
5116 // Setup the position where the next embedded loop connects to this loop.
5117 Enter = EmbeddedLoop->getBody();
5118 Continue = EmbeddedLoop->getLatch();
5119 OutroInsertBefore = EmbeddedLoop->getLatch();
5120 return EmbeddedLoop;
5121 };
5122
5123 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5124 const Twine &NameBase) {
5125 for (auto P : enumerate(TripCounts)) {
5126 CanonicalLoopInfo *EmbeddedLoop =
5127 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5128 Result.push_back(EmbeddedLoop);
5129 }
5130 };
5131
5132 EmbeddNewLoops(FloorCount, "floor");
5133
5134 // Within the innermost floor loop, emit the code that computes the tile
5135 // sizes.
5136 Builder.restoreIP(Result.back()->getBodyIP());
5137 SmallVector<Value *, 4> TileCounts;
5138 for (int i = 0; i < NumLoops; ++i) {
5139 CanonicalLoopInfo *FloorLoop = Result[i];
5140 Value *TileSize = TileSizes[i];
5141
5142 Value *FloorIsEpilogue =
5143 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5144 Value *TileTripCount =
5145 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5146
5147 TileCounts.push_back(TileTripCount);
5148 }
5149
5150 // Create the tile loops.
5151 EmbeddNewLoops(TileCounts, "tile");
5152
5153 // Insert the inbetween code into the body.
5154 BasicBlock *BodyEnter = Enter;
5155 BasicBlock *BodyEntered = nullptr;
5156 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5157 BasicBlock *EnterBB = P.first;
5158 BasicBlock *ExitBB = P.second;
5159
5160 if (BodyEnter)
5161 redirectTo(BodyEnter, EnterBB, DL);
5162 else
5163 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5164
5165 BodyEnter = nullptr;
5166 BodyEntered = ExitBB;
5167 }
5168
5169 // Append the original loop nest body into the generated loop nest body.
5170 if (BodyEnter)
5171 redirectTo(BodyEnter, InnerEnter, DL);
5172 else
5173 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5174 redirectAllPredecessorsTo(InnerLatch, Result.back()->getLatch(), DL);
5175
5176 // Replace the original induction variable with an induction variable computed
5177 // from the tile and floor induction variables.
5178 Builder.restoreIP(Result.back()->getBodyIP());
5179 for (int i = 0; i < NumLoops; ++i) {
5180 CanonicalLoopInfo *FloorLoop = Result[i];
5181 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5182 Value *OrigIndVar = OrigIndVars[i];
5183 Value *Size = TileSizes[i];
5184
5185 Value *Scale =
5186 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5187 Value *Shift =
5188 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5189 OrigIndVar->replaceAllUsesWith(Shift);
5190 }
5191
5192 // Remove unused parts of the original loops.
5193 removeUnusedBlocksFromParent(OldControlBBs);
5194
5195 for (CanonicalLoopInfo *L : Loops)
5196 L->invalidate();
5197
5198#ifndef NDEBUG
5199 for (CanonicalLoopInfo *GenL : Result)
5200 GenL->assertOK();
5201#endif
5202 return Result;
5203}
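
// Worked example (for exposition): tiling one loop of trip count TC with tile
// size TS yields a floor loop and a tile loop. The floor trip count uses the
// overflow-safe round-up computed above:
//
//   floor_tc = TC / TS + (TC % TS != 0 ? 1 : 0);
//
// The tile loop runs TS iterations, except in the last floor iteration where
// it runs the TC % TS iterations of the partial tile, and the original
// induction variable is rebuilt as
//
//   iv = TS * floor_iv + tile_iv;   // the Scale/Shift pair above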
5204
5205/// Attach metadata \p Properties to the basic block described by \p BB. If the
5206/// basic block already has metadata, the basic block properties are appended.
5207static void addBasicBlockMetadata(BasicBlock *BB,
5208 ArrayRef<Metadata *> Properties) {
5209 // Nothing to do if no property to attach.
5210 if (Properties.empty())
5211 return;
5212
5213 LLVMContext &Ctx = BB->getContext();
5214 SmallVector<Metadata *> NewProperties;
5215 NewProperties.push_back(nullptr);
5216
5217 // If the basic block already has metadata, prepend it to the new metadata.
5218 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5219 if (Existing)
5220 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5221
5222 append_range(NewProperties, Properties);
5223 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5224 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5225
5226 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5227}
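
// IR-level sketch (for exposition): after attaching, e.g., the property
// "llvm.loop.unroll.enable" to a loop latch, the latch terminator carries the
// standard self-referencing !llvm.loop node built above:
//
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}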
5228
5229/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5230/// loop already has metadata, the loop properties are appended.
5231static void addLoopMetadata(CanonicalLoopInfo *Loop,
5232 ArrayRef<Metadata *> Properties) {
5233 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5234
5235 // Attach metadata to the loop's latch
5236 BasicBlock *Latch = Loop->getLatch();
5237 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5238 addBasicBlockMetadata(Latch, Properties);
5239}
5240
5241/// Attach llvm.access.group metadata to the memref instructions of \p Block
5242static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5243 LoopInfo &LI) {
5244 for (Instruction &I : *Block) {
5245 if (I.mayReadOrWriteMemory()) {
5246 // TODO: This instruction may already have access group from
5247 // other pragmas e.g. #pragma clang loop vectorize. Append
5248 // so that the existing metadata is not overwritten.
5249 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5250 }
5251 }
5252}
5253
5254void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5255 LLVMContext &Ctx = Builder.getContext();
5256 addLoopMetadata(
5257 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5258 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5259}
5260
5261void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5262 LLVMContext &Ctx = Builder.getContext();
5263 addLoopMetadata(
5264 Loop, {
5265 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5266 });
5267}
5268
5269void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5270 Value *IfCond, ValueToValueMapTy &VMap,
5271 const Twine &NamePrefix) {
5272 Function *F = CanonicalLoop->getFunction();
5273
5274 // Define where if branch should be inserted
5275 Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5276
5277 // TODO: We should not rely on pass manager. Currently we use pass manager
5278 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5279 // object. We should have a method which returns all blocks between
5280 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5281 FunctionAnalysisManager FAM;
5282 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5283 FAM.registerPass([]() { return LoopAnalysis(); });
5284 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5285
5286 // Get the loop which needs to be cloned
5287 LoopAnalysis LIA;
5288 LoopInfo &&LI = LIA.run(*F, FAM);
5289 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5290
5291 // Create additional blocks for the if statement
5292 BasicBlock *Head = SplitBefore->getParent();
5293 Instruction *HeadOldTerm = Head->getTerminator();
5294 llvm::LLVMContext &C = Head->getContext();
5295 BasicBlock *ThenBlock = BasicBlock::Create(
5296 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5297 BasicBlock *ElseBlock = BasicBlock::Create(
5298 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5299
5300 // Create if condition branch.
5301 Builder.SetInsertPoint(HeadOldTerm);
5302 Instruction *BrInstr =
5303 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5304 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5305 // Then block contains branch to omp loop which needs to be vectorized
5306 spliceBB(IP, ThenBlock, false);
5307 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5308
5309 Builder.SetInsertPoint(ElseBlock);
5310
5311 // Clone loop for the else branch
5312 SmallVector<BasicBlock *, 8> NewBlocks;
5313
5314 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5315 for (BasicBlock *Block : L->getBlocks()) {
5316 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5317 NewBB->moveBefore(CanonicalLoop->getExit());
5318 VMap[Block] = NewBB;
5319 NewBlocks.push_back(NewBB);
5320 }
5321 remapInstructionsInBlocks(NewBlocks, VMap);
5322 Builder.CreateBr(NewBlocks.front());
5323}
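
// Resulting shape (sketch for exposition): with NamePrefix == "simd", the
// preheader's block is split around a two-way branch so the same source loop
// exists in two versions:
//
//   head:          br i1 %ifcond, label %simd.if.then, label %simd.if.else
//   simd.if.then:  the original canonical loop (later marked vectorizable)
//   simd.if.else:  the cloned loop; the caller (applySimd below) marks it
//                  with llvm.loop.vectorize.enable = false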
5324
5325unsigned
5326OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5327 const StringMap<bool> &Features) {
5328 if (TargetTriple.isX86()) {
5329 if (Features.lookup("avx512f"))
5330 return 512;
5331 else if (Features.lookup("avx"))
5332 return 256;
5333 return 128;
5334 }
5335 if (TargetTriple.isPPC())
5336 return 128;
5337 if (TargetTriple.isWasm())
5338 return 128;
5339 return 0;
5340}
5341
5342void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5343 MapVector<Value *, Value *> AlignedVars,
5344 Value *IfCond, OrderKind Order,
5345 ConstantInt *Simdlen, ConstantInt *Safelen) {
5346 LLVMContext &Ctx = Builder.getContext();
5347
5348 Function *F = CanonicalLoop->getFunction();
5349
5350 // TODO: We should not rely on pass manager. Currently we use pass manager
5351 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5352 // object. We should have a method which returns all blocks between
5353 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5354 FunctionAnalysisManager FAM;
5355 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5356 FAM.registerPass([]() { return LoopAnalysis(); });
5357 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5358
5359 LoopAnalysis LIA;
5360 LoopInfo &&LI = LIA.run(*F, FAM);
5361
5362 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5363 if (AlignedVars.size()) {
5364 InsertPointTy IP = Builder.saveIP();
5365 for (auto &AlignedItem : AlignedVars) {
5366 Value *AlignedPtr = AlignedItem.first;
5367 Value *Alignment = AlignedItem.second;
5368 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5369 Builder.SetInsertPoint(loadInst->getNextNode());
5370 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5371 Alignment);
5372 }
5373 Builder.restoreIP(IP);
5374 }
5375
5376 if (IfCond) {
5377 ValueToValueMapTy VMap;
5378 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5379 // Add metadata to the cloned loop which disables vectorization
5380 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5381 assert(MappedLatch &&
5382 "Cannot find value which corresponds to original loop latch");
5383 assert(isa<BasicBlock>(MappedLatch) &&
5384 "Cannot cast mapped latch block value to BasicBlock");
5385 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5386 ConstantAsMetadata *BoolConst =
5387 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5388 addBasicBlockMetadata(
5389 NewLatchBlock,
5390 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5391 BoolConst})});
5392 }
5393
5394 SmallSet<BasicBlock *, 8> Reachable;
5395
5396 // Get the basic blocks from the loop in which memref instructions
5397 // can be found.
5398 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5399 // preferably without running any passes.
5400 for (BasicBlock *Block : L->getBlocks()) {
5401 if (Block == CanonicalLoop->getCond() ||
5402 Block == CanonicalLoop->getHeader())
5403 continue;
5404 Reachable.insert(Block);
5405 }
5406
5407 SmallVector<Metadata *> LoopMDList;
5408
5409 // In the presence of a finite 'safelen', it may be unsafe to mark all
5410 // the memory instructions parallel, because loop-carried
5411 // dependences of 'safelen' iterations are possible.
5412 // If the clause order(concurrent) is specified then the memory instructions
5413 // are marked parallel even if 'safelen' is finite.
5414 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5415 // Add access group metadata to memory-access instructions.
5416 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5417 for (BasicBlock *BB : Reachable)
5418 addSimdMetadata(BB, AccessGroup, LI);
5419 // TODO: If the loop has existing parallel access metadata, have
5420 // to combine two lists.
5421 LoopMDList.push_back(MDNode::get(
5422 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5423 }
5424
5425 // Use the above access group metadata to create loop level
5426 // metadata, which should be distinct for each loop.
5427 ConstantAsMetadata *BoolConst =
5428 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5429 LoopMDList.push_back(MDNode::get(
5430 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5431
5432 if (Simdlen || Safelen) {
5433 // If both simdlen and safelen clauses are specified, the value of the
5434 // simdlen parameter must be less than or equal to the value of the safelen
5435 // parameter. Therefore, use safelen only in the absence of simdlen.
5436 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5437 LoopMDList.push_back(
5438 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5439 ConstantAsMetadata::get(VectorizeWidth)}));
5440 }
5441
5442 addLoopMetadata(CanonicalLoop, LoopMDList);
5443}
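
// Net effect (sketch, for exposition): for `#pragma omp simd simdlen(8)`
// without a safelen clause, the loop latch ends up with metadata along the
// lines of
//
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}                        ; the access group
//
// and each memory access in the loop body carries !llvm.access.group !4.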
5444
5445/// Create the TargetMachine object to query the backend for optimization
5446/// preferences.
5447///
5448/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5449/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5450 /// needed for the LLVM pass pipeline. We use some default options to avoid
5451/// having to pass too many settings from the frontend that probably do not
5452/// matter.
5453///
5454/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5455/// method. If we are going to use TargetMachine for more purposes, especially
5456/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5457 /// might become worth requiring front-ends to pass on their TargetMachine,
5458 /// or at least cache it between methods. Note that while frontends such as Clang
5459 /// have just a single main TargetMachine per translation unit, "target-cpu" and
5460 /// "target-features" that determine the TargetMachine are per-function and can
5461 /// be overridden using __attribute__((target("OPTIONS"))).
5462static std::unique_ptr<TargetMachine>
5463createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
5464 Module *M = F->getParent();
5465
5466 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5467 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5468 const std::string &Triple = M->getTargetTriple();
5469
5470 std::string Error;
5471 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5472 if (!TheTarget)
5473 return {};
5474
5475 llvm::TargetOptions Options;
5476 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5477 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5478 /*CodeModel=*/std::nullopt, OptLevel));
5479}
5480
5481 /// Heuristically determine the best-performing unroll factor for \p CLI. This
5482/// depends on the target processor. We are re-using the same heuristics as the
5483/// LoopUnrollPass.
5484static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5485 Function *F = CLI->getFunction();
5486
5487 // Assume the user requests the most aggressive unrolling, even if the rest of
5488 // the code is optimized using a lower setting.
5489 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5490 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5491
5492 FunctionAnalysisManager FAM;
5493 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5494 FAM.registerPass([]() { return AssumptionAnalysis(); });
5495 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5496 FAM.registerPass([]() { return LoopAnalysis(); });
5497 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5498 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5499 TargetIRAnalysis TIRA;
5500 if (TM)
5501 TIRA = TargetIRAnalysis(
5502 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5503 FAM.registerPass([&]() { return TIRA; });
5504
5505 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5506 ScalarEvolutionAnalysis SEA;
5507 ScalarEvolution &&SE = SEA.run(*F, FAM);
5508 DominatorTreeAnalysis DTA;
5509 DominatorTree &&DT = DTA.run(*F, FAM);
5510 LoopAnalysis LIA;
5511 LoopInfo &&LI = LIA.run(*F, FAM);
5512 AssumptionAnalysis ACT;
5513 AssumptionCache &&AC = ACT.run(*F, FAM);
5514 OptimizationRemarkEmitter ORE{F};
5515
5516 Loop *L = LI.getLoopFor(CLI->getHeader());
5517 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5518
5519 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
5520 L, SE, TTI,
5521 /*BlockFrequencyInfo=*/nullptr,
5522 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5523 /*UserThreshold=*/std::nullopt,
5524 /*UserCount=*/std::nullopt,
5525 /*UserAllowPartial=*/true,
5526 /*UserAllowRuntime=*/true,
5527 /*UserUpperBound=*/std::nullopt,
5528 /*UserFullUnrollMaxCount=*/std::nullopt);
5529
5530 UP.Force = true;
5531
5532 // Account for additional optimizations taking place before the LoopUnrollPass
5533 // would unroll the loop.
5534 UP.Threshold *= UnrollThresholdFactor;
5535 UP.PartialThreshold *= UnrollThresholdFactor;
5536
5537 // Use normal unroll factors even if the rest of the code is optimized for
5538 // size.
5539 UP.OptSizeThreshold = UP.Threshold;
5540 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5541
5542 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5543 << " Threshold=" << UP.Threshold << "\n"
5544 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5545 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5546 << " PartialOptSizeThreshold="
5547 << UP.PartialOptSizeThreshold << "\n");
5548
5549 // Disable peeling.
5550 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
5551 L, SE, TTI,
5552 /*UserAllowPeeling=*/false,
5553 /*UserAllowProfileBasedPeeling=*/false,
5554 /*UnrollingSpecficValues=*/false);
5555
5557 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5558
5559 // Assume that reads and writes to stack variables can be eliminated by
5560 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5561 // size.
5562 for (BasicBlock *BB : L->blocks()) {
5563 for (Instruction &I : *BB) {
5564 Value *Ptr;
5565 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5566 Ptr = Load->getPointerOperand();
5567 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5568 Ptr = Store->getPointerOperand();
5569 } else
5570 continue;
5571
5572 Ptr = Ptr->stripPointerCasts();
5573
5574 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5575 if (Alloca->getParent() == &F->getEntryBlock())
5576 EphValues.insert(&I);
5577 }
5578 }
5579 }
5580
5581 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5582
5583 // Loop is not unrollable if the loop contains certain instructions.
5584 if (!UCE.canUnroll()) {
5585 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5586 return 1;
5587 }
5588
5589 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5590 << "\n");
5591
5592 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5593 // be able to use it.
5594 int TripCount = 0;
5595 int MaxTripCount = 0;
5596 bool MaxOrZero = false;
5597 unsigned TripMultiple = 0;
5598
5599 bool UseUpperBound = false;
5600 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5601 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5602 UseUpperBound);
5603 unsigned Factor = UP.Count;
5604 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5605
5606 // This function returns 1 to signal that the loop should not be unrolled.
5607 if (Factor == 0)
5608 return 1;
5609 return Factor;
5610}
5611
5612void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
5613 int32_t Factor,
5614 CanonicalLoopInfo **UnrolledCLI) {
5615 assert(Factor >= 0 && "Unroll factor must not be negative");
5616
5617 Function *F = Loop->getFunction();
5618 LLVMContext &Ctx = F->getContext();
5619
5620 // If the unrolled loop is not used for another loop-associated directive, it
5621 // is sufficient to add metadata for the LoopUnrollPass.
5622 if (!UnrolledCLI) {
5623 SmallVector<Metadata *, 2> LoopMetadata;
5624 LoopMetadata.push_back(
5625 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5626
5627 if (Factor >= 1) {
5628 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5629 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5630 LoopMetadata.push_back(MDNode::get(
5631 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5632 }
5633
5634 addLoopMetadata(Loop, LoopMetadata);
5635 return;
5636 }
5637
5638 // Heuristically determine the unroll factor.
5639 if (Factor == 0)
5640 Factor = computeHeuristicUnrollFactor(Loop);
5641
5642 // No change required with unroll factor 1.
5643 if (Factor == 1) {
5644 *UnrolledCLI = Loop;
5645 return;
5646 }
5647
5648 assert(Factor >= 2 &&
5649 "unrolling only makes sense with a factor of 2 or larger");
5650
5651 Type *IndVarTy = Loop->getIndVarType();
5652
5653 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5654 // unroll the inner loop.
5655 Value *FactorVal =
5656 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5657 /*isSigned=*/false));
5658 std::vector<CanonicalLoopInfo *> LoopNest =
5659 tileLoops(DL, {Loop}, {FactorVal});
5660 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5661 *UnrolledCLI = LoopNest[0];
5662 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5663
5664 // LoopUnrollPass can only fully unroll loops with constant trip count.
5665 // Unroll by the unroll factor with a fallback epilog for the remainder
5666 // iterations if necessary.
5667 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5668 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5669 addLoopMetadata(
5670 InnerLoop,
5671 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5672 MDNode::get(
5673 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5674
5675#ifndef NDEBUG
5676 (*UnrolledCLI)->assertOK();
5677#endif
5678}
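
// Illustrative sketch: unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled)
// rewrites the loop as a {floor, tile} nest with tile size 4 and asks
// LoopUnrollPass (via metadata) to fully unroll the inner tile loop:
//
//   for (iv = 0; iv < TC; ++iv) body(iv);
// becomes, conceptually,
//   for (f = 0; f < floor_tc; ++f)
//     for (t = 0; t < (last_partial_tile ? TC % 4 : 4); ++t)  // unrolled
//       body(4*f + t);
//
// *UnrolledCLI is set to the floor loop so further loop-associated directives
// can be applied to it.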
5679
5680OpenMPIRBuilder::InsertPointTy
5681OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
5682 llvm::Value *BufSize, llvm::Value *CpyBuf,
5683 llvm::Value *CpyFn, llvm::Value *DidIt) {
5684 if (!updateToLocation(Loc))
5685 return Loc.IP;
5686
5687 uint32_t SrcLocStrSize;
5688 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5689 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5690 Value *ThreadId = getOrCreateThreadID(Ident);
5691
5692 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5693
5694 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5695
5696 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5697 Builder.CreateCall(Fn, Args);
5698
5699 return Builder.saveIP();
5700}
5701
5702OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
5703 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5704 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5705 ArrayRef<llvm::Function *> CPFuncs) {
5706
5707 if (!updateToLocation(Loc))
5708 return Loc.IP;
5709
5710 // If needed allocate and initialize `DidIt` with 0.
5711 // DidIt: flag variable: 1=single thread; 0=not single thread.
5712 llvm::Value *DidIt = nullptr;
5713 if (!CPVars.empty()) {
5714 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
5715 Builder.CreateStore(Builder.getInt32(0), DidIt);
5716 }
5717
5718 Directive OMPD = Directive::OMPD_single;
5719 uint32_t SrcLocStrSize;
5720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5722 Value *ThreadId = getOrCreateThreadID(Ident);
5723 Value *Args[] = {Ident, ThreadId};
5724
5725 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5726 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5727
5728 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5729 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5730
5731 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
5732 if (Error Err = FiniCB(IP))
5733 return Err;
5734
5735 // The thread that executes the single region must set `DidIt` to 1.
5736 // This is used by __kmpc_copyprivate, to know if the caller is the
5737 // single thread or not.
5738 if (DidIt)
5739 Builder.CreateStore(Builder.getInt32(1), DidIt);
5740
5741 return Error::success();
5742 };
5743
5744 // generates the following:
5745 // if (__kmpc_single()) {
5746 // .... single region ...
5747 // __kmpc_end_single
5748 // }
5749 // __kmpc_copyprivate
5750 // __kmpc_barrier
5751
5752 InsertPointOrErrorTy AfterIP =
5753 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5754 /*Conditional*/ true,
5755 /*hasFinalize*/ true);
5756 if (!AfterIP)
5757 return AfterIP.takeError();
5758
5759 if (DidIt) {
5760 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5761 // NOTE BufSize is currently unused, so just pass 0.
5762 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5763 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5764 CPFuncs[I], DidIt);
5765 // NOTE __kmpc_copyprivate already inserts a barrier
5766 } else if (!IsNowait) {
5767 InsertPointOrErrorTy AfterIP =
5768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5769 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5770 /* CheckCancelFlag */ false);
5771 if (!AfterIP)
5772 return AfterIP.takeError();
5773 }
5774 return Builder.saveIP();
5775}
5776
5777OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
5778 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5779 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5780
5781 if (!updateToLocation(Loc))
5782 return Loc.IP;
5783
5784 Directive OMPD = Directive::OMPD_critical;
5785 uint32_t SrcLocStrSize;
5786 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5787 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5788 Value *ThreadId = getOrCreateThreadID(Ident);
5789 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5790 Value *Args[] = {Ident, ThreadId, LockVar};
5791
5792 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5793 Function *RTFn = nullptr;
5794 if (HintInst) {
5795 // Add Hint to entry Args and create call
5796 EnterArgs.push_back(HintInst);
5797 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5798 } else {
5799 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5800 }
5801 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5802
5803 Function *ExitRTLFn =
5804 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5805 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5806
5807 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5808 /*Conditional*/ false, /*hasFinalize*/ true);
5809}
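
// Generated shape (sketch, for exposition) for
// `#pragma omp critical [(name)] [hint(h)]`:
//
//   call void @__kmpc_critical(%ident, %tid, %lock)    ; _with_hint if h given
//   ; ... region body generated by BodyGenCB ...
//   call void @__kmpc_end_critical(%ident, %tid, %lock)
//
// where %lock is the module-level lock variable derived from CriticalName via
// getOMPCriticalRegionLock above.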
5810
5811OpenMPIRBuilder::InsertPointTy
5812OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5813 InsertPointTy AllocaIP, unsigned NumLoops,
5814 ArrayRef<llvm::Value *> StoreValues,
5815 const Twine &Name, bool IsDependSource) {
5816 assert(
5817 llvm::all_of(StoreValues,
5818 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5819 "OpenMP runtime requires depend vec with i64 type");
5820
5821 if (!updateToLocation(Loc))
5822 return Loc.IP;
5823
5824 // Allocate space for vector and generate alloc instruction.
5825 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5826 Builder.restoreIP(AllocaIP);
5827 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5828 ArgsBase->setAlignment(Align(8));
5829 Builder.restoreIP(Loc.IP);
5830
5831 // Store the index value with offset in depend vector.
5832 for (unsigned I = 0; I < NumLoops; ++I) {
5833 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5834 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5835 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5836 STInst->setAlignment(Align(8));
5837 }
5838
5839 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5840 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5841
5842 uint32_t SrcLocStrSize;
5843 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5844 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5845 Value *ThreadId = getOrCreateThreadID(Ident);
5846 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5847
5848 Function *RTLFn = nullptr;
5849 if (IsDependSource)
5850 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5851 else
5852 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5853 Builder.CreateCall(RTLFn, Args);
5854
5855 return Builder.saveIP();
5856}
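
// Usage sketch (assumed frontend state, for exposition): for
// `#pragma omp ordered depend(source)` in a doacross nest of depth 2, a
// frontend passes the current iteration vector as i64 values:
//
//   OMPBuilder.createOrderedDepend(Loc, AllocaIP, /*NumLoops=*/2,
//                                  {IV0, IV1}, "dep.vec",
//                                  /*IsDependSource=*/true);
//
// This stores {IV0, IV1} into the depend vector and calls
// __kmpc_doacross_post; with IsDependSource == false (depend(sink: ...)) the
// same path calls __kmpc_doacross_wait instead.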
5857
5858OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
5859 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5860 FinalizeCallbackTy FiniCB, bool IsThreads) {
5861 if (!updateToLocation(Loc))
5862 return Loc.IP;
5863
5864 Directive OMPD = Directive::OMPD_ordered;
5865 Instruction *EntryCall = nullptr;
5866 Instruction *ExitCall = nullptr;
5867
5868 if (IsThreads) {
5869 uint32_t SrcLocStrSize;
5870 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5871 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5872 Value *ThreadId = getOrCreateThreadID(Ident);
5873 Value *Args[] = {Ident, ThreadId};
5874
5875 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5876 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5877
5878 Function *ExitRTLFn =
5879 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5880 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5881 }
5882
5883 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5884 /*Conditional*/ false, /*hasFinalize*/ true);
5885}
5886
5887OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5888 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5889 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5890 bool HasFinalize, bool IsCancellable) {
5891
5892 if (HasFinalize)
5893 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5894
5895 // Create inlined region's entry and body blocks, in preparation
5896 // for conditional creation
5897 BasicBlock *EntryBB = Builder.GetInsertBlock();
5898 Instruction *SplitPos = EntryBB->getTerminator();
5899 if (!isa_and_nonnull<BranchInst>(SplitPos))
5900 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5901 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5902 BasicBlock *FiniBB =
5903 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5904
5905 Builder.SetInsertPoint(EntryBB->getTerminator());
5906 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5907
5908 // generate body
5909 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
5910 /* CodeGenIP */ Builder.saveIP()))
5911 return Err;
5912
5913 // emit exit call and do any needed finalization.
5914 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5915 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5916 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5917 "Unexpected control flow graph state!!");
5918 InsertPointOrErrorTy AfterIP =
5919 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5920 if (!AfterIP)
5921 return AfterIP.takeError();
5922 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5923 "Unexpected Control Flow State!");
5924 MergeBlockIntoPredecessor(FiniBB);
5925
5926 // If we are skipping the region of a non-conditional, remove the exit
5927 // block, and clear the builder's insertion point.
5928 assert(SplitPos->getParent() == ExitBB &&
5929 "Unexpected Insertion point location!");
5930 auto merged = MergeBlockIntoPredecessor(ExitBB);
5931 BasicBlock *ExitPredBB = SplitPos->getParent();
5932 auto InsertBB = merged ? ExitPredBB : ExitBB;
5933 if (!isa_and_nonnull<BranchInst>(SplitPos))
5934 SplitPos->eraseFromParent();
5935 Builder.SetInsertPoint(InsertBB);
5936
5937 return Builder.saveIP();
5938}
5939
5940OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5941 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5942 // If there is nothing to do, return the current insertion point.
5943 if (!Conditional || !EntryCall)
5944 return Builder.saveIP();
5945
5946 BasicBlock *EntryBB = Builder.GetInsertBlock();
5947 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5948 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5949 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5950
5951 // Emit thenBB and set the Builder's insertion point there for
5952 // body generation next. Place the block after the current block.
5953 Function *CurFn = EntryBB->getParent();
5954 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5955
5956 // Move Entry branch to end of ThenBB, and replace with conditional
5957 // branch (If-stmt)
5958 Instruction *EntryBBTI = EntryBB->getTerminator();
5959 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5960 EntryBBTI->removeFromParent();
5961 Builder.SetInsertPoint(UI);
5962 Builder.Insert(EntryBBTI);
5963 UI->eraseFromParent();
5964 Builder.SetInsertPoint(ThenBB->getTerminator());
5965
5966 // return an insertion point to ExitBB.
5967 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5968}
5969
5970OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5971 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5972 bool HasFinalize) {
5973
5974 Builder.restoreIP(FinIP);
5975
5976 // If there is finalization to do, emit it before the exit call
5977 if (HasFinalize) {
5978 assert(!FinalizationStack.empty() &&
5979 "Unexpected finalization stack state!");
5980
5981 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5982 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5983
5984 if (Error Err = Fi.FiniCB(FinIP))
5985 return Err;
5986
5987 BasicBlock *FiniBB = FinIP.getBlock();
5988 Instruction *FiniBBTI = FiniBB->getTerminator();
5989
5990 // set Builder IP for call creation
5991 Builder.SetInsertPoint(FiniBBTI);
5992 }
5993
5994 if (!ExitCall)
5995 return Builder.saveIP();
5996
5997 // Place the exit call as the last instruction before the finalization block terminator.
5998 ExitCall->removeFromParent();
5999 Builder.Insert(ExitCall);
6000
6001 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6002 ExitCall->getIterator());
6003}
6004
6005OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6006 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6007 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6008 if (!IP.isSet())
6009 return IP;
6010
6011 IRBuilder<>::InsertPointGuard IPG(Builder);
6012
6013 // creates the following CFG structure
6014 // OMP_Entry : (MasterAddr != PrivateAddr)?
6015 // F T
6016 // | \
6017 // | copyin.not.master
6018 // | /
6019 // v /
6020 // copyin.not.master.end
6021 // |
6022 // v
6023 // OMP.Entry.Next
6024
6025 BasicBlock *OMP_Entry = IP.getBlock();
6026 Function *CurFn = OMP_Entry->getParent();
6027 BasicBlock *CopyBegin =
6028 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6029 BasicBlock *CopyEnd = nullptr;
6030
6031 // If the entry block is terminated, split it to preserve the branch to the
6032 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6033 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6034 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6035 "copyin.not.master.end");
6036 OMP_Entry->getTerminator()->eraseFromParent();
6037 } else {
6038 CopyEnd =
6039 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6040 }
6041
6042 Builder.SetInsertPoint(OMP_Entry);
6043 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6044 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6045 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6046 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6047
6048 Builder.SetInsertPoint(CopyBegin);
6049 if (BranchtoEnd)
6050 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6051
6052 return Builder.saveIP();
6053}
6054
6055CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6056 Value *Size, Value *Allocator,
6057 std::string Name) {
6058 IRBuilder<>::InsertPointGuard IPG(Builder);
6059 updateToLocation(Loc);
6060
6061 uint32_t SrcLocStrSize;
6062 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6063 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6064 Value *ThreadId = getOrCreateThreadID(Ident);
6065 Value *Args[] = {ThreadId, Size, Allocator};
6066
6067 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6068
6069 return Builder.CreateCall(Fn, Args, Name);
6070}
6071
6072CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6073 Value *Addr, Value *Allocator,
6074 std::string Name) {
6075 IRBuilder<>::InsertPointGuard IPG(Builder);
6076 updateToLocation(Loc);
6077
6078 uint32_t SrcLocStrSize;
6079 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6080 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6081 Value *ThreadId = getOrCreateThreadID(Ident);
6082 Value *Args[] = {ThreadId, Addr, Allocator};
6083 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6084 return Builder.CreateCall(Fn, Args, Name);
6085}
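
// Usage sketch (for exposition): the two helpers above pair up for
// allocator-backed storage, e.g. privatizing through an OpenMP allocator:
//
//   CallInst *Ptr = OMPBuilder.createOMPAlloc(Loc, SizeVal, AllocatorVal,
//                                             "omp.priv");
//   // ... uses of Ptr ...
//   OMPBuilder.createOMPFree(Loc, Ptr, AllocatorVal, "");
//
// which lowers to __kmpc_alloc / __kmpc_free with the current thread id.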
6086
6087CallInst *OpenMPIRBuilder::createOMPInteropInit(
6088 const LocationDescription &Loc, Value *InteropVar,
6089 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6090 Value *DependenceAddress, bool HaveNowaitClause) {
6091 IRBuilder<>::InsertPointGuard IPG(Builder);
6092 updateToLocation(Loc);
6093
6094 uint32_t SrcLocStrSize;
6095 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6096 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6097 Value *ThreadId = getOrCreateThreadID(Ident);
6098 if (Device == nullptr)
6099     Device = ConstantInt::get(Int32, -1);
6100 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6101 if (NumDependences == nullptr) {
6102 NumDependences = ConstantInt::get(Int32, 0);
6103 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6104 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6105 }
6106 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6107 Value *Args[] = {
6108 Ident, ThreadId, InteropVar, InteropTypeVal,
6109 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6110
6111 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6112
6113 return Builder.CreateCall(Fn, Args);
6114}
6115
6116 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6117 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6118 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6119   IRBuilder<>::InsertPointGuard IPG(Builder);
6120 updateToLocation(Loc);
6121
6122 uint32_t SrcLocStrSize;
6123 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6124 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6125 Value *ThreadId = getOrCreateThreadID(Ident);
6126 if (Device == nullptr)
6127     Device = ConstantInt::get(Int32, -1);
6128 if (NumDependences == nullptr) {
6129 NumDependences = ConstantInt::get(Int32, 0);
6130 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6131 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6132 }
6133 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6134 Value *Args[] = {
6135 Ident, ThreadId, InteropVar, Device,
6136 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6137
6138 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6139
6140 return Builder.CreateCall(Fn, Args);
6141}
6142
6143 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6144 Value *InteropVar, Value *Device,
6145 Value *NumDependences,
6146 Value *DependenceAddress,
6147 bool HaveNowaitClause) {
6148   IRBuilder<>::InsertPointGuard IPG(Builder);
6149 updateToLocation(Loc);
6150 uint32_t SrcLocStrSize;
6151 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6152 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6153 Value *ThreadId = getOrCreateThreadID(Ident);
6154 if (Device == nullptr)
6155     Device = ConstantInt::get(Int32, -1);
6156 if (NumDependences == nullptr) {
6157 NumDependences = ConstantInt::get(Int32, 0);
6158 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6159 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6160 }
6161 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6162 Value *Args[] = {
6163 Ident, ThreadId, InteropVar, Device,
6164 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6165
6166 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6167
6168 return Builder.CreateCall(Fn, Args);
6169}
6170
6171 CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6172 const LocationDescription &Loc, llvm::Value *Pointer,
6173     llvm::ConstantInt *Size, const llvm::Twine &Name) {
6174   IRBuilder<>::InsertPointGuard IPG(Builder);
6175 updateToLocation(Loc);
6176
6177 uint32_t SrcLocStrSize;
6178 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6179 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6180 Value *ThreadId = getOrCreateThreadID(Ident);
6181 Constant *ThreadPrivateCache =
6182 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6183 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6184
6185 Function *Fn =
6186 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6187
6188 return Builder.CreateCall(Fn, Args);
6189}
6190
6191 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6192 const LocationDescription &Loc,
6193     const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6194 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6195 "expected num_threads and num_teams to be specified");
6196
6197 if (!updateToLocation(Loc))
6198 return Loc.IP;
6199
6200 uint32_t SrcLocStrSize;
6201 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6202 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6203 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6204 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6205 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6206 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6207 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6208
6209 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6210 Function *Kernel = DebugKernelWrapper;
6211
6212 // We need to strip the debug prefix to get the correct kernel name.
6213 StringRef KernelName = Kernel->getName();
6214 const std::string DebugPrefix = "_debug__";
6215 if (KernelName.ends_with(DebugPrefix)) {
6216 KernelName = KernelName.drop_back(DebugPrefix.length());
6217 Kernel = M.getFunction(KernelName);
6218 assert(Kernel && "Expected the real kernel to exist");
6219 }
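  // Note: despite its name, DebugPrefix is matched as a suffix here. For
  // example, a debug wrapper named "foo_debug__" resolves to the real kernel
  // "foo".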
6220
6221 // Manifest the launch configuration in the metadata matching the kernel
6222 // environment.
6223 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6224 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6225
6226   // If MaxThreads is not set, select the maximum of the default workgroup
6227   // size and the MinThreads value.
6228 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6229 if (MaxThreadsVal < 0)
6230 MaxThreadsVal = std::max(
6231 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6232
6233 if (MaxThreadsVal > 0)
6234 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6235
6236 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6237   Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6238 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6239 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6240 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6241 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6242
6243   Function *Fn = getOrCreateRuntimeFunctionPtr(
6244 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6245 const DataLayout &DL = Fn->getDataLayout();
6246
6247 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6248 Constant *DynamicEnvironmentInitializer =
6249 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6250 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6251 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6252 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6253 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6254 DL.getDefaultGlobalsAddressSpace());
6255 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6256
6257 Constant *DynamicEnvironment =
6258 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6259 ? DynamicEnvironmentGV
6260 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6261 DynamicEnvironmentPtr);
6262
6263 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6264 ConfigurationEnvironment, {
6265 UseGenericStateMachineVal,
6266 MayUseNestedParallelismVal,
6267 IsSPMDVal,
6268 MinThreads,
6269 MaxThreads,
6270 MinTeams,
6271 MaxTeams,
6272 ReductionDataSize,
6273 ReductionBufferLength,
6274 });
6275 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6276 KernelEnvironment, {
6277 ConfigurationEnvironmentInitializer,
6278 Ident,
6279 DynamicEnvironment,
6280 });
6281 std::string KernelEnvironmentName =
6282 (KernelName + "_kernel_environment").str();
6283 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6284 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6285 KernelEnvironmentInitializer, KernelEnvironmentName,
6286 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6287 DL.getDefaultGlobalsAddressSpace());
6288 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6289
6290 Constant *KernelEnvironment =
6291 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6292 ? KernelEnvironmentGV
6293 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6294 KernelEnvironmentPtr);
6295 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6296 CallInst *ThreadKind =
6297 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6298
6299 Value *ExecUserCode = Builder.CreateICmpEQ(
6300 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6301 "exec_user_code");
6302
6303 // ThreadKind = __kmpc_target_init(...)
6304 // if (ThreadKind == -1)
6305 // user_code
6306 // else
6307 // return;
6308
6309 auto *UI = Builder.CreateUnreachable();
6310 BasicBlock *CheckBB = UI->getParent();
6311 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6312
6313 BasicBlock *WorkerExitBB = BasicBlock::Create(
6314 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6315 Builder.SetInsertPoint(WorkerExitBB);
6316   Builder.CreateRetVoid();
6317
6318 auto *CheckBBTI = CheckBB->getTerminator();
6319 Builder.SetInsertPoint(CheckBBTI);
6320 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6321
6322 CheckBBTI->eraseFromParent();
6323 UI->eraseFromParent();
6324
6325 // Continue in the "user_code" block, see diagram above and in
6326   // openmp/libomptarget/deviceRTLs/common/include/target.h.
6327 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6328}
6329
6330 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6331 int32_t TeamsReductionDataSize,
6332 int32_t TeamsReductionBufferLength) {
6333 if (!updateToLocation(Loc))
6334 return;
6335
6336   Function *Fn = getOrCreateRuntimeFunctionPtr(
6337 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6338
6339 Builder.CreateCall(Fn, {});
6340
6341 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6342 return;
6343
6344   Function *Kernel = Builder.GetInsertBlock()->getParent();
6345 // We need to strip the debug prefix to get the correct kernel name.
6346 StringRef KernelName = Kernel->getName();
6347 const std::string DebugPrefix = "_debug__";
6348 if (KernelName.ends_with(DebugPrefix))
6349 KernelName = KernelName.drop_back(DebugPrefix.length());
6350 auto *KernelEnvironmentGV =
6351 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6352 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6353 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
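  // The insert-value indices below follow the initializer layout built in
  // createTargetInit: {0, 7} and {0, 8} address ReductionDataSize and
  // ReductionBufferLength, fields 7 and 8 of the ConfigurationEnvironment,
  // which is itself field 0 of the kernel environment.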
6354 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6355 KernelEnvironmentInitializer,
6356 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6357 NewInitializer = ConstantFoldInsertValueInstruction(
6358 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6359 {0, 8});
6360 KernelEnvironmentGV->setInitializer(NewInitializer);
6361}
6362
6363 static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6364 Module &M = *Kernel.getParent();
6365 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6366 for (auto *Op : MD->operands()) {
6367 if (Op->getNumOperands() != 3)
6368 continue;
6369 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6370 if (!KernelOp || KernelOp->getValue() != &Kernel)
6371 continue;
6372 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6373 if (!Prop || Prop->getString() != Name)
6374 continue;
6375 return Op;
6376 }
6377 return nullptr;
6378}
6379
6380 static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
6381 bool Min) {
6382   // Update the named NVPTX metadata entry (e.g. "maxntidx"), or add it.
6383 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6384 if (ExistingOp) {
6385 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6386 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6387 ExistingOp->replaceOperandWith(
6388 2, ConstantAsMetadata::get(ConstantInt::get(
6389 OldVal->getValue()->getType(),
6390 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6391 } else {
6392 LLVMContext &Ctx = Kernel.getContext();
6393     Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6394 MDString::get(Ctx, Name),
6395                           ConstantAsMetadata::get(
6396 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6397 // Append metadata to nvvm.annotations
6398 Module &M = *Kernel.getParent();
6399 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6400 MD->addOperand(MDNode::get(Ctx, MDVals));
6401 }
6402}
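// For example, after a call with Name == "maxntidx" and Value == 128, the
// module metadata looks roughly like (sketch):
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @kernel, !"maxntidx", i32 128}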
6403
6404std::pair<int32_t, int32_t>
6405 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6406 int32_t ThreadLimit =
6407 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6408
6409 if (T.isAMDGPU()) {
6410 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6411 if (!Attr.isValid() || !Attr.isStringAttribute())
6412 return {0, ThreadLimit};
6413 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6414 int32_t LB, UB;
6415 if (!llvm::to_integer(UBStr, UB, 10))
6416 return {0, ThreadLimit};
6417 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6418 if (!llvm::to_integer(LBStr, LB, 10))
6419 return {0, UB};
6420 return {LB, UB};
6421 }
6422
6423 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6424 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6425 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6426 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6427 }
6428 return {0, ThreadLimit};
6429}
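// For example (hypothetical values): an "amdgpu-flat-work-group-size" of
// "1,256" combined with an "omp_target_thread_limit" of 128 yields {1, 128},
// since the upper bound is clamped by the thread limit.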
6430
6431 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6432 Function &Kernel, int32_t LB,
6433 int32_t UB) {
6434 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6435
6436 if (T.isAMDGPU()) {
6437 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6438 llvm::utostr(LB) + "," + llvm::utostr(UB));
6439 return;
6440 }
6441
6442 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6443}
6444
6445std::pair<int32_t, int32_t>
6446 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &T, Function &Kernel) {
6447 // TODO: Read from backend annotations if available.
6448 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6449}
6450
6451 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6452 int32_t LB, int32_t UB) {
6453 if (T.isNVPTX())
6454 if (UB > 0)
6455 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6456 if (T.isAMDGPU())
6457 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6458
6459 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6460}
6461
6462void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6463 Function *OutlinedFn) {
6464 if (Config.isTargetDevice()) {
6465     OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6466 // TODO: Determine if DSO local can be set to true.
6467 OutlinedFn->setDSOLocal(false);
6468     OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6469 if (T.isAMDGCN())
6470       OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6471 else if (T.isNVPTX())
6472       OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
6473 }
6474}
6475
6476Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6477 StringRef EntryFnIDName) {
6478 if (Config.isTargetDevice()) {
6479 assert(OutlinedFn && "The outlined function must exist if embedded");
6480 return OutlinedFn;
6481 }
6482
6483 return new GlobalVariable(
6484 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6485 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6486}
6487
6488Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6489 StringRef EntryFnName) {
6490 if (OutlinedFn)
6491 return OutlinedFn;
6492
6493 assert(!M.getGlobalVariable(EntryFnName, true) &&
6494 "Named kernel already exists?");
6495 return new GlobalVariable(
6496 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6497 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6498}
6499
6500 Error OpenMPIRBuilder::emitTargetRegionFunction(
6501 TargetRegionEntryInfo &EntryInfo,
6502 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6503 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6504
6505 SmallString<64> EntryFnName;
6506 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6507
6508   if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
6509 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6510 if (!CBResult)
6511 return CBResult.takeError();
6512 OutlinedFn = *CBResult;
6513 } else {
6514 OutlinedFn = nullptr;
6515 }
6516
6517 // If this target outline function is not an offload entry, we don't need to
6518   // register it. This may be the case for a false if clause, or if there are
6519   // no OpenMP targets.
6520 if (!IsOffloadEntry)
6521 return Error::success();
6522
6523 std::string EntryFnIDName =
6524       Config.isTargetDevice()
6525 ? std::string(EntryFnName)
6526 : createPlatformSpecificName({EntryFnName, "region_id"});
6527
6528 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6529 EntryFnName, EntryFnIDName);
6530 return Error::success();
6531}
6532
6533 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
6534 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6535 StringRef EntryFnName, StringRef EntryFnIDName) {
6536 if (OutlinedFn)
6537 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6538 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6539 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6540   OffloadInfoManager.registerTargetRegionEntryInfo(
6541 EntryInfo, EntryAddr, OutlinedFnID,
6542       OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6543 return OutlinedFnID;
6544}
6545
6546 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
6547 const LocationDescription &Loc, InsertPointTy AllocaIP,
6548 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6549 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6550 omp::RuntimeFunction *MapperFunc,
6551     function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
6552 BodyGenTy BodyGenType)>
6553 BodyGenCB,
6554 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6555 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6556 if (!updateToLocation(Loc))
6557 return InsertPointTy();
6558
6559 Builder.restoreIP(CodeGenIP);
6560   // Disable TargetData CodeGen for the device pass.
6561 if (Config.IsTargetDevice.value_or(false)) {
6562 if (BodyGenCB) {
6563 InsertPointOrErrorTy AfterIP =
6564 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6565 if (!AfterIP)
6566 return AfterIP.takeError();
6567 Builder.restoreIP(*AfterIP);
6568 }
6569 return Builder.saveIP();
6570 }
6571
6572 bool IsStandAlone = !BodyGenCB;
6573 MapInfosTy *MapInfo;
6574 // Generate the code for the opening of the data environment. Capture all the
6575 // arguments of the runtime call by reference because they are used in the
6576 // closing of the region.
6577 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6578 InsertPointTy CodeGenIP) -> Error {
6579 MapInfo = &GenMapInfoCB(Builder.saveIP());
6580 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6581 /*IsNonContiguous=*/true, DeviceAddrCB,
6582 CustomMapperCB);
6583
6584 TargetDataRTArgs RTArgs;
6585     emitOffloadingArraysArgument(Builder, RTArgs, Info);
6586
6587 // Emit the number of elements in the offloading arrays.
6588 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6589
6590 // Source location for the ident struct
6591 if (!SrcLocInfo) {
6592 uint32_t SrcLocStrSize;
6593 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6594 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6595 }
6596
6597 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6598 SrcLocInfo, DeviceID,
6599 PointerNum, RTArgs.BasePointersArray,
6600 RTArgs.PointersArray, RTArgs.SizesArray,
6601 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6602 RTArgs.MappersArray};
6603
6604 if (IsStandAlone) {
6605 assert(MapperFunc && "MapperFunc missing for standalone target data");
6606
6607 auto TaskBodyCB = [&](Value *, Value *,
6608                             IRBuilderBase::InsertPoint) -> Error {
6609 if (Info.HasNoWait) {
6610 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6611                                  llvm::Constant::getNullValue(VoidPtr),
6612                                  llvm::Constant::getNullValue(Int32),
6613                                  llvm::Constant::getNullValue(VoidPtr)});
6614 }
6615
6616         Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6617 OffloadingArgs);
6618
6619 if (Info.HasNoWait) {
6620 BasicBlock *OffloadContBlock =
6621 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6622           Function *CurFn = Builder.GetInsertBlock()->getParent();
6623 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6625 }
6626 return Error::success();
6627 };
6628
6629 bool RequiresOuterTargetTask = Info.HasNoWait;
6630 if (!RequiresOuterTargetTask)
6631 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6632 /*TargetTaskAllocaIP=*/{}));
6633 else
6634 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6635 /*Dependencies=*/{}, Info.HasNoWait));
6636 } else {
6637 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6638 omp::OMPRTL___tgt_target_data_begin_mapper);
6639
6640 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6641
6642 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6643 if (isa<AllocaInst>(DeviceMap.second.second)) {
6644 auto *LI =
6645 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6646 Builder.CreateStore(LI, DeviceMap.second.second);
6647 }
6648 }
6649
6650 // If device pointer privatization is required, emit the body of the
6651 // region here. It will have to be duplicated: with and without
6652 // privatization.
6653 InsertPointOrErrorTy AfterIP =
6654 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6655 if (!AfterIP)
6656 return AfterIP.takeError();
6657 Builder.restoreIP(*AfterIP);
6658 }
6659 return Error::success();
6660 };
6661
6662 // If we need device pointer privatization, we need to emit the body of the
6663 // region with no privatization in the 'else' branch of the conditional.
6664 // Otherwise, we don't have to do anything.
6665 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6666 InsertPointTy CodeGenIP) -> Error {
6667 InsertPointOrErrorTy AfterIP =
6668 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6669 if (!AfterIP)
6670 return AfterIP.takeError();
6671 Builder.restoreIP(*AfterIP);
6672 return Error::success();
6673 };
6674
6675 // Generate code for the closing of the data region.
6676 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6677 TargetDataRTArgs RTArgs;
6678 Info.EmitDebug = !MapInfo->Names.empty();
6679 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6680
6681 // Emit the number of elements in the offloading arrays.
6682 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6683
6684 // Source location for the ident struct
6685 if (!SrcLocInfo) {
6686 uint32_t SrcLocStrSize;
6687 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6688 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6689 }
6690
6691 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6692 PointerNum, RTArgs.BasePointersArray,
6693 RTArgs.PointersArray, RTArgs.SizesArray,
6694 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6695 RTArgs.MappersArray};
6696 Function *EndMapperFunc =
6697 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6698
6699 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6700 return Error::success();
6701 };
6702
6703 // We don't have to do anything to close the region if the if clause evaluates
6704 // to false.
6705 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6706 return Error::success();
6707 };
6708
6709 Error Err = [&]() -> Error {
6710 if (BodyGenCB) {
6711 Error Err = [&]() {
6712 if (IfCond)
6713 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6714 return BeginThenGen(AllocaIP, Builder.saveIP());
6715 }();
6716
6717 if (Err)
6718 return Err;
6719
6720 // If we don't require privatization of device pointers, we emit the body
6721 // in between the runtime calls. This avoids duplicating the body code.
6722 InsertPointOrErrorTy AfterIP =
6723 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6724 if (!AfterIP)
6725 return AfterIP.takeError();
6726 Builder.restoreIP(*AfterIP);
6727
6728 if (IfCond)
6729 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6730 return EndThenGen(AllocaIP, Builder.saveIP());
6731 }
6732 if (IfCond)
6733 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6734 return BeginThenGen(AllocaIP, Builder.saveIP());
6735 }();
6736
6737 if (Err)
6738 return Err;
6739
6740 return Builder.saveIP();
6741}
6742
6743 Function *OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
6744                                                        bool IVSigned,
6745 bool IsGPUDistribute) {
6746 assert((IVSize == 32 || IVSize == 64) &&
6747 "IV size is not compatible with the omp runtime");
6748   RuntimeFunction Name;
6749 if (IsGPUDistribute)
6750 Name = IVSize == 32
6751 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6752 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6753 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6754 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6755 else
6756 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6757 : omp::OMPRTL___kmpc_for_static_init_4u)
6758 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6759 : omp::OMPRTL___kmpc_for_static_init_8u);
6760
6761   return getOrCreateRuntimeFunction(M, Name);
6762}
6763
6764 Function *OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6765 bool IVSigned) {
6766 assert((IVSize == 32 || IVSize == 64) &&
6767 "IV size is not compatible with the omp runtime");
6768 RuntimeFunction Name = IVSize == 32
6769 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6770 : omp::OMPRTL___kmpc_dispatch_init_4u)
6771 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6772 : omp::OMPRTL___kmpc_dispatch_init_8u);
6773
6774   return getOrCreateRuntimeFunction(M, Name);
6775}
6776
6777 Function *OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6778 bool IVSigned) {
6779 assert((IVSize == 32 || IVSize == 64) &&
6780 "IV size is not compatible with the omp runtime");
6781 RuntimeFunction Name = IVSize == 32
6782 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6783 : omp::OMPRTL___kmpc_dispatch_next_4u)
6784 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6785 : omp::OMPRTL___kmpc_dispatch_next_8u);
6786
6787   return getOrCreateRuntimeFunction(M, Name);
6788}
6789
6790 Function *OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6791 bool IVSigned) {
6792 assert((IVSize == 32 || IVSize == 64) &&
6793 "IV size is not compatible with the omp runtime");
6794 RuntimeFunction Name = IVSize == 32
6795 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6796 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6797 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6798 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6799
6800   return getOrCreateRuntimeFunction(M, Name);
6801}
6802
6803 Function *OpenMPIRBuilder::createDispatchDeinitFunction() {
6804 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6805}
6806
6807 static Expected<Function *> createOutlinedFunction(
6808 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6809     const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
6810 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
6811     OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6812     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6813 SmallVector<Type *> ParameterTypes;
6814 if (OMPBuilder.Config.isTargetDevice()) {
6815 // Add the "implicit" runtime argument we use to provide launch specific
6816 // information for target devices.
6817 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6818 ParameterTypes.push_back(Int8PtrTy);
6819
6820 // All parameters to target devices are passed as pointers
6821 // or i64. This assumes 64-bit address spaces/pointers.
6822 for (auto &Arg : Inputs)
6823 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6824 ? Arg->getType()
6825 : Type::getInt64Ty(Builder.getContext()));
6826 } else {
6827 for (auto &Arg : Inputs)
6828 ParameterTypes.push_back(Arg->getType());
6829 }
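  // For example (illustrative): when targeting a device, a host i32 input is
  // passed through an i64 parameter, while pointer inputs keep their type.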
6830
6831 auto BB = Builder.GetInsertBlock();
6832 auto M = BB->getModule();
6833 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6834 /*isVarArg*/ false);
6835 auto Func =
6836 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6837
6838 // Forward target-cpu and target-features function attributes from the
6839 // original function to the new outlined function.
6840 Function *ParentFn = Builder.GetInsertBlock()->getParent();
6841
6842 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
6843 if (TargetCpuAttr.isStringAttribute())
6844 Func->addFnAttr(TargetCpuAttr);
6845
6846 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
6847 if (TargetFeaturesAttr.isStringAttribute())
6848 Func->addFnAttr(TargetFeaturesAttr);
6849
6850 if (OMPBuilder.Config.isTargetDevice()) {
6851 Value *ExecMode =
6852 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
6853 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
6854 }
6855
6856 // Save insert point.
6857 IRBuilder<>::InsertPointGuard IPG(Builder);
6858   // If there's a DISubprogram associated with the current function, then
6859   // generate one for the outlined function.
6860 if (Function *ParentFunc = BB->getParent()) {
6861 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6862 DICompileUnit *CU = SP->getUnit();
6863 DIBuilder DB(*M, true, CU);
6864       DebugLoc DL = Builder.getCurrentDebugLocation();
6865 if (DL) {
6866 // TODO: We are using nullopt for arguments at the moment. This will
6867 // need to be updated when debug data is being generated for variables.
6868 DISubroutineType *Ty =
6869 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6870 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6871 DISubprogram::SPFlagOptimized |
6872 DISubprogram::SPFlagLocalToUnit;
6873
6874 DISubprogram *OutlinedSP = DB.createFunction(
6875 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6876 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6877
6878 // Attach subprogram to the function.
6879 Func->setSubprogram(OutlinedSP);
6880       // Update the CurrentDebugLocation in the builder so that the right
6881       // scope is used for things inside the outlined function.
6882       Builder.SetCurrentDebugLocation(
6883 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6884 OutlinedSP, DL.getInlinedAt()));
6885 }
6886 }
6887 }
6888
6889 // Generate the region into the function.
6890 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6891 Builder.SetInsertPoint(EntryBB);
6892
6893 // Insert target init call in the device compilation pass.
6894 if (OMPBuilder.Config.isTargetDevice())
6895 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
6896
6897 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6898
6899   // As we embed the user code in the middle of our target region after we
6900   // generate entry code, we must move what allocas we can into the entry
6901   // block to avoid possibly breaking optimizations for the device.
6902 if (OMPBuilder.Config.isTargetDevice())
6903     OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6904
6905 // Insert target deinit call in the device compilation pass.
6906 BasicBlock *OutlinedBodyBB =
6907 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
6908   llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
6909 Builder.saveIP(),
6910 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6911 if (!AfterIP)
6912 return AfterIP.takeError();
6913 Builder.restoreIP(*AfterIP);
6914 if (OMPBuilder.Config.isTargetDevice())
6915 OMPBuilder.createTargetDeinit(Builder);
6916
6917 // Insert return instruction.
6918 Builder.CreateRetVoid();
6919
6920 // New Alloca IP at entry point of created device function.
6921 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
6922 auto AllocaIP = Builder.saveIP();
6923
6924 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6925
6926 // Skip the artificial dyn_ptr on the device.
6927 const auto &ArgRange =
6928 OMPBuilder.Config.isTargetDevice()
6929 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6930 : Func->args();
6931
6932 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6933     // Things like GEPs can come in the form of Constants. Constants and
6934     // ConstantExprs do not know what they are contained in, so we must dig
6935     // a little to find an instruction that tells us whether they are used
6936     // inside of the function we are outlining. We also replace the original
6937     // constant expression with an equivalent instruction, because an
6938     // instruction allows easy modification in the following loop: we then
6939     // know the constant (now an instruction) is owned by our target
6940     // function, so replaceUsesOfWith can be invoked on it (this cannot be
6941     // done with constants). A brand new instruction is also the cautious
6942     // choice, since the old expression could conceivably be used inside the
6943     // function while also existing and being used externally (unlikely by
6944     // the nature of a Constant, but possible).
6945     // NOTE: We cannot remove dead constants that have been rewritten to
6946     // instructions at this stage; doing so risks breaking later lowering,
6947     // because we may still be lowering the module from MLIR to LLVM-IR and
6948     // the MLIR lowering may still require the original constants we have
6949     // created rewritten versions of.
6950 if (auto *Const = dyn_cast<Constant>(Input))
6951 convertUsersOfConstantsToInstructions(Const, Func, false);
6952
6953     // Replace all uses of Input with InputCopy inside the outlined function.
6954 for (User *User : make_early_inc_range(Input->users()))
6955 if (auto *Instr = dyn_cast<Instruction>(User))
6956 if (Instr->getFunction() == Func)
6957 Instr->replaceUsesOfWith(Input, InputCopy);
6958 };
6959
6960 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6961
6962   // Rewrite uses of input values to parameters.
6963 for (auto InArg : zip(Inputs, ArgRange)) {
6964 Value *Input = std::get<0>(InArg);
6965 Argument &Arg = std::get<1>(InArg);
6966 Value *InputCopy = nullptr;
6967
6968     llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
6969 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
6970 if (!AfterIP)
6971 return AfterIP.takeError();
6972 Builder.restoreIP(*AfterIP);
6973
6974     // In certain cases a Global may be set up for replacement; however, this
6975     // Global may be used in multiple arguments to the kernel, just segmented
6976     // apart. For example, if we have a global array that is sectioned into
6977     // multiple mappings (technically not legal in OpenMP, but necessary for
6978     // Fortran Common Blocks), we will end up with GEPs into this array
6979     // inside the kernel that refer to the Global but are, for all intents
6980     // and purposes, separate arguments to the kernel. If we have mapped a
6981     // segment that requires a GEP into the 0-th index, it will fold into a
6982     // direct reference to the Global. If we then encounter this folded GEP
6983     // during replacement, all of the references to the Global in the kernel
6984     // will be replaced with the argument we have generated that corresponds
6985     // to it, including any other GEPs that refer to the Global and that may
6986     // be other arguments. This would invalidate all of the other preceding
6987     // mapped arguments that refer to the same Global as separate segments.
6988     // To prevent this, we defer processing of Globals until all other
6989     // processing has been performed.
6990     // A single isa<GlobalValue> check suffices here: GlobalObject and
6991     // GlobalVariable are both subclasses of GlobalValue.
6992     if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg))) {
6993 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6994 continue;
6995 }
6996
6997 ReplaceValue(Input, InputCopy, Func);
6998 }
6999
7000 // Replace all of our deferred Input values, currently just Globals.
7001 for (auto Deferred : DeferredReplacement)
7002 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7003
7004 return Func;
7005}
7006
7007/// Create an entry point for a target task. The entry point has the following
7008/// signature:
7009///   void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7010/// This function is called from emitTargetTask once the
7011/// code to launch the target kernel has been outlined already.
7012 static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
7013 IRBuilderBase &Builder,
7014 CallInst *StaleCI) {
7015 Module &M = OMPBuilder.M;
7016 // KernelLaunchFunction is the target launch function, i.e.
7017 // the function that sets up kernel arguments and calls
7018 // __tgt_target_kernel to launch the kernel on the device.
7019 //
7020 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7021
7022 // StaleCI is the CallInst which is the call to the outlined
7023 // target kernel launch function. If there are values that the
7024 // outlined function uses then these are aggregated into a structure
7025 // which is passed as the second argument. If not, then there's
7026 // only one argument, the threadID. So, StaleCI can be
7027 //
7028 // %structArg = alloca { ptr, ptr }, align 8
7029 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7030 // store ptr %20, ptr %gep_, align 8
7031 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7032 // store ptr %21, ptr %gep_8, align 8
7033 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7034 //
7035 // OR
7036 //
7037 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7038   OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7039 StaleCI->getIterator());
7040 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7041 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7042 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7043 Type *TaskTy = OMPBuilder.Task;
7044 auto ProxyFnTy =
7045 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7046 /* isVarArg */ false);
7047 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7048 ".omp_target_task_proxy_func",
7049 Builder.GetInsertBlock()->getModule());
7050 ProxyFn->getArg(0)->setName("thread.id");
7051 ProxyFn->getArg(1)->setName("task");
7052
7053 BasicBlock *EntryBB =
7054 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7055 Builder.SetInsertPoint(EntryBB);
7056
7057 bool HasShareds = StaleCI->arg_size() > 1;
7058 // TODO: This is a temporary assert to prove to ourselves that
7059 // the outlined target launch function is always going to have
7060   // at most two arguments if there is any data shared between
7061 // host and device.
7062 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
7063 "StaleCI with shareds should have exactly two arguments.");
7064 if (HasShareds) {
7065 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7066 assert(ArgStructAlloca &&
7067 "Unable to find the alloca instruction corresponding to arguments "
7068 "for extracted function");
7069 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7070
7071 AllocaInst *NewArgStructAlloca =
7072 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7073 Value *TaskT = ProxyFn->getArg(1);
7074 Value *ThreadId = ProxyFn->getArg(0);
7075 Value *SharedsSize =
7076 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7077
7078 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7079 LoadInst *LoadShared =
7080 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7081
7082 Builder.CreateMemCpy(
7083 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7084 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7085
7086 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7087 }
7088 Builder.CreateRetVoid();
7089 return ProxyFn;
7090}
7091
7092 static Error emitTargetOutlinedFunction(
7093 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7094 TargetRegionEntryInfo &EntryInfo,
7095     const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7096 Function *&OutlinedFn, Constant *&OutlinedFnID,
7097     SmallVectorImpl<Value *> &Inputs,
7098     OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7099     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7100
7101 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7102 [&](StringRef EntryFnName) {
7103 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7104 EntryFnName, Inputs, CBFunc,
7105 ArgAccessorFuncCB);
7106 };
7107
7108 return OMPBuilder.emitTargetRegionFunction(
7109 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7110 OutlinedFnID);
7111}
7112
7113 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7114 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7115     OpenMPIRBuilder::InsertPointTy AllocaIP,
7116     const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7117 bool HasNoWait) {
7118
7119   // The following explains the code-gen scenario for the `target` directive. A
7120   // similar scenario is followed for other device-related directives (e.g.
7121   // `target enter data`), since we only need to emit a task that encapsulates
7122   // the proper runtime call.
7123 //
7124 // When we arrive at this function, the target region itself has been
7125 // outlined into the function OutlinedFn.
7126   // So at this point, for
7127 // --------------------------------------------------
7128 // void user_code_that_offloads(...) {
7129 // omp target depend(..) map(from:a) map(to:b, c)
7130 // a = b + c
7131 // }
7132 //
7133 // --------------------------------------------------
7134 //
7135 // we have
7136 //
7137 // --------------------------------------------------
7138 //
7139 // void user_code_that_offloads(...) {
7140 // %.offload_baseptrs = alloca [3 x ptr], align 8
7141 // %.offload_ptrs = alloca [3 x ptr], align 8
7142 // %.offload_mappers = alloca [3 x ptr], align 8
7143 // ;; target region has been outlined and now we need to
7144 // ;; offload to it via a target task.
7145 // }
7146 // void outlined_device_function(ptr a, ptr b, ptr c) {
7147 // *a = *b + *c
7148 // }
7149 //
7150 // We have to now do the following
7151 // (i) Make an offloading call to outlined_device_function using the OpenMP
7152 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7153 // emitted by emitKernelLaunch
7154 // (ii) Create a task entry point function that calls kernel_launch_function
7155 // and is the entry point for the target task. See
7156 // '@.omp_target_task_proxy_func in the pseudocode below.
7157 // (iii) Create a task with the task entry point created in (ii)
7158 //
7159 // That is we create the following
7160 //
7161 // void user_code_that_offloads(...) {
7162 // %.offload_baseptrs = alloca [3 x ptr], align 8
7163 // %.offload_ptrs = alloca [3 x ptr], align 8
7164 // %.offload_mappers = alloca [3 x ptr], align 8
7165 //
7166 // %structArg = alloca { ptr, ptr, ptr }, align 8
7167   //   %structArg[0] = %.offload_baseptrs
7168   //   %structArg[1] = %.offload_ptrs
7169   //   %structArg[2] = %.offload_mappers
7170 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7171 // @.omp_target_task_proxy_func)
7172 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7173 // dependencies_array = ...
7174 // ;; if nowait not present
7175 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7176 // call @__kmpc_omp_task_begin_if0(...)
7177   //     call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
7178   //     call @__kmpc_omp_task_complete_if0(...)
7179 // }
7180 //
7181 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7182 // ptr %task) {
7183 // %structArg = alloca {ptr, ptr, ptr}
7184 // %shared_data = load (getelementptr %task, 0, 0)
7185   //   memcpy(%structArg, %shared_data, sizeof(structArg))
7186 // kernel_launch_function(%thread.id, %structArg)
7187 // }
7188 //
7189 // We need the proxy function because the signature of the task entry point
7190 // expected by kmpc_omp_task is always the same and will be different from
7191 // that of the kernel_launch function.
7192 //
7193 // kernel_launch_function is generated by emitKernelLaunch and has the
7194 // always_inline attribute.
7195 // void kernel_launch_function(thread_id,
7196 // structArg) alwaysinline {
7197 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7198 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7199 // offload_ptrs = load(getelementptr structArg, 0, 1)
7200 // offload_mappers = load(getelementptr structArg, 0, 2)
7201 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7202 // ; offload_mappers
7203 // call i32 @__tgt_target_kernel(...,
7204 // outlined_device_function,
7205 // ptr %kernel_args)
7206 // }
7207 // void outlined_device_function(ptr a, ptr b, ptr c) {
7208 // *a = *b + *c
7209 // }
7210 //
7211 BasicBlock *TargetTaskBodyBB =
7212 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7213 BasicBlock *TargetTaskAllocaBB =
7214 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7215
7216 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7217 TargetTaskAllocaBB->begin());
7218 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7219
7220 OutlineInfo OI;
7221 OI.EntryBB = TargetTaskAllocaBB;
7222 OI.OuterAllocaBB = AllocaIP.getBlock();
7223
7224 // Add the thread ID argument.
7225   SmallVector<Instruction *, 4> ToBeDeleted;
7226   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7227 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7228
7229 Builder.restoreIP(TargetTaskBodyIP);
7230
7231 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7232 return Err;
7233
7234 OI.ExitBB = Builder.saveIP().getBlock();
7235 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7236 DeviceID](Function &OutlinedFn) mutable {
7237 assert(OutlinedFn.getNumUses() == 1 &&
7238 "there must be a single user for the outlined function");
7239
7240 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7241 bool HasShareds = StaleCI->arg_size() > 1;
7242
7243 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7244
7245 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7246 << "\n");
7247
7248 Builder.SetInsertPoint(StaleCI);
7249
7250 // Gather the arguments for emitting the runtime call.
7251 uint32_t SrcLocStrSize;
7252 Constant *SrcLocStr =
7253         getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
7254 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7255
7256 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7257 //
7258 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7259     // the DeviceID to the deferred task, and because
7260 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7261 bool NeedsTargetTask = HasNoWait && DeviceID;
7262 Function *TaskAllocFn =
7263 !NeedsTargetTask
7264 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7265             : getOrCreateRuntimeFunctionPtr(
7266 OMPRTL___kmpc_omp_target_task_alloc);
7267
7268 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7269 // call.
7270 Value *ThreadID = getOrCreateThreadID(Ident);
7271
7272 // Argument - `sizeof_kmp_task_t` (TaskSize)
7273     // TaskSize refers to the size in bytes of the kmp_task_t data structure,
7274     // including private vars accessed in the task.
7275 // TODO: add kmp_task_t_with_privates (privates)
7276 Value *TaskSize =
7277         Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
7278
7279 // Argument - `sizeof_shareds` (SharedsSize)
7280 // SharedsSize refers to the shareds array size in the kmp_task_t data
7281 // structure.
7282 Value *SharedsSize = Builder.getInt64(0);
7283 if (HasShareds) {
7284 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7285 assert(ArgStructAlloca &&
7286 "Unable to find the alloca instruction corresponding to arguments "
7287 "for extracted function");
7288 auto *ArgStructType =
7289 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7290 assert(ArgStructType && "Unable to find struct type corresponding to "
7291 "arguments for extracted function");
7292 SharedsSize =
7293           Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7294 }
7295
7296 // Argument - `flags`
7297 // Task is tied iff (Flags & 1) == 1.
7298 // Task is untied iff (Flags & 1) == 0.
7299 // Task is final iff (Flags & 2) == 2.
7300 // Task is not final iff (Flags & 2) == 0.
7301 // A target task is not final and is untied.
7302     Value *Flags = Builder.getInt32(0);
7303
7304 // Emit the @__kmpc_omp_task_alloc runtime call
7305 // The runtime call returns a pointer to an area where the task captured
7306 // variables must be copied before the task is run (TaskData)
7307 CallInst *TaskData = nullptr;
7308
7309 SmallVector<llvm::Value *> TaskAllocArgs = {
7310 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7311 /*flags=*/Flags,
7312 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7313 /*task_func=*/ProxyFn};
7314
7315 if (NeedsTargetTask) {
7316 assert(DeviceID && "Expected non-empty device ID.");
7317 TaskAllocArgs.push_back(DeviceID);
7318 }
7319
7320 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7321
7322 if (HasShareds) {
7323 Value *Shareds = StaleCI->getArgOperand(1);
7324 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7325 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7326 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7327 SharedsSize);
7328 }
7329
7330 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7331
7332 // ---------------------------------------------------------------
7333 // V5.2 13.8 target construct
7334 // If the nowait clause is present, execution of the target task
7335 // may be deferred. If the nowait clause is not present, the target task is
7336 // an included task.
7337 // ---------------------------------------------------------------
7338 // The above means that the lack of a nowait on the target construct
7339 // translates to '#pragma omp task if(0)'
7340 if (!NeedsTargetTask) {
7341 if (DepArray) {
7342 Function *TaskWaitFn =
7343 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7344         Builder.CreateCall(
7345 TaskWaitFn,
7346 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7347 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7348 /*dep_list=*/DepArray,
7349 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7350 /*noalias_dep_list=*/
7351            ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7352 }
7353 // Included task.
7354 Function *TaskBeginFn =
7355 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7356 Function *TaskCompleteFn =
7357 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7358 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7359 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7360 CI->setDebugLoc(StaleCI->getDebugLoc());
7361 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7362 } else if (DepArray) {
7363 // HasNoWait - meaning the task may be deferred. Call
7364 // __kmpc_omp_task_with_deps if there are dependencies,
7365 // else call __kmpc_omp_task
7366 Function *TaskFn =
7367 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7368       Builder.CreateCall(
7369 TaskFn,
7370 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7371 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7372            ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7373 } else {
7374 // Emit the @__kmpc_omp_task runtime call to spawn the task
7375 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7376 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7377 }
7378
7379 StaleCI->eraseFromParent();
7380 for (Instruction *I : llvm::reverse(ToBeDeleted))
7381 I->eraseFromParent();
7382 };
7383 addOutlineInfo(std::move(OI));
7384
7385 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7386 << *(Builder.GetInsertBlock()) << "\n");
7387 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7388                     << *(Builder.GetInsertBlock()->getParent()->getParent())
7389 << "\n");
7390 return Builder.saveIP();
7391}
7392
7394 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7395 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7396 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7397 function_ref<Value *(unsigned int)> CustomMapperCB) {
7398 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7399 DeviceAddrCB, CustomMapperCB);
7400 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7401}
7402
7403static void
7404 emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7405                OpenMPIRBuilder::InsertPointTy AllocaIP,
7406                const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7407                const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
7408 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
7409                SmallVectorImpl<Value *> &Args,
7410                OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7411                SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {},
7412 bool HasNoWait = false) {
7413 // Generate a function call to the host fallback implementation of the target
7414 // region. This is called by the host when no offload entry was generated for
7415 // the target region and when the offloading call fails at runtime.
7416 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7417       -> OpenMPIRBuilder::InsertPointOrErrorTy {
7418 Builder.restoreIP(IP);
7419 Builder.CreateCall(OutlinedFn, Args);
7420 return Builder.saveIP();
7421 };
7422
7423 bool HasDependencies = Dependencies.size() > 0;
7424 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7425
7426   OpenMPIRBuilder::TargetKernelArgs KArgs;
7427
7428 auto TaskBodyCB =
7429 [&](Value *DeviceID, Value *RTLoc,
7430 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7431 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7432 // produce any.
7433     OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7434 // emitKernelLaunch makes the necessary runtime call to offload the
7435 // kernel. We then outline all that code into a separate function
7436 // ('kernel_launch_function' in the pseudo code above). This function is
7437 // then called by the target task proxy function (see
7438 // '@.omp_target_task_proxy_func' in the pseudo code above)
7439 // "@.omp_target_task_proxy_func' is generated by
7440 // emitTargetTaskProxyFunction.
7441 if (OutlinedFnID && DeviceID)
7442 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7443 EmitTargetCallFallbackCB, KArgs,
7444 DeviceID, RTLoc, TargetTaskAllocaIP);
7445
7446 // We only need to do the outlining if `DeviceID` is set to avoid calling
7447 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
7448 // generating the `else` branch of an `if` clause.
7449 //
7450 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7451 // In this case, we execute the host implementation directly.
7452 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7453 }());
7454
7455 OMPBuilder.Builder.restoreIP(AfterIP);
7456 return Error::success();
7457 };
7458
7459 auto &&EmitTargetCallElse =
7460 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7461           OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
7462 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7463 // produce any.
7464     OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7465 if (RequiresOuterTargetTask) {
7466 // Arguments that are intended to be directly forwarded to an
7467       // emitKernelLaunch call are passed as nullptr, since
7468 // OutlinedFnID=nullptr results in that call not being done.
7469 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7470 /*RTLoc=*/nullptr, AllocaIP,
7471 Dependencies, HasNoWait);
7472 }
7473 return EmitTargetCallFallbackCB(Builder.saveIP());
7474 }());
7475
7476 Builder.restoreIP(AfterIP);
7477 return Error::success();
7478 };
7479
7480 auto &&EmitTargetCallThen =
7481 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7482           OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
7483     OpenMPIRBuilder::TargetDataInfo Info(
7484 /*RequiresDevicePointerInfo=*/false,
7485 /*SeparateBeginEndCalls=*/true);
7486
7487 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7488     OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7489 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7490 RTArgs, MapInfo,
7491 /*IsNonContiguous=*/true,
7492 /*ForEndCall=*/false);
7493
7494 SmallVector<Value *, 3> NumTeamsC;
7495 for (auto [DefaultVal, RuntimeVal] :
7496 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
7497 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
7498 : Builder.getInt32(DefaultVal));
7499
7500 // Calculate number of threads: 0 if no clauses specified, otherwise it is
7501 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
7502 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
7503 if (Clause)
7504 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
7505 /*isSigned=*/false);
7506 return Clause;
7507 };
7508 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
7509 if (Clause)
7510 Result =
7511 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
7512 Result, Clause)
7513 : Clause;
7514 };
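    // E.g. with runtime clauses thread_limit(8) and num_threads(4), the
    // selects above chain to the unsigned minimum, so 4 threads are requested.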
7515
7516 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
7517     // the NUM_THREADS clause is overridden by THREAD_LIMIT.
7518 SmallVector<Value *, 3> NumThreadsC;
7519 Value *MaxThreadsClause =
7520 RuntimeAttrs.TeamsThreadLimit.size() == 1
7521 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
7522 : nullptr;
7523
7524 for (auto [TeamsVal, TargetVal] : zip_equal(
7525 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
7526 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
7527 Value *NumThreads = InitMaxThreadsClause(TargetVal);
7528
7529 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
7530 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
7531
7532 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
7533 }
7534
7535 unsigned NumTargetItems = Info.NumberOfPtrs;
7536 // TODO: Use correct device ID
7537 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7538 uint32_t SrcLocStrSize;
7539 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7540 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7541 llvm::omp::IdentFlag(0), 0);
7542
7543 Value *TripCount = RuntimeAttrs.LoopTripCount
7544 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
7545 Builder.getInt64Ty(),
7546 /*isSigned=*/false)
7547 : Builder.getInt64(0);
7548
7549 // TODO: Use correct DynCGGroupMem
7550 Value *DynCGGroupMem = Builder.getInt32(0);
7551
7552 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
7553 NumTeamsC, NumThreadsC,
7554 DynCGGroupMem, HasNoWait);
7555
7556 // Assume no error was returned because TaskBodyCB and
7557 // EmitTargetCallFallbackCB don't produce any.
7558     OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7559       // The presence of certain clauses on the target directive requires the
7560 // explicit generation of the target task.
7561 if (RequiresOuterTargetTask)
7562 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7563 Dependencies, HasNoWait);
7564
7565 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7566 EmitTargetCallFallbackCB, KArgs,
7567 DeviceID, RTLoc, AllocaIP);
7568 }());
7569
7570 Builder.restoreIP(AfterIP);
7571 return Error::success();
7572 };
7573
7574 // If we don't have an ID for the target region, it means an offload entry
7575 // wasn't created. In this case we just run the host fallback directly and
7576 // ignore any potential 'if' clauses.
7577 if (!OutlinedFnID) {
7578 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
7579 return;
7580 }
7581
7582 // If there's no 'if' clause, only generate the kernel launch code path.
7583 if (!IfCond) {
7584 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
7585 return;
7586 }
7587
7588 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
7589 EmitTargetCallElse, AllocaIP));
7590}
7591
7592 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
7593 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7594 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7595 const TargetKernelDefaultAttrs &DefaultAttrs,
7596 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
7597     SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
7598     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7599     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7600 SmallVector<DependData> Dependencies, bool HasNowait) {
7601
7602 if (!updateToLocation(Loc))
7603 return InsertPointTy();
7604
7605 Builder.restoreIP(CodeGenIP);
7606
7607 Function *OutlinedFn;
7608 Constant *OutlinedFnID = nullptr;
7609 // The target region is outlined into its own function. The LLVM IR for
7610 // the target region itself is generated using the callbacks CBFunc
7611   // and ArgAccessorFuncCB.
7612   if (Error Err = emitTargetOutlinedFunction(
7613 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
7614 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB))
7615 return Err;
7616
7617 // If we are not on the target device, then we need to generate code
7618 // to make a remote call (offload) to the previously outlined function
7619 // that represents the target region. Do that now.
7620 if (!Config.isTargetDevice())
7621 emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond,
7622 OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies,
7623 HasNowait);
7624 return Builder.saveIP();
7625}
7626
7627std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7628 StringRef FirstSeparator,
7629 StringRef Separator) {
7630 SmallString<128> Buffer;
7631 llvm::raw_svector_ostream OS(Buffer);
7632 StringRef Sep = FirstSeparator;
7633 for (StringRef Part : Parts) {
7634 OS << Sep << Part;
7635 Sep = Separator;
7636 }
7637 return OS.str().str();
7638}
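// Example (illustrative only): getNameWithSeparators({"a", "b", "c"}, "_", "$")
// produces "_a$b$c" -- the first separator precedes the first part, the regular
// separator joins the remaining parts.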
7639
7640std::string
7641OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7642 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7643 Config.separator());
7644}
7645
7646GlobalVariable *
7647OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
7648 unsigned AddressSpace) {
7649 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7650 if (Elem.second) {
7651 assert(Elem.second->getValueType() == Ty &&
7652 "OMP internal variable has different type than requested");
7653 } else {
7654 // TODO: investigate the appropriate linkage type used for the global
7655 // variable for possibly changing that to internal or private, or maybe
7656 // create different versions of the function for different OMP internal
7657 // variables.
7658 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7659 ? GlobalValue::InternalLinkage
7660 : GlobalValue::CommonLinkage;
7661 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7662 Constant::getNullValue(Ty), Elem.first(),
7663 /*InsertBefore=*/nullptr,
7664 GlobalValue::NotThreadLocal, AddressSpace);
7665 const DataLayout &DL = M.getDataLayout();
7666 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7667 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7668 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7669 Elem.second = GV;
7670 }
7671
7672 return Elem.second;
7673}
7674
7675Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7676 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7677 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7678 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7679}
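// Example (illustrative only): for CriticalName "foo" with the default "."
// separators, the lock variable is named ".gomp_critical_user_foo.var".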
7680
7681Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7682 LLVMContext &Ctx = Builder.getContext();
7683 Value *Null =
7684 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7685 Value *SizeGep =
7686 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7687 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7688 return SizePtrToInt;
7689}
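// This is the classic null-GEP sizeof idiom. Rough IR sketch for a ptr-typed
// BasePtr on a 64-bit target:
//   %gep  = getelementptr ptr, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64   ; i.e. the pointer size, 8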
7690
7691GlobalVariable *
7692OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7693 std::string VarName) {
7694 llvm::Constant *MaptypesArrayInit =
7695 llvm::ConstantDataArray::get(M.getContext(), Mappings);
7696 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7697 M, MaptypesArrayInit->getType(),
7698 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7699 VarName);
7700 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7701 return MaptypesArrayGlobal;
7702}
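// Emitted global (sketch, assuming two tofrom map operands; 0x23 == 35 is
// OMP_MAP_TO | OMP_MAP_FROM | OMP_MAP_TARGET_PARAM):
//   @.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35]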
7703
7704void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7705 InsertPointTy AllocaIP,
7706 unsigned NumOperands,
7707 struct MapperAllocas &MapperAllocas) {
7708 if (!updateToLocation(Loc))
7709 return;
7710
7711 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7712 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7713 Builder.restoreIP(AllocaIP);
7714 AllocaInst *ArgsBase = Builder.CreateAlloca(
7715 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7716 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7717 ".offload_ptrs");
7718 AllocaInst *ArgSizes = Builder.CreateAlloca(
7719 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7720 Builder.restoreIP(Loc.IP);
7721 MapperAllocas.ArgsBase = ArgsBase;
7722 MapperAllocas.Args = Args;
7723 MapperAllocas.ArgSizes = ArgSizes;
7724}
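// The three allocas created here look like (sketch, NumOperands == N):
//   %.offload_baseptrs = alloca [N x ptr]
//   %.offload_ptrs     = alloca [N x ptr]
//   %.offload_sizes    = alloca [N x i64]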
7725
7726void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7727 Function *MapperFunc, Value *SrcLocInfo,
7728 Value *MaptypesArg, Value *MapnamesArg,
7729 struct MapperAllocas &MapperAllocas,
7730 int64_t DeviceID, unsigned NumOperands) {
7731 if (!updateToLocation(Loc))
7732 return;
7733
7734 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7735 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7736 Value *ArgsBaseGEP =
7737 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7738 {Builder.getInt32(0), Builder.getInt32(0)});
7739 Value *ArgsGEP =
7740 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7741 {Builder.getInt32(0), Builder.getInt32(0)});
7742 Value *ArgSizesGEP =
7743 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7744 {Builder.getInt32(0), Builder.getInt32(0)});
7745 Value *NullPtr =
7746 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7747 Builder.CreateCall(MapperFunc,
7748 {SrcLocInfo, Builder.getInt64(DeviceID),
7749 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7750 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7751}
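// The emitted call has the shape (sketch; MapperFunc is typically one of the
// __tgt_target_data_{begin,end,update}_mapper entry points):
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 %device,
//       i32 %numops, ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes,
//       ptr %mapnames, ptr null)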
7752
7753void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7754 TargetDataRTArgs &RTArgs,
7755 TargetDataInfo &Info,
7756 bool ForEndCall) {
7757 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7758 "expected region end call to runtime only when end call is separate");
7759 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7760 auto VoidPtrTy = UnqualPtrTy;
7761 auto VoidPtrPtrTy = UnqualPtrTy;
7762 auto Int64Ty = Type::getInt64Ty(M.getContext());
7763 auto Int64PtrTy = UnqualPtrTy;
7764
7765 if (!Info.NumberOfPtrs) {
7766 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7767 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7768 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7769 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7770 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7771 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7772 return;
7773 }
7774
7775 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7776 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7777 Info.RTArgs.BasePointersArray,
7778 /*Idx0=*/0, /*Idx1=*/0);
7779 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7780 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7781 /*Idx0=*/0,
7782 /*Idx1=*/0);
7783 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7784 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7785 /*Idx0=*/0, /*Idx1=*/0);
7786 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7787 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7788 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7789 : Info.RTArgs.MapTypesArray,
7790 /*Idx0=*/0,
7791 /*Idx1=*/0);
7792
7793 // Only emit the mapper information arrays if debug information is
7794 // requested.
7795 if (!Info.EmitDebug)
7796 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7797 else
7798 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7799 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7800 /*Idx0=*/0,
7801 /*Idx1=*/0);
7802 // If there is no user-defined mapper, set the mapper array to nullptr to
7803 // avoid an unnecessary data privatization
7804 if (!Info.HasMapper)
7805 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7806 else
7807 RTArgs.MappersArray =
7808 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7809}
7810
7811void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7812 InsertPointTy CodeGenIP,
7813 MapInfosTy &CombinedInfo,
7814 TargetDataInfo &Info) {
7815 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7816 CombinedInfo.NonContigInfo;
7817
7818 // Build an array of struct descriptor_dim and then assign it to
7819 // offload_args.
7820 //
7821 // struct descriptor_dim {
7822 // uint64_t offset;
7823 // uint64_t count;
7824 // uint64_t stride;
7825 // };
7826 Type *Int64Ty = Builder.getInt64Ty();
7827 StructType *DimTy = StructType::create(
7828 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7829 "struct.descriptor_dim");
7830
7831 enum { OffsetFD = 0, CountFD, StrideFD };
7832 // We need two index variables here since the size of "Dims" is the same as
7833 // the size of Components; however, the number of offset, count, and stride
7834 // entries equals the number of base declarations that are non-contiguous.
7835 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7836 // Skip emitting IR if the dimension size is 1 since it cannot be
7837 // non-contiguous.
7838 if (NonContigInfo.Dims[I] == 1)
7839 continue;
7840 Builder.restoreIP(AllocaIP);
7841 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7842 AllocaInst *DimsAddr =
7843 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7844 Builder.restoreIP(CodeGenIP);
7845 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7846 unsigned RevIdx = EE - II - 1;
7847 Value *DimsLVal = Builder.CreateInBoundsGEP(
7848 DimsAddr->getAllocatedType(), DimsAddr,
7849 {Builder.getInt64(0), Builder.getInt64(II)});
7850 // Offset
7851 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7852 Builder.CreateAlignedStore(
7853 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7854 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7855 // Count
7856 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7857 Builder.CreateAlignedStore(
7858 NonContigInfo.Counts[L][RevIdx], CountLVal,
7859 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7860 // Stride
7861 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7862 Builder.CreateAlignedStore(
7863 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7864 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7865 }
7866 // args[I] = &dims
7867 Builder.restoreIP(CodeGenIP);
7868 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7869 DimsAddr, Builder.getPtrTy());
7870 Value *P = Builder.CreateConstInBoundsGEP2_32(
7871 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7872 Info.RTArgs.PointersArray, 0, I);
7873 Builder.CreateAlignedStore(
7874 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7875 ++L;
7876 }
7877}
7878
7879void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7880 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7881 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7882 BasicBlock *ExitBB, bool IsInit) {
7883 StringRef Prefix = IsInit ? ".init" : ".del";
7884
7885 // Evaluate if this is an array section.
7886 BasicBlock *BodyBB = BasicBlock::Create(
7887 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
7888 Value *IsArray =
7889 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7890 Value *DeleteBit = Builder.CreateAnd(
7891 MapType,
7892 Builder.getInt64(
7893 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7894 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7895 Value *DeleteCond;
7896 Value *Cond;
7897 if (IsInit) {
7898 // base != begin?
7899 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7900 // IsPtrAndObj?
7901 Value *PtrAndObjBit = Builder.CreateAnd(
7902 MapType,
7903 Builder.getInt64(
7904 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7905 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7906 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7907 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7908 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7909 DeleteCond = Builder.CreateIsNull(
7910 DeleteBit,
7911 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7912 } else {
7913 Cond = IsArray;
7914 DeleteCond = Builder.CreateIsNotNull(
7915 DeleteBit,
7916 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7917 }
7918 Cond = Builder.CreateAnd(Cond, DeleteCond);
7919 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7920
7921 emitBlock(BodyBB, MapperFn);
7922 // Get the array size by multiplying element size and element number (i.e., \p
7923 // Size).
7924 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
7925 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
7926 // memory allocation/deletion purpose only.
7927 Value *MapTypeArg = Builder.CreateAnd(
7928 MapType,
7929 Builder.getInt64(
7930 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7931 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7932 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7933 MapTypeArg = Builder.CreateOr(
7934 MapTypeArg,
7935 Builder.getInt64(
7936 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7937 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
7938
7939 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7940 // data structure.
7941 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
7942 ArraySize, MapTypeArg, MapName};
7943 Builder.CreateCall(
7944 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7945 OffloadingArgs);
7946}
7947
7948Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
7949 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
7950 llvm::Value *BeginArg)>
7951 GenMapInfoCB,
7952 Type *ElemTy, StringRef FuncName,
7953 function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
7954 SmallVector<Type *> Params;
7955 Params.emplace_back(Builder.getPtrTy());
7956 Params.emplace_back(Builder.getPtrTy());
7957 Params.emplace_back(Builder.getPtrTy());
7958 Params.emplace_back(Builder.getInt64Ty());
7959 Params.emplace_back(Builder.getInt64Ty());
7960 Params.emplace_back(Builder.getPtrTy());
7961
7962 auto *FnTy =
7963 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
7964
7965 SmallString<64> TyStr;
7966 raw_svector_ostream Out(TyStr);
7967 Function *MapperFn =
7968 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
7969 MapperFn->addFnAttr(Attribute::NoInline);
7970 MapperFn->addFnAttr(Attribute::NoUnwind);
7971 MapperFn->addParamAttr(0, Attribute::NoUndef);
7972 MapperFn->addParamAttr(1, Attribute::NoUndef);
7973 MapperFn->addParamAttr(2, Attribute::NoUndef);
7974 MapperFn->addParamAttr(3, Attribute::NoUndef);
7975 MapperFn->addParamAttr(4, Attribute::NoUndef);
7976 MapperFn->addParamAttr(5, Attribute::NoUndef);
7977
7978 // Start the mapper function code generation.
7979 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
7980 auto SavedIP = Builder.saveIP();
7981 Builder.SetInsertPoint(EntryBB);
7982
7983 Value *MapperHandle = MapperFn->getArg(0);
7984 Value *BaseIn = MapperFn->getArg(1);
7985 Value *BeginIn = MapperFn->getArg(2);
7986 Value *Size = MapperFn->getArg(3);
7987 Value *MapType = MapperFn->getArg(4);
7988 Value *MapName = MapperFn->getArg(5);
7989
7990 // Compute the starting and end addresses of array elements.
7991 // Prepare common arguments for array initiation and deletion.
7992 // Convert the size in bytes into the number of array elements.
7993 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
7994 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
7995 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
7996 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
7997
7998 // Emit array initiation if this is an array section and \p MapType indicates
7999 // that memory allocation is required.
8000 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8001 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8002 MapType, MapName, ElementSize, HeadBB,
8003 /*IsInit=*/true);
8004
8005 // Emit a for loop that iterates through \p Size elements, mapping each one.
8006
8007 // Emit the loop header block.
8008 emitBlock(HeadBB, MapperFn);
8009 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8010 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8011 // Evaluate whether the initial condition is satisfied.
8012 Value *IsEmpty =
8013 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8014 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8015
8016 // Emit the loop body block.
8017 emitBlock(BodyBB, MapperFn);
8018 BasicBlock *LastBB = BodyBB;
8019 PHINode *PtrPHI =
8020 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8021 PtrPHI->addIncoming(PtrBegin, HeadBB);
8022
8023 // Get map clause information. Fill up the arrays with all mapped variables.
8024 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8025
8026 // Call the runtime API __tgt_mapper_num_components to get the number of
8027 // pre-existing components.
8028 Value *OffloadingArgs[] = {MapperHandle};
8029 Value *PreviousSize = Builder.CreateCall(
8030 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8031 OffloadingArgs);
8032 Value *ShiftedPreviousSize =
8033 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8034
8035 // Fill up the runtime mapper handle for all components.
8036 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
8037 Value *CurBaseArg =
8038 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
8039 Value *CurBeginArg =
8040 Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy());
8041 Value *CurSizeArg = Info.Sizes[I];
8042 Value *CurNameArg = Info.Names.size()
8043 ? Info.Names[I]
8044 : Constant::getNullValue(Builder.getPtrTy());
8045
8046 // Extract the MEMBER_OF field from the map type.
8047 Value *OriMapType = Builder.getInt64(
8048 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8049 Info.Types[I]));
8050 Value *MemberMapType =
8051 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8052
8053 // Combine the map type inherited from user-defined mapper with that
8054 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8055 // bits of the \a MapType, which is the input argument of the mapper
8056 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8057 // bits of MemberMapType.
8058 // [OpenMP 5.0], 1.2.6. map-type decay.
8059 // | alloc | to | from | tofrom | release | delete
8060 // ----------------------------------------------------------
8061 // alloc | alloc | alloc | alloc | alloc | release | delete
8062 // to | alloc | to | alloc | to | release | delete
8063 // from | alloc | alloc | from | from | release | delete
8064 // tofrom | alloc | to | from | tofrom | release | delete
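    // For example (illustrative): a mapper member declared `to` that is
    // invoked with `from` decays to `alloc`, i.e. both OMP_MAP_TO and
    // OMP_MAP_FROM end up cleared by the branches generated below.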
8065 Value *LeftToFrom = Builder.CreateAnd(
8066 MapType,
8067 Builder.getInt64(
8068 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8069 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8070 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8071 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8072 BasicBlock *AllocElseBB =
8073 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8074 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8075 BasicBlock *ToElseBB =
8076 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8077 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8078 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8079 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8080 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8081 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8082 emitBlock(AllocBB, MapperFn);
8083 Value *AllocMapType = Builder.CreateAnd(
8084 MemberMapType,
8085 Builder.getInt64(
8086 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8087 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8088 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8089 Builder.CreateBr(EndBB);
8090 emitBlock(AllocElseBB, MapperFn);
8091 Value *IsTo = Builder.CreateICmpEQ(
8092 LeftToFrom,
8093 Builder.getInt64(
8094 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8095 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8096 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8097 // In case of to, clear OMP_MAP_FROM.
8098 emitBlock(ToBB, MapperFn);
8099 Value *ToMapType = Builder.CreateAnd(
8100 MemberMapType,
8101 Builder.getInt64(
8102 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8103 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8104 Builder.CreateBr(EndBB);
8105 emitBlock(ToElseBB, MapperFn);
8106 Value *IsFrom = Builder.CreateICmpEQ(
8107 LeftToFrom,
8108 Builder.getInt64(
8109 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8110 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8111 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8112 // In case of from, clear OMP_MAP_TO.
8113 emitBlock(FromBB, MapperFn);
8114 Value *FromMapType = Builder.CreateAnd(
8115 MemberMapType,
8116 Builder.getInt64(
8117 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8118 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8119 // In case of tofrom, do nothing.
8120 emitBlock(EndBB, MapperFn);
8121 LastBB = EndBB;
8122 PHINode *CurMapType =
8123 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8124 CurMapType->addIncoming(AllocMapType, AllocBB);
8125 CurMapType->addIncoming(ToMapType, ToBB);
8126 CurMapType->addIncoming(FromMapType, FromBB);
8127 CurMapType->addIncoming(MemberMapType, ToElseBB);
8128
8129 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8130 CurSizeArg, CurMapType, CurNameArg};
8131 Function *ChildMapperFn = nullptr;
8132 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
8133 // Call the corresponding mapper function.
8134 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8135 } else {
8136 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8137 // data structure.
8138 Builder.CreateCall(
8139 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8140 OffloadingArgs);
8141 }
8142 }
8143
8144 // Update the pointer to point to the next element that needs to be mapped,
8145 // and check whether we have mapped all elements.
8146 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8147 "omp.arraymap.next");
8148 PtrPHI->addIncoming(PtrNext, LastBB);
8149 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8150 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8151 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8152
8153 emitBlock(ExitBB, MapperFn);
8154 // Emit array deletion if this is an array section and \p MapType indicates
8155 // that deletion is required.
8156 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8157 MapType, MapName, ElementSize, DoneBB,
8158 /*IsInit=*/false);
8159
8160 // Emit the function exit block.
8161 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8162
8163 Builder.CreateRetVoid();
8164 Builder.restoreIP(SavedIP);
8165 return MapperFn;
8166}
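// Overall shape of the generated mapper (sketch):
//   entry -> omp.arraymap.head            ; init check, emptiness check
//         -> omp.arraymap.body            ; per-element map-type decay + push
//         -> omp.arraymap.exit -> omp.done ; deletion check, return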
8167
8168void OpenMPIRBuilder::emitOffloadingArrays(
8169 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8170 TargetDataInfo &Info, bool IsNonContiguous,
8171 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
8172 function_ref<Value *(unsigned int)> CustomMapperCB) {
8173
8174 // Reset the array information.
8175 Info.clearArrayInfo();
8176 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8177
8178 if (Info.NumberOfPtrs == 0)
8179 return;
8180
8181 Builder.restoreIP(AllocaIP);
8182 // Detect whether any captured size requires runtime evaluation, so that a
8183 // constant array can be used whenever possible.
8184 ArrayType *PointerArrayType =
8185 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8186
8187 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8188 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8189
8190 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8191 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8192 AllocaInst *MappersArray = Builder.CreateAlloca(
8193 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8194 Info.RTArgs.MappersArray = MappersArray;
8195
8196 // If we don't have any VLA types or other types that require runtime
8197 // evaluation, we can use a constant array for the map sizes, otherwise we
8198 // need to fill up the arrays as we do for the pointers.
8199 Type *Int64Ty = Builder.getInt64Ty();
8200 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8201 ConstantInt::get(Int64Ty, 0));
8202 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8203 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
8204 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
8205 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
8206 if (IsNonContiguous &&
8207 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8208 CombinedInfo.Types[I] &
8209 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
8210 ConstSizes[I] =
8211 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
8212 else
8213 ConstSizes[I] = CI;
8214 continue;
8215 }
8216 }
8217 RuntimeSizes.set(I);
8218 }
8219
8220 if (RuntimeSizes.all()) {
8221 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8222 Info.RTArgs.SizesArray = Builder.CreateAlloca(
8223 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8224 Builder.restoreIP(CodeGenIP);
8225 } else {
8226 auto *SizesArrayInit = ConstantArray::get(
8227 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
8228 std::string Name = createPlatformSpecificName({"offload_sizes"});
8229 auto *SizesArrayGbl =
8230 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
8231 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
8232 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
8233
8234 if (!RuntimeSizes.any()) {
8235 Info.RTArgs.SizesArray = SizesArrayGbl;
8236 } else {
8237 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8238 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
8239 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8240 AllocaInst *Buffer = Builder.CreateAlloca(
8241 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8242 Buffer->setAlignment(OffloadSizeAlign);
8243 Builder.restoreIP(CodeGenIP);
8244 Builder.CreateMemCpy(
8245 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
8246 SizesArrayGbl, OffloadSizeAlign,
8247 Builder.getIntN(
8248 IndexSize,
8249 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
8250
8251 Info.RTArgs.SizesArray = Buffer;
8252 }
8253 Builder.restoreIP(CodeGenIP);
8254 }
8255
8256 // The map types are always constant so we don't need to generate code to
8257 // fill arrays. Instead, we create an array constant.
8258 SmallVector<uint64_t, 4> Mapping;
8259 for (auto mapFlag : CombinedInfo.Types)
8260 Mapping.push_back(
8261 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8262 mapFlag));
8263 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
8264 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8265 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
8266
8267 // The information types are only built if provided.
8268 if (!CombinedInfo.Names.empty()) {
8269 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
8270 auto *MapNamesArrayGbl =
8271 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
8272 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
8273 Info.EmitDebug = true;
8274 } else {
8275 Info.RTArgs.MapNamesArray =
8276 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
8277 Info.EmitDebug = false;
8278 }
8279
8280 // If there's a present map type modifier, it must not be applied to the end
8281 // of a region, so generate a separate map type array in that case.
8282 if (Info.separateBeginEndCalls()) {
8283 bool EndMapTypesDiffer = false;
8284 for (uint64_t &Type : Mapping) {
8285 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8286 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
8287 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8288 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
8289 EndMapTypesDiffer = true;
8290 }
8291 }
8292 if (EndMapTypesDiffer) {
8293 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8294 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
8295 }
8296 }
8297
8298 PointerType *PtrTy = Builder.getPtrTy();
8299 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
8300 Value *BPVal = CombinedInfo.BasePointers[I];
8301 Value *BP = Builder.CreateConstInBoundsGEP2_32(
8302 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
8303 0, I);
8304 Builder.CreateAlignedStore(BPVal, BP,
8305 M.getDataLayout().getPrefTypeAlign(PtrTy));
8306
8307 if (Info.requiresDevicePointerInfo()) {
8308 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
8309 CodeGenIP = Builder.saveIP();
8310 Builder.restoreIP(AllocaIP);
8311 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
8312 Builder.restoreIP(CodeGenIP);
8313 if (DeviceAddrCB)
8314 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
8315 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
8316 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
8317 if (DeviceAddrCB)
8318 DeviceAddrCB(I, BP);
8319 }
8320 }
8321
8322 Value *PVal = CombinedInfo.Pointers[I];
8323 Value *P = Builder.CreateConstInBoundsGEP2_32(
8324 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
8325 I);
8326 // TODO: Check that the alignment is correct.
8327 Builder.CreateAlignedStore(PVal, P,
8328 M.getDataLayout().getPrefTypeAlign(PtrTy));
8329
8330 if (RuntimeSizes.test(I)) {
8331 Value *S = Builder.CreateConstInBoundsGEP2_32(
8332 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8333 /*Idx0=*/0,
8334 /*Idx1=*/I);
8335 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
8336 Int64Ty,
8337 /*isSigned=*/true),
8338 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
8339 }
8340 // Fill up the mapper array.
8341 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8342 Value *MFunc = ConstantPointerNull::get(PtrTy);
8343 if (CustomMapperCB)
8344 if (Value *CustomMFunc = CustomMapperCB(I))
8345 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
8346 Value *MAddr = Builder.CreateInBoundsGEP(
8347 MappersArray->getAllocatedType(), MappersArray,
8348 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
8349 Builder.CreateAlignedStore(
8350 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
8351 }
8352
8353 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
8354 Info.NumberOfPtrs == 0)
8355 return;
8356 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
8357}
8358
8359void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
8360 BasicBlock *CurBB = Builder.GetInsertBlock();
8361
8362 if (!CurBB || CurBB->getTerminator()) {
8363 // If there is no insert point or the previous block is already
8364 // terminated, don't touch it.
8365 } else {
8366 // Otherwise, create a fall-through branch.
8367 Builder.CreateBr(Target);
8368 }
8369
8370 Builder.ClearInsertionPoint();
8371}
8372
8373void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
8374 bool IsFinished) {
8375 BasicBlock *CurBB = Builder.GetInsertBlock();
8376
8377 // Fall out of the current block (if necessary).
8378 emitBranch(BB);
8379
8380 if (IsFinished && BB->use_empty()) {
8381 BB->eraseFromParent();
8382 return;
8383 }
8384
8385 // Place the block after the current block, if possible, or else at
8386 // the end of the function.
8387 if (CurBB && CurBB->getParent())
8388 CurFn->insert(std::next(CurBB->getIterator()), BB);
8389 else
8390 CurFn->insert(CurFn->end(), BB);
8391 Builder.SetInsertPoint(BB);
8392}
8393
8394Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
8395 BodyGenCallbackTy ElseGen,
8396 InsertPointTy AllocaIP) {
8397 // If the condition constant folds and can be elided, try to avoid emitting
8398 // the condition and the dead arm of the if/else.
8399 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
8400 auto CondConstant = CI->getSExtValue();
8401 if (CondConstant)
8402 return ThenGen(AllocaIP, Builder.saveIP());
8403
8404 return ElseGen(AllocaIP, Builder.saveIP());
8405 }
8406
8407 Function *CurFn = Builder.GetInsertBlock()->getParent();
8408
8409 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8410 // emit the conditional branch.
8411 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8412 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8413 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8414 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8415 // Emit the 'then' code.
8416 emitBlock(ThenBlock, CurFn);
8417 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8418 return Err;
8419 emitBranch(ContBlock);
8420 // Emit the 'else' code if present.
8421 // There is no need to emit line number for unconditional branch.
8422 emitBlock(ElseBlock, CurFn);
8423 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8424 return Err;
8425 // There is no need to emit line number for unconditional branch.
8426 emitBranch(ContBlock);
8427 // Emit the continuation block for code after the if.
8428 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8429 return Error::success();
8430}
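// Resulting CFG when the condition does not fold (sketch):
//   br i1 %cond, label %omp_if.then, label %omp_if.else
//   omp_if.then: <ThenGen code> ; br label %omp_if.end
//   omp_if.else: <ElseGen code> ; br label %omp_if.end
//   omp_if.end:  ...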
8431
8432bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8433 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8434 assert(!(AO == AtomicOrdering::NotAtomic ||
8435 AO == AtomicOrdering::Unordered) &&
8436 "Unexpected Atomic Ordering.");
8437
8438 bool Flush = false;
8439 llvm::AtomicOrdering FlushAO = llvm::AtomicOrdering::Monotonic;
8440
8441 switch (AK) {
8442 case Read:
8443 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
8444 AO == AtomicOrdering::SequentiallyConsistent) {
8445 FlushAO = AtomicOrdering::Acquire;
8446 Flush = true;
8447 }
8448 break;
8449 case Write:
8450 case Compare:
8451 case Update:
8452 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
8453 AO == AtomicOrdering::SequentiallyConsistent) {
8454 FlushAO = AtomicOrdering::Release;
8455 Flush = true;
8456 }
8457 break;
8458 case Capture:
8459 switch (AO) {
8460 case AtomicOrdering::Acquire:
8461 FlushAO = AtomicOrdering::Acquire;
8462 Flush = true;
8463 break;
8464 case AtomicOrdering::Release:
8465 FlushAO = AtomicOrdering::Release;
8466 Flush = true;
8467 break;
8468 case AtomicOrdering::AcquireRelease:
8469 case AtomicOrdering::SequentiallyConsistent:
8470 FlushAO = AtomicOrdering::AcquireRelease;
8471 Flush = true;
8472 break;
8473 default:
8474 // do nothing - leave silently.
8475 break;
8476 }
8477 }
8478
8479 if (Flush) {
8480 // The Flush RT call does not take a memory ordering yet. Until it does,
8481 // this still resolves which atomic ordering the flush would use, but
8482 // issues the plain flush call.
8483 // TODO: pass `FlushAO` after memory ordering support is added
8484 (void)FlushAO;
8485 emitFlush(Loc);
8486 }
8487
8488 // For AO == AtomicOrdering::Monotonic and all other case combinations,
8489 // do nothing.
8490 return Flush;
8491}
8492
8493OpenMPIRBuilder::InsertPointTy
8494OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
8495 AtomicOpValue &X, AtomicOpValue &V,
8496 AtomicOrdering AO) {
8497 if (!updateToLocation(Loc))
8498 return Loc.IP;
8499
8500 assert(X.Var->getType()->isPointerTy() &&
8501 "OMP Atomic expects a pointer to target memory");
8502 Type *XElemTy = X.ElemTy;
8503 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8504 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
8505 "OMP atomic read expected a scalar type");
8506
8507 Value *XRead = nullptr;
8508
8509 if (XElemTy->isIntegerTy()) {
8510 LoadInst *XLD =
8511 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
8512 XLD->setAtomic(AO);
8513 XRead = cast<Value>(XLD);
8514 } else if (XElemTy->isStructTy()) {
8515 // FIXME: Add checks to ensure __atomic_load is emitted iff the
8516 // target does not support `atomicrmw` of the size of the struct
8517 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
8518 OldVal->setAtomic(AO);
8519 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8520 unsigned LoadSize =
8521 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8522 OpenMPIRBuilder::AtomicInfo atomicInfo(
8523 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8524 OldVal->getAlign(), true /* UseLibcall */, X.Var);
8525 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8526 XRead = AtomicLoadRes.first;
8527 OldVal->eraseFromParent();
8528 } else {
8529 // We need to perform atomic op as integer
8530 IntegerType *IntCastTy =
8531 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8532 LoadInst *XLoad =
8533 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
8534 XLoad->setAtomic(AO);
8535 if (XElemTy->isFloatingPointTy()) {
8536 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
8537 } else {
8538 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
8539 }
8540 }
8541 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
8542 if (XRead->getType() != V.Var->getType())
8543 XRead = emitImplicitCast(Builder, XRead, V.Var);
8544 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
8545 return Builder.saveIP();
8546}
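// For an integer x with acquire ordering, the emitted IR is roughly (sketch):
//   %omp.atomic.read = load atomic i32, ptr %x acquire, align 4
//   store i32 %omp.atomic.read, ptr %v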
8547
8548OpenMPIRBuilder::InsertPointTy
8549OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
8550 AtomicOpValue &X, Value *Expr,
8551 AtomicOrdering AO) {
8552 if (!updateToLocation(Loc))
8553 return Loc.IP;
8554
8555 assert(X.Var->getType()->isPointerTy() &&
8556 "OMP Atomic expects a pointer to target memory");
8557 Type *XElemTy = X.ElemTy;
8558 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8559 XElemTy->isPointerTy()) &&
8560 "OMP atomic write expected a scalar type");
8561
8562 if (XElemTy->isIntegerTy()) {
8563 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
8564 XSt->setAtomic(AO);
8565 } else {
8566 // We need to bitcast and perform atomic op as integers
8567 IntegerType *IntCastTy =
8568 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8569 Value *ExprCast =
8570 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
8571 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
8572 XSt->setAtomic(AO);
8573 }
8574
8575 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
8576 return Builder.saveIP();
8577}
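// For a float x, the store is performed as an integer of equal width (sketch):
//   %atomic.src.int.cast = bitcast float %expr to i32
//   store atomic i32 %atomic.src.int.cast, ptr %x monotonic, align 4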
8578
8579OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
8580 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8581 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8582 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
8583 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
8584 if (!updateToLocation(Loc))
8585 return Loc.IP;
8586
8587 LLVM_DEBUG({
8588 Type *XTy = X.Var->getType();
8589 assert(XTy->isPointerTy() &&
8590 "OMP Atomic expects a pointer to target memory");
8591 Type *XElemTy = X.ElemTy;
8592 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8593 XElemTy->isPointerTy()) &&
8594 "OMP atomic update expected a scalar type");
8595 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8596 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
8597 "OpenMP atomic does not support LT or GT operations");
8598 });
8599
8600 Expected<std::pair<Value *, Value *>> AtomicResult =
8601 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
8602 X.IsVolatile, IsXBinopExpr);
8603 if (!AtomicResult)
8604 return AtomicResult.takeError();
8605 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
8606 return Builder.saveIP();
8607}
8608
8609// FIXME: Duplicating AtomicExpand
8610Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
8611 AtomicRMWInst::BinOp RMWOp) {
8612 switch (RMWOp) {
8613 case AtomicRMWInst::Add:
8614 return Builder.CreateAdd(Src1, Src2);
8615 case AtomicRMWInst::Sub:
8616 return Builder.CreateSub(Src1, Src2);
8617 case AtomicRMWInst::And:
8618 return Builder.CreateAnd(Src1, Src2);
8619 case AtomicRMWInst::Nand:
8620 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
8621 case AtomicRMWInst::Or:
8622 return Builder.CreateOr(Src1, Src2);
8623 case AtomicRMWInst::Xor:
8624 return Builder.CreateXor(Src1, Src2);
8625 case AtomicRMWInst::Xchg:
8626 case AtomicRMWInst::FAdd:
8627 case AtomicRMWInst::FSub:
8628 case AtomicRMWInst::BAD_BINOP:
8629 case AtomicRMWInst::Max:
8630 case AtomicRMWInst::Min:
8631 case AtomicRMWInst::UMax:
8632 case AtomicRMWInst::UMin:
8633 case AtomicRMWInst::FMax:
8634 case AtomicRMWInst::FMin:
8635 case AtomicRMWInst::UIncWrap:
8636 case AtomicRMWInst::UDecWrap:
8637 case AtomicRMWInst::USubCond:
8638 case AtomicRMWInst::USubSat:
8639 llvm_unreachable("Unsupported atomic update operation");
8640 }
8641 llvm_unreachable("Unsupported atomic update operation");
8642}
8643
8644Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
8645 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
8646 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8647 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
8648 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
8649 // or a complex datatype.
8650 bool emitRMWOp = false;
8651 switch (RMWOp) {
8652 case AtomicRMWInst::Add:
8653 case AtomicRMWInst::And:
8654 case AtomicRMWInst::Nand:
8655 case AtomicRMWInst::Or:
8656 case AtomicRMWInst::Xor:
8657 case AtomicRMWInst::Xchg:
8658 emitRMWOp = XElemTy;
8659 break;
8660 case AtomicRMWInst::Sub:
8661 emitRMWOp = (IsXBinopExpr && XElemTy);
8662 break;
8663 default:
8664 emitRMWOp = false;
8665 }
8666 emitRMWOp &= XElemTy->isIntegerTy();
8667
8668 std::pair<Value *, Value *> Res;
8669 if (emitRMWOp) {
8670 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
8671 // Not needed except in case of postfix captures. Generated anyway for
8672 // consistency with the else part; any DCE pass will remove it.
8673 // AtomicRMWInst::Xchg does not have a corresponding instruction.
8674 if (RMWOp == AtomicRMWInst::Xchg)
8675 Res.second = Res.first;
8676 else
8677 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
8678 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
8679 XElemTy->isStructTy()) {
8680 LoadInst *OldVal =
8681 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
8682 OldVal->setAtomic(AO);
8683 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8684 unsigned LoadSize =
8685 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8686
8687 OpenMPIRBuilder::AtomicInfo atomicInfo(
8688 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8689 OldVal->getAlign(), true /* UseLibcall */, X);
8690 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8691 BasicBlock *CurBB = Builder.GetInsertBlock();
8692 Instruction *CurBBTI = CurBB->getTerminator();
8693 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8694 BasicBlock *ExitBB =
8695 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8696 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8697 X->getName() + ".atomic.cont");
8698 ContBB->getTerminator()->eraseFromParent();
8699 Builder.restoreIP(AllocaIP);
8700 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8701 NewAtomicAddr->setName(X->getName() + "x.new.val");
8702 Builder.SetInsertPoint(ContBB);
8703 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8704 PHI->addIncoming(AtomicLoadRes.first, CurBB);
8705 Value *OldExprVal = PHI;
8706 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8707 if (!CBResult)
8708 return CBResult.takeError();
8709 Value *Upd = *CBResult;
8710 Builder.CreateStore(Upd, NewAtomicAddr);
8711 AtomicOrdering Failure =
8712 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8713 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
8714 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
8715 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
8716 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
8717 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
8718 OldVal->eraseFromParent();
8719 Res.first = OldExprVal;
8720 Res.second = Upd;
8721
8722 if (UnreachableInst *ExitTI =
8723 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8724 CurBBTI->eraseFromParent();
8725 Builder.SetInsertPoint(ExitBB);
8726 } else {
8727 Builder.SetInsertPoint(ExitTI);
8728 }
8729 } else {
8730 IntegerType *IntCastTy =
8731 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8732 LoadInst *OldVal =
8733 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
8734 OldVal->setAtomic(AO);
8735 // CurBB
8736 // | /---\
8737 // ContBB |
8738 // | \---/
8739 // ExitBB
8740 BasicBlock *CurBB = Builder.GetInsertBlock();
8741 Instruction *CurBBTI = CurBB->getTerminator();
8742 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8743 BasicBlock *ExitBB =
8744 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8745 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8746 X->getName() + ".atomic.cont");
8747 ContBB->getTerminator()->eraseFromParent();
8748 Builder.restoreIP(AllocaIP);
8749 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8750 NewAtomicAddr->setName(X->getName() + "x.new.val");
8751 Builder.SetInsertPoint(ContBB);
8752 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8753 PHI->addIncoming(OldVal, CurBB);
8754 bool IsIntTy = XElemTy->isIntegerTy();
8755 Value *OldExprVal = PHI;
8756 if (!IsIntTy) {
8757 if (XElemTy->isFloatingPointTy()) {
8758 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
8759 X->getName() + ".atomic.fltCast");
8760 } else {
8761 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
8762 X->getName() + ".atomic.ptrCast");
8763 }
8764 }
8765
8766 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8767 if (!CBResult)
8768 return CBResult.takeError();
8769 Value *Upd = *CBResult;
8770 Builder.CreateStore(Upd, NewAtomicAddr);
8771 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
8772 AtomicOrdering Failure =
8773 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8774 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
8775 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
8776 Result->setVolatile(VolatileX);
8777 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8778 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8779 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
8780 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
8781
8782 Res.first = OldExprVal;
8783 Res.second = Upd;
8784
8785 // Set the insertion point in the exit block.
8786 if (UnreachableInst *ExitTI =
8787 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8788 CurBBTI->eraseFromParent();
8789 Builder.SetInsertPoint(ExitBB);
8790 } else {
8791 Builder.SetInsertPoint(ExitTI);
8792 }
8793 }
8794
8795 return Res;
8796}
8797
8798OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
8799 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8800 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8801 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
8802 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8803 if (!updateToLocation(Loc))
8804 return Loc.IP;
8805
8806 LLVM_DEBUG({
8807 Type *XTy = X.Var->getType();
8808 assert(XTy->isPointerTy() &&
8809 "OMP Atomic expects a pointer to target memory");
8810 Type *XElemTy = X.ElemTy;
8811 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8812 XElemTy->isPointerTy()) &&
8813 "OMP atomic capture expected a scalar type");
8814 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8815 "OpenMP atomic does not support LT or GT operations");
8816 });
8817
8818 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8819 // 'x' is simply atomically rewritten with 'expr'.
8820 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8821 Expected<std::pair<Value *, Value *>> AtomicResult =
8822 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8823 X.IsVolatile, IsXBinopExpr);
8824 if (!AtomicResult)
8825 return AtomicResult.takeError();
8826 Value *CapturedVal =
8827 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
8828 if (CapturedVal->getType() != V.Var->getType())
8829 CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var);
8830 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8831
8832 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8833 return Builder.saveIP();
8834}
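// Example (illustrative): for `v = x; x += expr;` (postfix capture) the old
// value (AtomicResult->first) is stored to v; for `x += expr; v = x;` the
// updated value (AtomicResult->second) is stored instead.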
8835
8836OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8837 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8838 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8839 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8840 bool IsFailOnly) {
8841
8842 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8843 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8844 IsPostfixUpdate, IsFailOnly, Failure);
8845}
8846
8847OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8848 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8849 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8850 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8851 bool IsFailOnly, AtomicOrdering Failure) {
8852
8853 if (!updateToLocation(Loc))
8854 return Loc.IP;
8855
8856 assert(X.Var->getType()->isPointerTy() &&
8857 "OMP atomic expects a pointer to target memory");
8858 // compare capture
8859 if (V.Var) {
8860 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8861 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8862 }
8863
8864 bool IsInteger = E->getType()->isIntegerTy();
8865
8866 if (Op == OMPAtomicCompareOp::EQ) {
8867 AtomicCmpXchgInst *Result = nullptr;
8868 if (!IsInteger) {
8869 IntegerType *IntCastTy =
8870 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8871 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8872 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8873 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8874 AO, Failure);
8875 } else {
8876 Result =
8877 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8878 }
8879
8880 if (V.Var) {
8881 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8882 if (!IsInteger)
8883 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8884 assert(OldValue->getType() == V.ElemTy &&
8885 "OldValue and V must be of same type");
8886 if (IsPostfixUpdate) {
8887 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8888 } else {
8889 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8890 if (IsFailOnly) {
8891 // CurBB----
8892 // | |
8893 // v |
8894 // ContBB |
8895 // | |
8896 // v |
8897 // ExitBB <-
8898 //
8899 // where ContBB only contains the store of old value to 'v'.
8900 BasicBlock *CurBB = Builder.GetInsertBlock();
8901 Instruction *CurBBTI = CurBB->getTerminator();
8902 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8903 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8904 CurBBTI, X.Var->getName() + ".atomic.exit");
8905 BasicBlock *ContBB = CurBB->splitBasicBlock(
8906 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8907 ContBB->getTerminator()->eraseFromParent();
8908 CurBB->getTerminator()->eraseFromParent();
8909
8910 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8911
8912 Builder.SetInsertPoint(ContBB);
8913 Builder.CreateStore(OldValue, V.Var);
8914 Builder.CreateBr(ExitBB);
8915
8916 if (UnreachableInst *ExitTI =
8917 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8918 CurBBTI->eraseFromParent();
8919 Builder.SetInsertPoint(ExitBB);
8920 } else {
8921 Builder.SetInsertPoint(ExitTI);
8922 }
8923 } else {
8924 Value *CapturedValue =
8925 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8926 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8927 }
8928 }
8929 }
8930 // The comparison result has to be stored.
8931 if (R.Var) {
8932 assert(R.Var->getType()->isPointerTy() &&
8933 "r.var must be of pointer type");
8934 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8935
8936 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8937 Value *ResultCast = R.IsSigned
8938 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8939 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8940 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8941 }
8942 } else {
8943 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8944 "Op should be either max or min at this point");
8945 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8946
8947 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8948 // Let's take max as example.
8949 // OpenMP form:
8950 // x = x > expr ? expr : x;
8951 // LLVM form:
8952 // *ptr = *ptr > val ? *ptr : val;
8953 // We need to transform to LLVM form.
8954 // x = x <= expr ? x : expr;
8955 AtomicRMWInst::BinOp NewOp;
8956 if (IsXBinopExpr) {
8957 if (IsInteger) {
8958 if (X.IsSigned)
8959 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8960 : AtomicRMWInst::Max;
8961 else
8962 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8963 : AtomicRMWInst::UMax;
8964 } else {
8965 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8966 : AtomicRMWInst::FMax;
8967 }
8968 } else {
8969 if (IsInteger) {
8970 if (X.IsSigned)
8971 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8972 : AtomicRMWInst::Min;
8973 else
8974 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8975 : AtomicRMWInst::UMin;
8976 } else {
8977 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8978 : AtomicRMWInst::FMin;
8979 }
8980 }
8981
8982 AtomicRMWInst *OldValue =
8983 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8984 if (V.Var) {
8985 Value *CapturedValue = nullptr;
8986 if (IsPostfixUpdate) {
8987 CapturedValue = OldValue;
8988 } else {
8989 CmpInst::Predicate Pred;
8990 switch (NewOp) {
8991 case AtomicRMWInst::Max:
8992 Pred = CmpInst::ICMP_SGT;
8993 break;
8994 case AtomicRMWInst::UMax:
8995 Pred = CmpInst::ICMP_UGT;
8996 break;
8997 case AtomicRMWInst::FMax:
8998 Pred = CmpInst::FCMP_OGT;
8999 break;
9000 case AtomicRMWInst::Min:
9001 Pred = CmpInst::ICMP_SLT;
9002 break;
9003 case AtomicRMWInst::UMin:
9004 Pred = CmpInst::ICMP_ULT;
9005 break;
9006 case AtomicRMWInst::FMin:
9007 Pred = CmpInst::FCMP_OLT;
9008 break;
9009 default:
9010 llvm_unreachable("unexpected comparison op");
9011 }
9012 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9013 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9014 }
9015 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9016 }
9017 }
9018
9019 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9020
9021 return Builder.saveIP();
9022}
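// Example (illustrative): `x = x > e ? e : x` on signed i32 (Op == MAX,
// IsXBinopExpr) lowers to `atomicrmw min ptr %x, i32 %e monotonic`, matching
// the ordop reversal described above.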
9023
9024OpenMPIRBuilder::InsertPointOrErrorTy
9025OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9026 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9027 Value *NumTeamsUpper, Value *ThreadLimit,
9028 Value *IfExpr) {
9029 if (!updateToLocation(Loc))
9030 return InsertPointTy();
9031
9032 uint32_t SrcLocStrSize;
9033 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9034 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9035 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9036
9037 // Outer allocation basicblock is the entry block of the current function.
9038 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9039 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9040 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9041 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9042 }
9043
9044 // The current basic block is split into four basic blocks. After outlining,
9045 // they will be mapped as follows:
9046 // ```
9047 // def current_fn() {
9048 // current_basic_block:
9049 // br label %teams.exit
9050 // teams.exit:
9051 // ; instructions after teams
9052 // }
9053 //
9054 // def outlined_fn() {
9055 // teams.alloca:
9056 // br label %teams.body
9057 // teams.body:
9058 // ; instructions within teams body
9059 // }
9060 // ```
9061 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9062 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9063 BasicBlock *AllocaBB =
9064 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9065
9066 bool SubClausesPresent =
9067 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9068 // Push num_teams
9069 if (!Config.isTargetDevice() && SubClausesPresent) {
9070 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9071 "if lowerbound is non-null, then upperbound must also be non-null "
9072 "for bounds on num_teams");
9073
9074 if (NumTeamsUpper == nullptr)
9075 NumTeamsUpper = Builder.getInt32(0);
9076
9077 if (NumTeamsLower == nullptr)
9078 NumTeamsLower = NumTeamsUpper;
9079
9080 if (IfExpr) {
9081 assert(IfExpr->getType()->isIntegerTy() &&
9082 "argument to if clause must be an integer value");
9083
9084 // upper = ifexpr ? upper : 1
9085 if (IfExpr->getType() != Int1)
9086 IfExpr = Builder.CreateICmpNE(IfExpr,
9087 ConstantInt::get(IfExpr->getType(), 0));
9088 NumTeamsUpper = Builder.CreateSelect(
9089 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9090
9091 // lower = ifexpr ? lower : 1
9092 NumTeamsLower = Builder.CreateSelect(
9093 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9094 }
9095
9096 if (ThreadLimit == nullptr)
9097 ThreadLimit = Builder.getInt32(0);
9098
9099 Value *ThreadNum = getOrCreateThreadID(Ident);
9100 Builder.CreateCall(
9101 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9102 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9103 }
9104 // Generate the body of teams.
9105 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9106 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9107 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9108 return Err;
9109
9110 OutlineInfo OI;
9111 OI.EntryBB = AllocaBB;
9112 OI.ExitBB = ExitBB;
9113 OI.OuterAllocaBB = &OuterAllocaBB;
9114
9115 // Insert fake values for global tid and bound tid.
9116 SmallVector<Instruction *, 8> ToBeDeleted;
9117 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9118 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9119 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9120 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9121 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9122
9123 auto HostPostOutlineCB = [this, Ident,
9124 ToBeDeleted](Function &OutlinedFn) mutable {
9125 // The stale call instruction will be replaced with a new call instruction
9126 // that invokes the runtime with the outlined function.
9127
9128 assert(OutlinedFn.getNumUses() == 1 &&
9129 "there must be a single user for the outlined function");
9130 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9131 ToBeDeleted.push_back(StaleCI);
9132
9133 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9134 "Outlined function must have two or three arguments only");
9135
9136 bool HasShared = OutlinedFn.arg_size() == 3;
9137
9138 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9139 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9140 if (HasShared)
9141 OutlinedFn.getArg(2)->setName("data");
9142
9143 // Call to the runtime function for teams in the current function.
9144 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9145 "outlined function.");
9146 Builder.SetInsertPoint(StaleCI);
9147 SmallVector<Value *> Args = {
9148 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9149 if (HasShared)
9150 Args.push_back(StaleCI->getArgOperand(2));
9151 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9152 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9153 Args);
9154
9155 for (Instruction *I : llvm::reverse(ToBeDeleted))
9156 I->eraseFromParent();
9157 };
9158
9159 if (!Config.isTargetDevice())
9160 OI.PostOutlineCB = HostPostOutlineCB;
9161
9162 addOutlineInfo(std::move(OI));
9163
9164 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9165
9166 return Builder.saveIP();
9167}
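// Host-side lowering (sketch): after outlining, the teams region becomes
//   call void @__kmpc_push_num_teams_51(ptr @ident, i32 %tid, i32 %lb,
//                                       i32 %ub, i32 %thread_limit)
//   call void @__kmpc_fork_teams(ptr @ident, i32 <nargs>, ptr @outlined_fn, ...)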
9168
9169GlobalVariable *
9170OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
9171 std::string VarName) {
9172 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
9173 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
9174 Names.size()),
9175 Names);
9176 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
9177 M, MapNamesArrayInit->getType(),
9178 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
9179 VarName);
9180 return MapNamesArrayGlobal;
9181}
9182
9183// Create all simple and struct types exposed by the runtime and remember
9184// the llvm::PointerTypes of them for easy access later.
9185void OpenMPIRBuilder::initializeTypes(Module &M) {
9186 LLVMContext &Ctx = M.getContext();
9187 StructType *T;
9188#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
9189#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
9190 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
9191 VarName##PtrTy = PointerType::getUnqual(Ctx);
9192#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
9193 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
9194 VarName##Ptr = PointerType::getUnqual(Ctx);
9195#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
9196 T = StructType::getTypeByName(Ctx, StructName); \
9197 if (!T) \
9198 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
9199 VarName = T; \
9200 VarName##Ptr = PointerType::getUnqual(Ctx);
9201#include "llvm/Frontend/OpenMP/OMPKinds.def"
9202}
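// For example (illustrative): the OMP_STRUCT_TYPE expansion for ident_t looks
// up %struct.ident_t in the context, creates it if absent, and caches both the
// struct type and an opaque pointer type for it.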
9203
9204void OpenMPIRBuilder::OutlineInfo::collectBlocks(
9205 SmallPtrSetImpl<BasicBlock *> &BlockSet,
9206 SmallVectorImpl<BasicBlock *> &BlockVector) {
9207 SmallVector<BasicBlock *, 32> Worklist;
9208 BlockSet.insert(EntryBB);
9209 BlockSet.insert(ExitBB);
9210
9211 Worklist.push_back(EntryBB);
9212 while (!Worklist.empty()) {
9213 BasicBlock *BB = Worklist.pop_back_val();
9214 BlockVector.push_back(BB);
9215 for (BasicBlock *SuccBB : successors(BB))
9216 if (BlockSet.insert(SuccBB).second)
9217 Worklist.push_back(SuccBB);
9218 }
9219}
9220
9221void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
9222 uint64_t Size, int32_t Flags,
9223 GlobalValue::LinkageTypes,
9224 StringRef Name) {
9225 if (!Config.isGPU()) {
9226 llvm::offloading::emitOffloadingEntry(
9227 M, object::OffloadKind::OFK_OpenMP, ID,
9228 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
9229 "omp_offloading_entries");
9230 return;
9231 }
9232 // TODO: Add support for global variables on the device after declare target
9233 // support.
9234 Function *Fn = dyn_cast<Function>(Addr);
9235 if (!Fn)
9236 return;
9237
9238 // Add a function attribute for the kernel.
9239 Fn->addFnAttr("kernel");
9240 if (T.isAMDGCN())
9241 Fn->addFnAttr("uniform-work-group-size", "true");
9242 Fn->addFnAttr(Attribute::MustProgress);
9243}
9244
9245 // We only generate metadata for functions that contain target regions.
9246void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
9247 EmitMetadataErrorReportFunctionTy &ErrorFn) {
9248
9249 // If there are no entries, we don't need to do anything.
9250 if (OffloadInfoManager.empty())
9251 return;
9252
9253 LLVMContext &C = M.getContext();
9254 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
9255 TargetRegionEntryInfo>,
9256 16>
9257 OrderedEntries(OffloadInfoManager.size());
9258
9259 // Auxiliary methods to create metadata values and strings.
9260 auto &&GetMDInt = [this](unsigned V) {
9261 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
9262 };
9263
9264 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
9265
9266 // Create the offloading info metadata node.
9267 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
9268 auto &&TargetRegionMetadataEmitter =
9269 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
9270 const TargetRegionEntryInfo &EntryInfo,
9272 // Generate metadata for target regions. Each entry of this metadata
9273 // contains:
9274 // - Entry 0 -> Kind of this type of metadata (0).
9275 // - Entry 1 -> Device ID of the file where the entry was identified.
9276 // - Entry 2 -> File ID of the file where the entry was identified.
9277 // - Entry 3 -> Mangled name of the function where the entry was
9278 // identified.
9279 // - Entry 4 -> Line in the file where the entry was identified.
9280 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
9281 // - Entry 6 -> Order the entry was created.
9282 // The first element of the metadata node is the kind.
9283 Metadata *Ops[] = {
9284 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
9285 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
9286 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
9287 GetMDInt(E.getOrder())};
9288
9289 // Save this entry in the right position of the ordered entries array.
9290 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
9291
9292 // Add metadata to the named metadata node.
9293 MD->addOperand(MDNode::get(C, Ops));
9294 };
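// As a sketch, one target-region operand of !omp_offload.info produced by the
// emitter above might read (all values illustrative):
//
//   !omp_offload.info = !{!0, ...}
//   !0 = !{i32 0, i32 42, i32 7, !"_Z3foov", i32 12, i32 0, i32 0}
//
// i.e. kind 0, device ID 42, file ID 7, parent "_Z3foov", line 12, count 0,
// order 0.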
9295
9296 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
9297
9298 // Create a function that emits metadata for each device global variable entry.
9299 auto &&DeviceGlobalVarMetadataEmitter =
9300 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
9301 StringRef MangledName,
9303 // Generate metadata for global variables. Each entry of this metadata
9304 // contains:
9305 // - Entry 0 -> Kind of this type of metadata (1).
9306 // - Entry 1 -> Mangled name of the variable.
9307 // - Entry 2 -> Declare target kind.
9308 // - Entry 3 -> Order the entry was created.
9309 // The first element of the metadata node is the kind.
9310 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
9311 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
9312
9313 // Save this entry in the right position of the ordered entries array.
9314 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
9315 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
9316
9317 // Add metadata to the named metadata node.
9318 MD->addOperand(MDNode::get(C, Ops));
9319 };
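// The corresponding sketch for a device global variable entry (values
// illustrative):
//
//   !1 = !{i32 1, !"my_global", i32 0, i32 1}
//
// i.e. kind 1, mangled name "my_global", declare-target flags 0, order 1.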
9320
9322 DeviceGlobalVarMetadataEmitter);
9323
9324 for (const auto &E : OrderedEntries) {
9325 assert(E.first && "All ordered entries must exist!");
9326 if (const auto *CE =
9327 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
9328 E.first)) {
9329 if (!CE->getID() || !CE->getAddress()) {
9330 // Do not blame the entry if the parent function is not emitted.
9331 TargetRegionEntryInfo EntryInfo = E.second;
9332 StringRef FnName = EntryInfo.ParentName;
9333 if (!M.getNamedValue(FnName))
9334 continue;
9335 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
9336 continue;
9337 }
9338 createOffloadEntry(CE->getID(), CE->getAddress(),
9339 /*Size=*/0, CE->getFlags(),
9341 } else if (const auto *CE = dyn_cast<
9343 E.first)) {
9346 CE->getFlags());
9347 switch (Flags) {
9351 continue;
9352 if (!CE->getAddress()) {
9353 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
9354 continue;
9355 }
9356 // The variable has no definition - no need to add the entry.
9357 if (CE->getVarSize() == 0)
9358 continue;
9359 break;
9361 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
9362 (!Config.isTargetDevice() && CE->getAddress())) &&
9363 "Declaret target link address is set.");
9364 if (Config.isTargetDevice())
9365 continue;
9366 if (!CE->getAddress()) {
9368 continue;
9369 }
9370 break;
9371 default:
9372 break;
9373 }
9374
9375 // Hidden or internal symbols on the device are not externally visible.
9376 // We should not attempt to register them by creating an offloading
9377 // entry. Indirect variables are handled separately on the device.
9378 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
9379 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
9381 continue;
9382
9383 // Indirect globals need to use a special name that doesn't match the name
9384 // of the associated host global.
9386 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9387 Flags, CE->getLinkage(), CE->getVarName());
9388 else
9389 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9390 Flags, CE->getLinkage());
9391
9392 } else {
9393 llvm_unreachable("Unsupported entry kind.");
9394 }
9395 }
9396
9397 // Emit requires directive globals to a special entry so the runtime can
9398 // register them when the device image is loaded.
9399 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
9400 // entries should be redesigned to better suit this use-case.
9405 /*Name=*/"",
9407 Config.getRequiresFlags(), "omp_offloading_entries");
9408}
9409
9411 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
9412 unsigned FileID, unsigned Line, unsigned Count) {
9414 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
9415 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
9416 if (Count)
9417 OS << "_" << Count;
9418}
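// Assuming the usual "__omp_offloading_" KernelNamePrefix, an entry with
// DeviceID 0x66304, FileID 0x1b3, parent "foo" and line 12 would be named
// (illustrative):
//
//   __omp_offloading_66304_1b3_foo_l12
//
// with a "_<Count>" suffix appended only for nonzero counts.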
9419
9422 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
9424 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
9425 EntryInfo.Line, NewCount);
9426}
9427
9430 StringRef ParentName) {
9432 auto FileIDInfo = CallBack();
9433 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
9434 report_fatal_error(("Unable to get unique ID for file, during "
9435 "getTargetEntryUniqueInfo, error message: " +
9436 EC.message())
9437 .c_str());
9438 }
9439
9440 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
9441 std::get<1>(FileIDInfo));
9442}
9443
9445 unsigned Offset = 0;
9446 for (uint64_t Remain =
9447 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9449 !(Remain & 1); Remain = Remain >> 1)
9450 Offset++;
9451 return Offset;
9452}
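// The loop above counts the trailing zero bits of OMP_MAP_MEMBER_OF; assuming
// the current 0xFFFF000000000000 encoding of that mask, the returned offset
// is 48.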
9453
9456 // Shift (Position + 1) left by getFlagMemberOffset() bits.
9457 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
9458 << getFlagMemberOffset());
9459}
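// Under the same encoding, getMemberOfFlag(0) yields 0x0001000000000000,
// i.e. a MEMBER_OF field of 1; the field is presumably 1-based so that a
// value of 0 can mean "not a member".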
9460
9463 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
9464 // If the entry is PTR_AND_OBJ but has not been marked with the special
9465 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
9466 // marked as MEMBER_OF.
9467 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9469 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9472 return;
9473
9474 // Reset the placeholder value to prepare the flag for the assignment of the
9475 // proper MEMBER_OF value.
9476 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
9477 Flags |= MemberOfFlag;
9478}
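// Sketch with illustrative values: for Flags == OMP_MAP_PTR_AND_OBJ with the
// MEMBER_OF field holding the 0xFFFF placeholder, and MemberOfFlag ==
// getMemberOfFlag(2), the placeholder is cleared and the MEMBER_OF field ends
// up holding 3 (the 1-based encoding of position 2). Any other PTR_AND_OBJ
// entry is returned unchanged.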
9479
9483 bool IsDeclaration, bool IsExternallyVisible,
9484 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9485 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9486 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
9487 std::function<Constant *()> GlobalInitializer,
9488 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
9489 // TODO: convert this to utilise the IRBuilder Config rather than
9490 // a passed down argument.
9491 if (OpenMPSIMD)
9492 return nullptr;
9493
9496 CaptureClause ==
9499 SmallString<64> PtrName;
9500 {
9501 raw_svector_ostream OS(PtrName);
9502 OS << MangledName;
9503 if (!IsExternallyVisible)
9504 OS << format("_%x", EntryInfo.FileID);
9505 OS << "_decl_tgt_ref_ptr";
9506 }
9507
9508 Value *Ptr = M.getNamedValue(PtrName);
9509
9510 if (!Ptr) {
9511 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
9512 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
9513
9514 auto *GV = cast<GlobalVariable>(Ptr);
9515 GV->setLinkage(GlobalValue::WeakAnyLinkage);
9516
9517 if (!Config.isTargetDevice()) {
9518 if (GlobalInitializer)
9519 GV->setInitializer(GlobalInitializer());
9520 else
9521 GV->setInitializer(GlobalValue);
9522 }
9523
9525 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9526 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9527 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
9528 }
9529
9530 return cast<Constant>(Ptr);
9531 }
9532
9533 return nullptr;
9534}
9535
9539 bool IsDeclaration, bool IsExternallyVisible,
9540 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9541 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9542 std::vector<Triple> TargetTriple,
9543 std::function<Constant *()> GlobalInitializer,
9544 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9545 Constant *Addr) {
9547 (TargetTriple.empty() && !Config.isTargetDevice()))
9548 return;
9549
9551 StringRef VarName;
9552 int64_t VarSize;
9554
9556 CaptureClause ==
9560 VarName = MangledName;
9561 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9562
9563 if (!IsDeclaration)
9564 VarSize = divideCeil(
9566 else
9567 VarSize = 0;
9568 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9569
9570 // This is a workaround carried over from Clang which prevents undesired
9571 // optimisation of internal variables.
9572 if (Config.isTargetDevice() &&
9573 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9574 // Do not create a "ref-variable" if the original is not also available
9575 // on the host.
9577 return;
9578
9579 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9580
9581 if (!M.getNamedValue(RefName)) {
9582 Constant *AddrRef =
9583 getOrCreateInternalVariable(Addr->getType(), RefName);
9584 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9585 GvAddrRef->setConstant(true);
9586 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9587 GvAddrRef->setInitializer(Addr);
9588 GeneratedRefs.push_back(GvAddrRef);
9589 }
9590 }
9591 } else {
9594 else
9596
9597 if (Config.isTargetDevice()) {
9598 VarName = (Addr) ? Addr->getName() : "";
9599 Addr = nullptr;
9600 } else {
9602 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9603 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9604 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9605 VarName = (Addr) ? Addr->getName() : "";
9606 }
9607 VarSize = M.getDataLayout().getPointerSize();
9609 }
9610
9612 Flags, Linkage);
9613}
9614
9615/// Loads all the offload entries information from the host IR
9616/// metadata.
9618 // If we are in target mode, load the metadata from the host IR. This code has
9619 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9620
9622 if (!MD)
9623 return;
9624
9625 for (MDNode *MN : MD->operands()) {
9626 auto &&GetMDInt = [MN](unsigned Idx) {
9627 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9628 return cast<ConstantInt>(V->getValue())->getZExtValue();
9629 };
9630
9631 auto &&GetMDString = [MN](unsigned Idx) {
9632 auto *V = cast<MDString>(MN->getOperand(Idx));
9633 return V->getString();
9634 };
9635
9636 switch (GetMDInt(0)) {
9637 default:
9638 llvm_unreachable("Unexpected metadata!");
9639 break;
9642 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9643 /*DeviceID=*/GetMDInt(1),
9644 /*FileID=*/GetMDInt(2),
9645 /*Line=*/GetMDInt(4),
9646 /*Count=*/GetMDInt(5));
9648 /*Order=*/GetMDInt(6));
9649 break;
9650 }
9654 /*MangledName=*/GetMDString(1),
9656 /*Flags=*/GetMDInt(2)),
9657 /*Order=*/GetMDInt(3));
9658 break;
9659 }
9660 }
9661}
9662
9664 if (HostFilePath.empty())
9665 return;
9666
9667 auto Buf = MemoryBuffer::getFile(HostFilePath);
9668 if (std::error_code Err = Buf.getError()) {
9669 report_fatal_error(("error opening host file from host file path inside of "
9670 "OpenMPIRBuilder: " +
9671 Err.message())
9672 .c_str());
9673 }
9674
9675 LLVMContext Ctx;
9677 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9678 if (std::error_code Err = M.getError()) {
9680 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9681 .c_str());
9682 }
9683
9684 loadOffloadInfoMetadata(*M.get());
9685}
9686
9687//===----------------------------------------------------------------------===//
9688// OffloadEntriesInfoManager
9689//===----------------------------------------------------------------------===//
9690
9692 return OffloadEntriesTargetRegion.empty() &&
9693 OffloadEntriesDeviceGlobalVar.empty();
9694}
9695
9696unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9697 const TargetRegionEntryInfo &EntryInfo) const {
9698 auto It = OffloadEntriesTargetRegionCount.find(
9699 getTargetRegionEntryCountKey(EntryInfo));
9700 if (It == OffloadEntriesTargetRegionCount.end())
9701 return 0;
9702 return It->second;
9703}
9704
9705void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9706 const TargetRegionEntryInfo &EntryInfo) {
9707 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9708 EntryInfo.Count + 1;
9709}
9710
9711/// Initialize target region entry.
9713 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9714 OffloadEntriesTargetRegion[EntryInfo] =
9715 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9716 OMPTargetRegionEntryTargetRegion);
9717 ++OffloadingEntriesNum;
9718}
9719
9723 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9724
9725 // Update the EntryInfo with the next available count for this location.
9726 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9727
9728 // If we are emitting code for a target, the entry is already initialized;
9729 // it only has to be registered.
9730 if (OMPBuilder->Config.isTargetDevice()) {
9731 // This could happen if the device compilation is invoked standalone.
9732 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9733 return;
9734 }
9735 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9736 Entry.setAddress(Addr);
9737 Entry.setID(ID);
9738 Entry.setFlags(Flags);
9739 } else {
9741 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9742 return;
9743 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9744 "Target region entry already registered!");
9745 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9746 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9747 ++OffloadingEntriesNum;
9748 }
9749 incrementTargetRegionEntryInfoCount(EntryInfo);
9750}
9751
9753 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9754
9755 // Update the EntryInfo with the next available count for this location.
9756 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9757
9758 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9759 if (It == OffloadEntriesTargetRegion.end()) {
9760 return false;
9761 }
9762 // Fail if this entry is already registered.
9763 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9764 return false;
9765 return true;
9766}
9767
9769 const OffloadTargetRegionEntryInfoActTy &Action) {
9770 // Scan all target region entries and perform the provided action.
9771 for (const auto &It : OffloadEntriesTargetRegion) {
9772 Action(It.first, It.second);
9773 }
9774}
9775
9777 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9778 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9779 ++OffloadingEntriesNum;
9780}
9781
9783 StringRef VarName, Constant *Addr, int64_t VarSize,
9785 if (OMPBuilder->Config.isTargetDevice()) {
9786 // This could happen if the device compilation is invoked standalone.
9787 if (!hasDeviceGlobalVarEntryInfo(VarName))
9788 return;
9789 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9790 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9791 if (Entry.getVarSize() == 0) {
9792 Entry.setVarSize(VarSize);
9793 Entry.setLinkage(Linkage);
9794 }
9795 return;
9796 }
9797 Entry.setVarSize(VarSize);
9798 Entry.setLinkage(Linkage);
9799 Entry.setAddress(Addr);
9800 } else {
9801 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9802 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9803 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9804 "Entry not initialized!");
9805 if (Entry.getVarSize() == 0) {
9806 Entry.setVarSize(VarSize);
9807 Entry.setLinkage(Linkage);
9808 }
9809 return;
9810 }
9812 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9813 Addr, VarSize, Flags, Linkage,
9814 VarName.str());
9815 else
9816 OffloadEntriesDeviceGlobalVar.try_emplace(
9817 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9818 ++OffloadingEntriesNum;
9819 }
9820}
9821
9824 // Scan all device global variable entries and perform the provided action.
9825 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9826 Action(E.getKey(), E.getValue());
9827}
9828
9829//===----------------------------------------------------------------------===//
9830// CanonicalLoopInfo
9831//===----------------------------------------------------------------------===//
9832
9833void CanonicalLoopInfo::collectControlBlocks(
9835 // We only count those BBs as control blocks for which we do not need to
9836 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9837 // flow. For consistency, this also means we do not add the Body block, which
9838 // is just the entry to the body code.
9839 BBs.reserve(BBs.size() + 6);
9840 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9841}
9842
9844 assert(isValid() && "Requires a valid canonical loop");
9845 for (BasicBlock *Pred : predecessors(Header)) {
9846 if (Pred != Latch)
9847 return Pred;
9848 }
9849 llvm_unreachable("Missing preheader");
9850}
9851
9852void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9853 assert(isValid() && "Requires a valid canonical loop");
9854
9855 Instruction *CmpI = &getCond()->front();
9856 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9857 CmpI->setOperand(1, TripCount);
9858
9859#ifndef NDEBUG
9860 assertOK();
9861#endif
9862}
9863
9864void CanonicalLoopInfo::mapIndVar(
9865 llvm::function_ref<Value *(Instruction *)> Updater) {
9866 assert(isValid() && "Requires a valid canonical loop");
9867
9868 Instruction *OldIV = getIndVar();
9869
9870 // Record all uses excluding those introduced by the updater. Uses by the
9871 // CanonicalLoopInfo itself to keep track of the number of iterations are
9872 // excluded.
9873 SmallVector<Use *> ReplacableUses;
9874 for (Use &U : OldIV->uses()) {
9875 auto *User = dyn_cast<Instruction>(U.getUser());
9876 if (!User)
9877 continue;
9878 if (User->getParent() == getCond())
9879 continue;
9880 if (User->getParent() == getLatch())
9881 continue;
9882 ReplacableUses.push_back(&U);
9883 }
9884
9885 // Run the updater that may introduce new uses
9886 Value *NewIV = Updater(OldIV);
9887
9888 // Replace the old uses with the value returned by the updater.
9889 for (Use *U : ReplacableUses)
9890 U->set(NewIV);
9891
9892#ifndef NDEBUG
9893 assertOK();
9894#endif
9895}
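// A typical use, sketched (the Start/Step names are illustrative): rebase the
// logical induction variable onto a strided physical IV.
//
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(Start, Builder.CreateMul(OldIV, Step));
//   });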
9896
9898#ifndef NDEBUG
9899 // No constraints if this object currently does not describe a loop.
9900 if (!isValid())
9901 return;
9902
9903 BasicBlock *Preheader = getPreheader();
9904 BasicBlock *Body = getBody();
9905 BasicBlock *After = getAfter();
9906
9907 // Verify standard control-flow we use for OpenMP loops.
9908 assert(Preheader);
9909 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9910 "Preheader must terminate with unconditional branch");
9911 assert(Preheader->getSingleSuccessor() == Header &&
9912 "Preheader must jump to header");
9913
9914 assert(Header);
9915 assert(isa<BranchInst>(Header->getTerminator()) &&
9916 "Header must terminate with unconditional branch");
9917 assert(Header->getSingleSuccessor() == Cond &&
9918 "Header must jump to exiting block");
9919
9920 assert(Cond);
9921 assert(Cond->getSinglePredecessor() == Header &&
9922 "Exiting block only reachable from header");
9923
9924 assert(isa<BranchInst>(Cond->getTerminator()) &&
9925 "Exiting block must terminate with conditional branch");
9926 assert(size(successors(Cond)) == 2 &&
9927 "Exiting block must have two successors");
9928 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9929 "Exiting block's first successor jump to the body");
9930 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9931 "Exiting block's second successor must exit the loop");
9932
9933 assert(Body);
9934 assert(Body->getSinglePredecessor() == Cond &&
9935 "Body only reachable from exiting block");
9936 assert(!isa<PHINode>(Body->front()));
9937
9938 assert(Latch);
9939 assert(isa<BranchInst>(Latch->getTerminator()) &&
9940 "Latch must terminate with unconditional branch");
9941 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9942 // TODO: To support simple redirecting of the end of the body code when it
9943 // has multiple predecessors, introduce another auxiliary basic block like preheader and after.
9944 assert(Latch->getSinglePredecessor() != nullptr);
9945 assert(!isa<PHINode>(Latch->front()));
9946
9947 assert(Exit);
9948 assert(isa<BranchInst>(Exit->getTerminator()) &&
9949 "Exit block must terminate with unconditional branch");
9950 assert(Exit->getSingleSuccessor() == After &&
9951 "Exit block must jump to after block");
9952
9953 assert(After);
9954 assert(After->getSinglePredecessor() == Exit &&
9955 "After block only reachable from exit block");
9956 assert(After->empty() || !isa<PHINode>(After->front()));
9957
9958 Instruction *IndVar = getIndVar();
9959 assert(IndVar && "Canonical induction variable not found?");
9960 assert(isa<IntegerType>(IndVar->getType()) &&
9961 "Induction variable must be an integer");
9962 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9963 "Induction variable must be a PHI in the loop header");
9964 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9965 assert(
9966 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9967 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9968
9969 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9970 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9971 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9972 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9973 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9974 ->isOne());
9975
9976 Value *TripCount = getTripCount();
9977 assert(TripCount && "Loop trip count not found?");
9978 assert(IndVar->getType() == TripCount->getType() &&
9979 "Trip count and induction variable must have the same type");
9980
9981 auto *CmpI = cast<CmpInst>(&Cond->front());
9982 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9983 "Exit condition must be a signed less-than comparison");
9984 assert(CmpI->getOperand(0) == IndVar &&
9985 "Exit condition must compare the induction variable");
9986 assert(CmpI->getOperand(1) == TripCount &&
9987 "Exit condition must compare with the trip count");
9988#endif
9989}
9990
9991 void CanonicalLoopInfo::invalidate() {
9992 Header = nullptr;
9993 Cond = nullptr;
9994 Latch = nullptr;
9995 Exit = nullptr;
9996}
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
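A minimal sketch of CreateCondBr together with CreateIsNotNull (all names here are illustrative): test a flag and branch to one of two pre-created blocks:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
void emitGuard(IRBuilderBase &Builder, Value *Flag,
               BasicBlock *ThenBB, BasicBlock *ElseBB) {
  Value *Cond = Builder.CreateIsNotNull(Flag, "guard"); // Flag != 0
  Builder.CreateCondBr(Cond, ThenBB, ElseBB); // terminates the current block
}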
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1798
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1459
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2033
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1518
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1134
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1921
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1967
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1811
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1370
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1430
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2588
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1862
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2019
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1540
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2302
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:500
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2282
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:308
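The saveIP/restoreIP pair above is the usual way to emit code at a remote position and come back; a minimal sketch with illustrative names:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
void emitIntoOtherBlock(IRBuilderBase &Builder, BasicBlock *Other) {
  IRBuilderBase::InsertPoint Saved = Builder.saveIP(); // remember block + point
  Builder.SetInsertPoint(Other);                       // redirect to Other
  Builder.CreateRetVoid();                             // emit something there
  Builder.restoreIP(Saved);                            // resume where we left off
}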
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2583
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:583
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1499
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1562
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1447
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:677
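For the CreateMemCpy overload above, a sketch assuming 8-byte alignment on both sides (purely illustrative):
#include "llvm/IR/IRBuilder.h"
#include <cstdint>
using namespace llvm;
void copyBytes(IRBuilderBase &Builder, Value *Dst, Value *Src, uint64_t Bytes) {
  // Emits an llvm.memcpy intrinsic call; the alignments are MaybeAlign hints.
  Builder.CreateMemCpy(Dst, MaybeAlign(8), Src, MaybeAlign(8), Bytes);
}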
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2066
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:44
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2086
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
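The IRBuilder template is typically constructed positioned at the end of a block and then used to append instructions; a minimal sketch:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
Value *appendSum(BasicBlock *BB, Value *A, Value *B) {
  IRBuilder<> Builder(BB);               // insert at the end of BB
  return Builder.CreateAdd(A, B, "sum"); // named for readable IR
}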
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:511
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:426
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:508
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
Metadata node.
Definition: Metadata.h:1073
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1557
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1432
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
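A small sketch combining the MDString::get and MDTuple::get factories listed above to build a one-operand metadata tuple (names illustrative):
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;
MDNode *makeStringTuple(LLVMContext &Ctx, StringRef Tag) {
  Metadata *Ops[] = {MDString::get(Ctx, Tag)};
  return MDTuple::get(Ctx, Ops); // uniqued in Ctx; getDistinct would not be
}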
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
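Module-level named metadata is append-only through NamedMDNode; a sketch of the common pattern (the metadata name "example.tag" is made up):
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
using namespace llvm;
void appendModuleTag(Module &M, MDNode *Tag) {
  // Creates "!example.tag" on first use, returns the existing node afterwards.
  NamedMDNode *NMD = M.getOrInsertNamedMetadata("example.tag");
  NMD->addOperand(Tag);
}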
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
A tuple of MDNodes.
Definition: Metadata.h:1737
iterator_range< op_iterator > operands()
Definition: Metadata.h:1833
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:244
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:246
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:377
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:379
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:297
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:299
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:288
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:357
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalVarEntryLink
Mark the entry as a 'declare target link' entry.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'declare target to' entry.
Definition: OMPIRBuilder.h:359
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:433
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:93
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:189
StringRef separator() const
Definition: OMPIRBuilder.h:175
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:165
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:106
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:148
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:142
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:185
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:474
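The typical lifecycle of an OpenMPIRBuilder, sketched under the assumption of a pre-existing Module: construct, initialize, emit directives through the generator methods listed below, then finalize to run the deferred outlining:
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;
void buildOpenMPIR(Module &M) {
  OpenMPIRBuilder OMPBuilder(M);
  OMPBuilder.initialize(); // set up runtime struct types and helpers
  // ... position OMPBuilder.Builder and call create* generators here ...
  OMPBuilder.finalize();   // outline the regions recorded along the way
}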
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
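A sketch of getOrCreateRuntimeFunction: look up the declaration of a runtime entry point by its RuntimeFunction ID and call it (the arguments are placeholders the caller must have prepared):
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;
void emitBarrierCall(OpenMPIRBuilder &OMPBuilder, Module &M,
                     IRBuilderBase &Builder, Value *Ident, Value *ThreadID) {
  FunctionCallee Fn =
      OMPBuilder.getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_barrier);
  Builder.CreateCall(Fn, {Ident, ThreadID}); // void __kmpc_barrier(ident_t*, i32)
}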
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for '#omp task'.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:543
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr. Only scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function; returns the FunctionID.
GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X. Only scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for an OpenMP 'if' clause using the specified BodyGenCallbackTy. Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (only scalar data types): cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs (only scalar data types): V = X; X = X BinOp Expr,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:520
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
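createBarrier returns an InsertPointOrErrorTy; a sketch of the error-aware calling pattern (names illustrative):
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;
Error emitBarrier(OpenMPIRBuilder &OMPBuilder,
                  const OpenMPIRBuilder::LocationDescription &Loc) {
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();           // propagate generator failures
  OMPBuilder.Builder.restoreIP(*AfterIP); // continue after the barrier
  return Error::success();
}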
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
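A sketch of createCanonicalLoop with the Error-returning body callback; TripCount is assumed to be an already-computed integer Value:
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;
Expected<CanonicalLoopInfo *>
makeCountedLoop(OpenMPIRBuilder &OMPBuilder,
                const OpenMPIRBuilder::LocationDescription &Loc,
                Value *TripCount) {
  auto BodyGen = [](OpenMPIRBuilder::InsertPointTy CodeGenIP,
                    Value *IV) -> Error {
    // Emit the loop body at CodeGenIP; IV is the canonical induction variable.
    return Error::success();
  };
  return OMPBuilder.createCanonicalLoop(Loc, BodyGen, TripCount);
}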
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:523
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:670
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
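For example, split partitions around the first separator only:
#include "llvm/ADT/StringRef.h"
using namespace llvm;
void splitDemo() {
  auto [Head, Tail] = StringRef("a;b;c").split(';');
  // Head == "a", Tail == "b;c"; a missing separator yields ("a;b;c", "").
}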
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:996
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1054
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1064
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:130
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:146
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:946
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to New if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
Definition: CallingConv.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, StringRef SectionName, Constant *AuxAddr=nullptr)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:83
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expression users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
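A hedged sketch of splitBB via the InsertPoint overload above; splitHere is a hypothetical helper and "omp.cont" an arbitrary block name:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/IRBuilder.h"

// Split at the builder's current position, branch into the new block, and
// continue emitting there.
static void splitHere(llvm::IRBuilder<> &Builder) {
  llvm::BasicBlock *ContBB =
      llvm::splitBB(Builder.saveIP(), /*CreateBranch=*/true, "omp.cont");
  Builder.SetInsertPoint(ContBB, ContBB->getFirstInsertionPt());
}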
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
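A tiny sketch of these MathExtras helpers with illustrative constants (chunkCount and warpShift are hypothetical names):

#include "llvm/Support/MathExtras.h"

// divideCeil rounds up; Log2_32 is the floor log base 2.
static unsigned chunkCount(unsigned TripCount, unsigned ChunkSize) {
  return llvm::divideCeil(TripCount, ChunkSize); // divideCeil(1000, 128) == 8
}
static unsigned warpShift(unsigned WarpSize) {
  return llvm::Log2_32(WarpSize); // Log2_32(32) == 5
}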
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular function.
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:756
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user specified parameters.
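A sketch that calls gatherUnrollingPreferences with the signature above; defaultUnrollPrefs is a hypothetical wrapper, OptLevel 2 is an arbitrary choice, and all user overrides are left unset:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <optional>

// Gather target/default unroll parameters without profile info or
// user-supplied overrides.
static llvm::TargetTransformInfo::UnrollingPreferences
defaultUnrollPrefs(llvm::Loop *L, llvm::ScalarEvolution &SE,
                   const llvm::TargetTransformInfo &TTI,
                   llvm::OptimizationRemarkEmitter &ORE) {
  return llvm::gatherUnrollingPreferences(
      L, SE, TTI, /*BFI=*/nullptr, /*PSI=*/nullptr, ORE, /*OptLevel=*/2,
      std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
      std::nullopt);
}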
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
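A small sketch iterating both predecessors and successors of an IR block (dumpNeighbors is an illustrative name); the same free functions also accept MachineBasicBlock:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/Support/Debug.h"

// Print the names of all CFG neighbors of BB.
static void dumpNeighbors(llvm::BasicBlock *BB) {
  for (llvm::BasicBlock *Pred : llvm::predecessors(BB))
    llvm::dbgs() << "pred: " << Pred->getName() << "\n";
  for (llvm::BasicBlock *Succ : llvm::successors(BB))
    llvm::dbgs() << "succ: " << Succ->getName() << "\n";
}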
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the specified operands and indices.
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks; the deleted blocks must have no predecessors that are not being deleted themselves.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename, line, column, ...).
Definition: OMPIRBuilder.h:645
This structure contains combined information generated for mappable clauses, including base pointers, pointers, sizes, map types, and user-defined mappers.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entire region.
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel attributes.
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with which the kernel must be launched.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:203
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of the maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61