aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoman Gareev <gareevroman@gmail.com>2022-05-21 22:40:33 +0300
committerRoman Gareev <gareevroman@gmail.com>2022-08-07 13:10:32 +0300
commitb02c7e2b630a04701d12efd2376f25eff2767279 (patch)
tree1d57311eb996b0fe9665994a98622030d34efc8b
parent6bb51bf06214af3690af7034f4edeb265732c481 (diff)
[Polly] Generalize the pattern matching to the case of tensor contractions
The pattern matching optimization of Polly detects and optimizes dense general matrix-matrix multiplication. The generated code is close to high performance implementations of matrix-matrix multiplications, which are contained in manually tuned libraries. The described pattern matching optimization is a particular case of tensor contraction optimization, which was introduced in [1]. This patch generalizes the pattern matching to the case of tensor contractions using the form of data dependencies and memory accesses produced by tensor contractions [1]. Optimization of tensor contractions will be added in the next patch. Following the ideas introduced in [2], it will logically represent tensor contraction operands as matrix multiplication operands and use an approach for optimization of matrix-matrix multiplications. [1] - Gareev R., Grosser T., Kruse M. High-Performance Generalized Tensor Op­erations: A Compiler-Oriented Approach // ACM Transactions on Architec­ture and Code Optimization (TACO). 2018. Vol. 15, no. 3. P. 34:1–34:27. DOI: 10.1145/3235029. [2] - Matthews D. High-Performance Tensor Contraction without BLAS // SIAM Journal on Scientific Computing. 2018. Vol. 40, no. 1. P. C 1—C 24. DOI: 110.1137/16m108968x. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D114336
-rw-r--r--polly/include/polly/Support/ISLTools.h5
-rw-r--r--polly/lib/Support/ISLTools.cpp12
-rw-r--r--polly/lib/Transform/MatmulOptimizer.cpp812
-rw-r--r--polly/lib/Transform/ScheduleOptimizer.cpp36
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll32
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll108
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll9
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll64
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll64
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll84
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll84
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll94
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll64
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll65
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll79
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll65
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll56
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll10
21 files changed, 1734 insertions, 21 deletions
diff --git a/polly/include/polly/Support/ISLTools.h b/polly/include/polly/Support/ISLTools.h
index f7bc29495a9c..64f044071b55 100644
--- a/polly/include/polly/Support/ISLTools.h
+++ b/polly/include/polly/Support/ISLTools.h
@@ -523,6 +523,11 @@ isl::set subtractParams(isl::set Set, isl::set Params);
/// value. Otherwise, return NaN.
isl::val getConstant(isl::pw_aff PwAff, bool Max, bool Min);
+/// If the relation @p PwAff lies on a hyperplane where the given
+/// dimension @p Pos with the type @p Dim has a fixed value, then
+/// return that value. Otherwise return NaN.
+isl::val getConstant(isl::map Map, isl::dim Dim, int Pos);
+
/// Check that @p End is valid and return an iterator from @p Begin to @p End
///
/// Use case example:
diff --git a/polly/lib/Support/ISLTools.cpp b/polly/lib/Support/ISLTools.cpp
index 8b8c51aa202d..b6ac9bac819b 100644
--- a/polly/lib/Support/ISLTools.cpp
+++ b/polly/lib/Support/ISLTools.cpp
@@ -263,6 +263,18 @@ isl::map polly::shiftDim(isl::map Map, isl::dim Dim, int Pos, int Amount) {
}
}
+isl::val polly::getConstant(isl::map Map, isl::dim Dim, int Pos) {
+ unsigned NumDims = unsignedFromIslSize(Map.dim(Dim));
+ if (Pos < 0)
+ Pos = NumDims + Pos;
+ assert(unsigned(Pos) < NumDims && "Dimension index must be in range");
+ // TODO: The isl_map_plain_get_val_if_fixed function is not robust, since its
+ // result is different depending on the internal representation.
+ // Replace it with a different implementation.
+ return isl::manage(isl_map_plain_get_val_if_fixed(
+ Map.get(), static_cast<enum isl_dim_type>(Dim), Pos));
+}
+
isl::union_map polly::shiftDim(isl::union_map UMap, isl::dim Dim, int Pos,
int Amount) {
isl::union_map Result = isl::union_map::empty(UMap.ctx());
diff --git a/polly/lib/Transform/MatmulOptimizer.cpp b/polly/lib/Transform/MatmulOptimizer.cpp
index 3e0e06cef869..16f9541b9e07 100644
--- a/polly/lib/Transform/MatmulOptimizer.cpp
+++ b/polly/lib/Transform/MatmulOptimizer.cpp
@@ -13,10 +13,13 @@
#include "polly/ScopInfo.h"
#include "polly/ScopPass.h"
#include "polly/Simplify.h"
+#include "polly/Support/GICHelper.h"
#include "polly/Support/ISLTools.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
@@ -126,6 +129,24 @@ static cl::opt<int> PollyPatternMatchingNcQuotient(
"macro-kernel, by Nr, the parameter of the micro-kernel"),
cl::Hidden, cl::init(256), cl::cat(PollyCategory));
+static cl::opt<bool>
+ PMBasedTCOpts("polly-tc-opt",
+ cl::desc("Perform optimizations of tensor contractions based "
+ "on pattern matching"),
+ cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<bool>
+ PMBasedMMMOpts("polly-matmul-opt",
+ cl::desc("Perform optimizations of matrix multiplications "
+ "based on pattern matching"),
+ cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> OptComputeOut(
+ "polly-tc-dependences-computeout",
+ cl::desc("Bound the dependence analysis by a maximal amount of "
+ "computational steps (0 means no bound)"),
+ cl::Hidden, cl::init(500000), cl::ZeroOrMore, cl::cat(PollyCategory));
+
namespace {
/// Parameters of the micro kernel.
///
@@ -160,6 +181,83 @@ struct MatMulInfoTy {
int k = -1;
};
+/// Parameters of the tensor contraction operands.
+///
+/// A general d-dimensional tensor T ∈ R ^ Nu0 x ... x Nud−1 can be defined
+/// as the set of scalar elements indexed by the set of indices u0 ... ud,
+///
+/// T ≡ {Anu0...nud−1 ∈ R | (u0,...,ud−1) ∈ Nu0 x ... x Nud−1}.
+///
+/// Let A, B, and C be dA, dB, and dC-dimensional tensors, respectively.
+/// Let the free and the contracted indices of the tensor A be grouped into
+/// two bundles I = i0...ir−1 and P = p0...pt−1, respectively. Similarly,
+/// the free and the contracted indices of B are grouped into bundles
+/// J = j0..js−1 and P and the free indices of C are grouped into
+/// bundles I and J.
+///
+/// Tensor contraction (TC) of tensors A, B into tensor C can be represented as
+/// C(shuffle(I,J))=∑α·A(shuffle(I,P))·B(shuffle(P,J))+β·C(shuffle(I,J)),
+/// where ∑ is a summation over all contracted indices of P,
+/// α, β ∈ R, Npi is the length of the tensor dimension that corresponds
+/// to the index pi, A(shuffle(I, P)), B(shuffle(P, J)), C(shuffle(I, J)) are
+/// accesses to tensors A, B, C, respectively,
+/// shuffle(I, J), shuffle(I, P), and shuffle(P, J) are permutations of
+/// the enclosed indices.
+///
+/// Multiplication of C(shuffle(I,J)) by β can be moved into a different SCoP
+/// statement by loop distribution, which is done by the isl scheduler.
+// If β is not equal to one, the optimization of TC of Polly requires
+/// such a transformation.
+///
+/// TCInfoTy contains parameters, which describe access relations that represent
+/// operands of the tensor contraction.
+struct TCInfoTy {
+ /// @{
+ /// Memory accesses that represent reading from tensors, which are operands of
+ /// the tensor contraction.
+ MemoryAccess *A = nullptr;
+ MemoryAccess *B = nullptr;
+ /// @}
+
+ /// @{
+ /// Memory accesses that represent reading from and writing into the tensor,
+ /// which contains the result of the tensor contraction.
+ MemoryAccess *ReadFromC = nullptr;
+ MemoryAccess *WriteToC = nullptr;
+ /// @}
+
+ /// @{
+ /// Input dimensions of the schedule space, which represent free
+ /// indices of tensors.
+ SmallDenseSet<int> I;
+ SmallDenseSet<int> J;
+ /// @}
+
+ /// Input dimension of the schedule space, which represents contracted
+ /// indices of tensors.
+ SmallDenseSet<int> P;
+
+ /// @{
+ /// Sizes of tensor dimensions for corresponding input dimensions of
+ /// the schedule space. The size of the tensor dimension can be larger than
+ /// the size of the corresponding input dimension of the schedule space.
+ /// This does not correspond to a tensor contraction. However, such a pattern
+ /// will be optimized by the transformation.
+ SmallVector<int> DimensionSizes;
+ SmallVector<int> ADimensions;
+ SmallVector<int> BDimensions;
+ SmallVector<int> CDimensions;
+ /// @}
+
+ /// @{
+ /// Permutations of indices of I, J, and P, which describe operands of
+ /// the tensor contraction and its result.
+ SmallVector<int> OrderedI;
+ SmallVector<int> OrderedJ;
+ SmallVector<int> OrderedP;
+ /// @}
+};
+
/// Create an isl::union_set, which describes the option of the form
/// [isolate[] -> unroll[x]].
///
@@ -1007,11 +1105,7 @@ static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
MatMulInfoTy &MMI) {
auto PartialSchedule = isl::manage(
isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
- Node = Node.child(0);
- auto LeafType = isl_schedule_node_get_type(Node.get());
- Node = Node.parent();
- if (LeafType != isl_schedule_node_leaf ||
- isl_schedule_node_band_n_member(Node.get()) < 3 ||
+ if (isl_schedule_node_band_n_member(Node.get()) < 3 ||
Node.get_schedule_depth().release() != 0 ||
isl_union_map_n_map(PartialSchedule.get()) != 1)
return false;
@@ -1021,14 +1115,720 @@ static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
return false;
}
+/// Get the dimension size.
+///
+/// Return the size of the dimension @p Pos, which is obtained from @p SAI.
+/// Return -1 in the case of the first dimension of a multi-dimensional array,
+/// since the ScopArrayInfo class does not carry size information.
+///
+/// @param SAI The information about the array.
+/// @param Pos The position of the dimension.
+/// @return The size of the dimension.
+static int getDimSize(const ScopArrayInfo *SAI, unsigned Pos) {
+ if (Pos == 0)
+ return -1;
+ const llvm::SCEV *SCEVDimSize = SAI->getDimensionSize(Pos);
+ assert(SCEVDimSize);
+ auto *ConstantDimSize = dyn_cast<const SCEVConstant>(SCEVDimSize);
+ assert(ConstantDimSize);
+ auto *IntDimSize = dyn_cast<ConstantInt>(ConstantDimSize->getValue());
+ assert(IntDimSize);
+ return IntDimSize->getSExtValue();
+}
+
+/// Check whether the access relation has the specified form.
+///
+/// Check that the access relation @p AccMap has the form T[I0, …, In], where
+/// indexes I0, …, In are specified by @p Dimensions.
+///
+/// @param Domain The domain of the access relation.
+/// @param AccMap The access relation to be checked.
+/// @param Dimensions The permutation of the subset of the input dimensions.
+/// @return True if @p AccMap has the expected form and false,
+/// otherwise.
+static bool isCorrectAccessMap(isl::set Domain, isl::map AccMap,
+ ArrayRef<int> Dimensions) {
+ isl::space Space = AccMap.get_space();
+ if (unsignedFromIslSize(Space.dim(isl::dim::out)) != Dimensions.size())
+ return false;
+
+ // Create an access relation of the following form:
+ // [I0, …, Im] -> [Il, …, In], where indexes
+ // Il, …, In are specified by @p Dimensions.
+ isl::map PossibleTensor = isl::map::universe(Space);
+ unsigned DimInSize = unsignedFromIslSize(Space.dim(isl::dim::in));
+ for (unsigned i = 0; i < Dimensions.size(); i++) {
+ const int InPos = Dimensions[i];
+ if ((InPos >= static_cast<int>(DimInSize)) || (InPos < 0))
+ return false;
+ PossibleTensor =
+ PossibleTensor.equate(isl::dim::in, InPos, isl::dim::out, i);
+ }
+
+ AccMap = AccMap.intersect_domain(Domain);
+ PossibleTensor = PossibleTensor.intersect_domain(Domain);
+
+ // If AccMap != PossibleTensor here (the two maps have been gisted at
+ // this point), it means that the writes are not complete, or in other
+ // words, it is a Partial write and Partial writes must be rejected.
+ return AccMap.is_equal(PossibleTensor);
+}
+
+/// Check whether the access represents the tensor contraction operand.
+///
+/// Check that the access relation @p AccMap has the form T[i1, …, in].
+/// Obtained indexes i1, …, in, their sizes and their permutation are stored
+/// into @p IndexSet, @p DimensionSizes, and @p Dimensions, respectively.
+///
+/// @param Domain The domain of the access relation.
+/// @param AccMap The access relation to be checked.
+/// @param IndexSet The subset of the input dimensions.
+/// @param DimensionSizes Sizes of the input dimensions of @p Dimensions.
+/// @param Dimensions The permutation of the subset of the input dimensions.
+/// @return True if @p AccMap has the expected form and false,
+/// otherwise.
+static bool isTCOperandAcc(isl::set Domain, isl::map AccMap,
+ SmallDenseSet<int> &IndexSet,
+ SmallVectorImpl<int> &DimensionSizes,
+ SmallVectorImpl<int> &Dimensions) {
+ isl::id Id = AccMap.get_tuple_id(isl::dim::out);
+ const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
+ assert(SAI && "AccMap should represent memory access");
+
+ // Fix values of output dimensions with respect to their positions.
+ // In the case of the tensor contraction, values of output dimensions are
+ // fixed and form a permutation of a subset of values of input dimensions.
+ //
+ // For example, in the case of Stmt[i][j][k] -> A[k][i], which represents
+ // the operand of the tensor contraction, we get the following map by fixing
+ // the output dimensions Stmt[1][j][0] -> A[0][1].
+ //
+ // We store the permutation of the subset of the input dimensions {2, 0} into
+ // @p Dimensions.
+ //
+ // The obtained permutation and the isCorrectAccessMap function are used to
+ // check whether the access relation @p AccMap represents the tensor
+ // contraction operand. For example, in the case of
+ // Stmt[i][j][k] -> A[i-1][j+1], we get Stmt[1][0][k] -> A[0][1] and,
+ // consequently, {1, 0}, which is rejected by isCorrectAccessMap,
+ // since it corresponds to Stmt[i][j][k] -> A[j][i].
+ isl::map CheckMap = isl::manage(AccMap.copy());
+ unsigned OutDimNum = unsignedFromIslSize(CheckMap.dim(isl::dim::out));
+ for (unsigned i = 0; i < OutDimNum; i++)
+ CheckMap = CheckMap.fix_si(isl::dim::out, i, i);
+
+ // Try to obtain the permutation and sizes of corresponding input dimensions.
+ Dimensions.assign(OutDimNum, -1);
+ for (unsigned i : rangeIslSize(0, CheckMap.dim(isl::dim::in))) {
+ isl::val Val = getConstant(CheckMap, isl::dim::in, i);
+ if (!Val.is_int())
+ continue;
+ int OutPos = -1;
+ llvm::APInt ValAPInt = APIntFromVal(Val);
+ if (ValAPInt.isSignedIntN(32))
+ OutPos = ValAPInt.getSExtValue();
+ if ((OutPos < 0) || (OutPos >= static_cast<int>(OutDimNum)) ||
+ IndexSet.count(i))
+ return false;
+ IndexSet.insert(i);
+ Dimensions[OutPos] = i;
+ if (DimensionSizes[i] <= 0)
+ DimensionSizes[i] = getDimSize(SAI, OutPos);
+ }
+
+ return isCorrectAccessMap(Domain, AccMap, Dimensions);
+}
+
+/// Find the intersection of two sets.
+///
+/// Find the intersection of the set @p A and the set @p B.
+///
+/// @param A, B Sets to intersect.
+/// @return The set intersection.
+static SmallDenseSet<int> intersect(const SmallDenseSet<int> &A,
+ const SmallDenseSet<int> &B) {
+ SmallDenseSet<int> Intersection = A;
+ set_intersect(Intersection, B);
+ return Intersection;
+}
+
+/// Check whether the set is a superset.
+///
+/// Check that the set @p A is a superset of @p B.
+///
+/// @param A, B Sets to be checked.
+/// @return True if the set A is a superset of B.
+static bool isSuperset(const SmallDenseSet<int> &A,
+ const SmallDenseSet<int> &B) {
+ return intersect(A, B).size() == B.size();
+}
+
+/// Find the union of two sets.
+///
+/// Find the union of the set @p A and the set @p B.
+///
+/// @param A, B Sets to unite.
+/// @return The set union.
+static SmallDenseSet<int> unite(const SmallDenseSet<int> &A,
+ const SmallDenseSet<int> &B) {
+ SmallDenseSet<int> Union = A;
+ set_union(Union, B);
+ return Union;
+}
+
+/// Determine the access that writes to the tensor, which contains
+/// the result of the tensor contraction.
+///
+/// @param Domain The domain of the statement.
+/// @param Stmt The statement, which writes to memory.
+/// @param TCI The information about the tensor contraction.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @return The determined MemoryAccess, or nullptr if there is no necessary
+/// access within the SCoP.
+static MemoryAccess *getWriteAccess(isl::set Domain, ScopStmt *Stmt,
+ TCInfoTy &TCI,
+ SmallDenseSet<int> &IandJIndexSet) {
+ TCI.WriteToC = nullptr;
+ SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
+ for (MemoryAccess *MemA : reverse(Accesses)) {
+ // A TC-like does not contain write scalar memory accesses
+ if (!MemA->isLatestArrayKind())
+ return nullptr;
+ // The last memory access should be a write memory access.
+ if (!MemA->isWrite())
+ return nullptr;
+
+ isl::map AccMap = MemA->getLatestAccessRelation();
+ if (!isTCOperandAcc(Domain, AccMap, IandJIndexSet, TCI.DimensionSizes,
+ TCI.CDimensions))
+ return nullptr;
+
+ return MemA;
+ }
+ return nullptr;
+}
+
+/// Determine an access, which reads elements of an operand of the tensor
+/// contraction
+///
+/// @param MemAccessPtr The access, which reads elements of the tensor.
+/// @param IndexSet The set, which contains indexes of the tensors.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @param Dimensions The permutation of the subset of the input dimensions.
+/// @param TCI The information about the tensor contraction.
+/// @return True if the memory access @p MemAccessPtr corresponds
+/// to the tensor contraction.
+static bool setReadAccess(MemoryAccess *MemAccessPtr,
+ const SmallDenseSet<int> &IndexSet,
+ const SmallDenseSet<int> &IandJIndexSet,
+ ArrayRef<int> Dimensions, TCInfoTy &TCI) {
+ if (!TCI.A) {
+ // Probably IndexSet is a union of I and P sets.
+ if (!isSuperset(IndexSet, TCI.P))
+ return false;
+
+ // Obtain the set I.
+ TCI.I = set_difference(IndexSet, TCI.P);
+ if (!isSuperset(IandJIndexSet, TCI.I))
+ return false;
+
+ // Obtain the set J.
+ TCI.J = set_difference(IandJIndexSet, TCI.I);
+
+ // Set the first operand of the tensor contraction.
+ TCI.A = MemAccessPtr;
+ llvm::replace(TCI.ADimensions, TCI.ADimensions.begin(),
+ TCI.ADimensions.end(), Dimensions.begin(), Dimensions.end());
+ return true;
+ }
+
+ if (!TCI.B) {
+ // IndexSet should be a union of J and P sets.
+ if (unite(TCI.P, TCI.J) != IndexSet)
+ return false;
+
+ // Set the second operand of the tensor contraction.
+ TCI.B = MemAccessPtr;
+ llvm::replace(TCI.BDimensions, TCI.BDimensions.begin(),
+ TCI.BDimensions.end(), Dimensions.begin(), Dimensions.end());
+ return true;
+ }
+
+ return false;
+}
+
+/// Check that all memory accesses of the statement, except from the last
+/// one, are read memory accesses, which read elements of operands of the tensor
+/// contraction and its result.
+///
+/// @param Domain The domain of the statement.
+/// @param Stmt The statement, which writes to memory.
+/// @param TCI The information about the tensor contraction.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @return True if all read memory accesses of the statement @p Stmt correspond
+/// to the tensor contraction.
+static bool setReadAccesses(isl::set Domain, ScopStmt *Stmt, TCInfoTy &TCI,
+ SmallDenseSet<int> &IandJIndexSet) {
+ TCI.A = nullptr;
+ TCI.B = nullptr;
+ TCI.ReadFromC = nullptr;
+ SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
+ for (auto *MemA = Accesses.begin(); *MemA != TCI.WriteToC; MemA++) {
+ MemoryAccess *MemAccessPtr = *MemA;
+
+ // All memory accesses, except from the last one, should be read memory
+ // accesses.
+ if (MemAccessPtr->isWrite())
+ return false;
+
+ isl::map AccMap = MemAccessPtr->getLatestAccessRelation();
+
+ if (!MemAccessPtr->isLatestArrayKind()) {
+ // Check whether the scalar read memory access is not partial.
+ if (!Domain.is_subset(AccMap.domain()))
+ return false;
+ continue;
+ return false;
+ }
+
+ // There is only one memory access, which reads elements of the result of
+ // the tensor contraction.
+ if (AccMap.is_equal(TCI.WriteToC->getLatestAccessRelation())) {
+ if (TCI.ReadFromC)
+ return false;
+ TCI.ReadFromC = MemAccessPtr;
+ continue;
+ }
+
+ SmallVector<int> Dimensions;
+ SmallDenseSet<int> IndexSet;
+ if (!isTCOperandAcc(Domain, AccMap, IndexSet, TCI.DimensionSizes,
+ Dimensions))
+ return false;
+
+ if (!setReadAccess(MemAccessPtr, IndexSet, IandJIndexSet, Dimensions, TCI))
+ return false;
+ }
+
+ // Check that there are read memory accesses, which read elements of operands
+ // of the tensor contraction and its result.
+ return TCI.ReadFromC && TCI.A && TCI.B;
+}
+
+/// Check accesses to operands of the tensor contraction.
+///
+/// Check that accesses of the SCoP statement, which corresponds to
+/// the partial schedule @p PartialSchedule, represent accesses
+/// to the non-scalar operands of the tensor contraction.
+///
+/// @param Domain The domain of the SCoP statement.
+/// @param PartialSchedule The partial schedule of the SCoP statement.
+/// @param TCI Parameters of the tensor contraction operands.
+/// @return True if the corresponding SCoP statement
+/// represents tensor contraction and false,
+/// otherwise.
+static bool containsOnlyTCAcc(isl::set Domain, isl::map PartialSchedule,
+ TCInfoTy &TCI) {
+ isl::id InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
+ ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+
+ // In region statements, the order of memory accesses execution is not
+ // predictable at compile-time.
+ if ((Stmt->size() <= 1) || Stmt->isRegionStmt())
+ return false;
+
+ unsigned DimNum = unsignedFromIslSize(PartialSchedule.dim(isl::dim::in));
+ TCI.DimensionSizes.resize(DimNum);
+ SmallDenseSet<int> IandJIndexSet;
+
+ TCI.WriteToC = getWriteAccess(Domain, Stmt, TCI, IandJIndexSet);
+ if (!TCI.WriteToC)
+ return false;
+
+ if (intersect(IandJIndexSet, TCI.P).size() != 0)
+ return false;
+
+ if (!setReadAccesses(Domain, Stmt, TCI, IandJIndexSet))
+ return false;
+
+ return true;
+}
+
+/// Check that dependency corresponds to the tensor contraction carried over
+/// loop dimension @p Dim.
+///
+/// Check that the dependency has the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement. For this purpose, we analyze the set @p DepDelta, which
+/// represents the differences between image elements and domain elements of
+/// the corresponding map.
+///
+/// @param DepDelta The set contains the differences between image elements
+/// and corresponding domain elements of the map, which
+/// represents the dependency.
+/// @param Dim The position of the index ki.
+/// @param BoundDeltas In the case of indexes of ki, the difference between
+/// image elements and corresponding domain elements
+/// corresponds to the difference between lexicographic
+/// minimum and lexicographic maximum of the corresponding
+/// dimension of the domain of the statement.
+/// @param IndexSet Obtained indexes ki, which describe the dependency.
+/// @return True if dependencies correspond to the tensor contraction
+/// and false, otherwise.
+static bool isReductionCarriedOverDim(isl::set DepDelta, unsigned Dim,
+ isl::pw_multi_aff BoundDeltas,
+ const SmallDenseSet<int> &IndexSet) {
+ isl::space Space = DepDelta.get_space();
+ isl::set Superset = isl::set::universe(Space);
+ for (unsigned i = 0; i < Dim; i += 1)
+ Superset = Superset.fix_si(isl::dim::set, i, 0);
+ Superset = Superset.fix_si(isl::dim::set, Dim, 1);
+
+ // Check that the difference between the image element and the domain element
+ // is equal to one in the case of the index ki. Image elements and
+ // corresponding domain elements should be equal in the case of positions,
+ // which are lower than the specified position.
+ if (!DepDelta.is_subset(Superset))
+ return false;
+
+ // Compute a set, which is used to analyze how values of
+ // the domain are related to the map that describes the dependency.
+ isl_pw_multi_aff *DepDeltaPW = isl_pw_multi_aff_from_set(DepDelta.copy());
+ BoundDeltas = BoundDeltas.add(isl::manage(DepDeltaPW));
+ isl_set *ComplementRawSet = isl_set_from_pw_multi_aff(BoundDeltas.release());
+ isl::set Complement = isl::manage(ComplementRawSet);
+
+ for (unsigned i : rangeIslSize(Dim + 1, DepDelta.dim(isl::dim::set))) {
+ if (!IndexSet.count(i)) {
+ // Check the difference between the image element and the domain element
+ // in the case of indexes, which do not describe the dependency.
+ if (DepDelta.plain_get_val_if_fixed(isl::dim::set, i).is_zero())
+ continue;
+ return false;
+ }
+
+ // In the case of other indexes, which describe the dependency,
+ // the difference between the image element and the domain element
+ // should be equal to the difference between lexicographic minimum and
+ // lexicographic maximum of the domain of the statement.
+ if (!Complement.plain_get_val_if_fixed(isl::dim::set, i).is_zero())
+ return false;
+ }
+
+ return true;
+}
+
+/// Check whether dependencies are over the complete domain.
+///
+/// In the case of the tensor contraction RAW, WAW, WAR dependencies
+/// have the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement. Consequently, the domain of the dependencies
+/// can be described as
+/// Domain / Domain ∩ S(…, max(kn),…) ∩ S(…, max(k(i + 1)),…),
+/// where Domain is the domain of the statement S.
+///
+/// For example, in the case of the following tensor contraction,
+/// corresponding domains will have the following form.
+///
+/// An example of the tensor contraction:
+/// for (i = 0; i < 1024; i++)
+/// for (j = 0; j < 1024; j++)
+/// for (l = 0; l < 64; ++l)
+/// for (w = 0; w < 64; ++w)
+/// C[i][j] += A[i][l][w] * B[w][j][l];
+///
+/// The domain of the statement:
+/// { S[i0, i1, i2, i3] : i0 >= 0 and i0 <= 1023 and
+/// i1 >= 0 and i1 <= 1023 and
+/// i2 >= 0 and i2 <= 63 and
+/// i3 >= 0 and i3 <= 63 }
+///
+/// The domain of the dependencies:
+/// { S[i0, i1, i2, i3] : (i0 >= 0 and i0 <= 1023 and
+/// i1 >= 0 and i1 <= 1023 and
+/// i2 >= 0 and i2 <= 63 and
+/// i3 >= 0 and i3 <= 62) or
+/// (i3 = 63 and i0 >= 0 and i0 <= 1023 and
+/// i1 >= 0 and i1 <= 1023 and
+/// i2 >= 0 and i2 <= 62) }
+///
+/// @param Domain The domain of the statement.
+/// @param DepsForStmt RAW and RED dependencies for the statement.
+/// @param UpperBound The lexicographic maximum of the elements in
+/// the @p Domain.
+/// @param IndexSet Obtained indexes ki, which describe the dependencies.
+/// @return True if dependencies are over the complete domain
+/// and false, otherwise.
+static bool areDepsOverCompleteDomain(isl::set Domain, isl::map DepsForStmt,
+ isl::pw_multi_aff UpperBound,
+ SmallDenseSet<int> &IndexSet) {
+ isl_set *UpperBoundRawSet = isl_set_from_pw_multi_aff(UpperBound.copy());
+ isl::set UpperBoundSet = isl::manage(UpperBoundRawSet);
+
+ isl::set DomainRed = isl::manage(Domain.copy());
+ for (const auto It : IndexSet) {
+ isl::val FixedVal = UpperBoundSet.plain_get_val_if_fixed(isl::dim::set, It);
+ if (FixedVal.is_nan())
+ return false;
+ DomainRed = isl::manage(
+ isl_set_fix_val(DomainRed.copy(), isl_dim_set, It, FixedVal.release()));
+ }
+ return DepsForStmt.domain().intersect(Domain).is_equal(
+ Domain.subtract(DomainRed));
+}
+
+/// Check that dependencies correspond to the tensor contraction.
+///
+/// Check that there are only true dependencies of the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement represented by @p Schedule. Such dependencies are produced by
+/// the tensor contraction. Obtained indexes ki are stored into @p IndexSet.
+///
+/// The form of anti and output dependencies is specified implicitly by
+/// the form the SCoP statement, which is checked by subsequent analysis.
+///
+/// @param Schedule The schedule of the SCoP statement.
+/// @param D The SCoP dependencies.
+/// @param Domain The domain of the statement.
+/// @param IndexSet Obtained indexes ki, which describe the dependencies.
+/// @return True if dependencies correspond to the tensor contraction
+/// and false, otherwise.
+static bool containsOnlyTcDeps(isl::map Schedule, const Dependences *D,
+ SmallDenseSet<int> &IndexSet, isl::set Domain) {
+ IslMaxOperationsGuard MaxOpGuard(Schedule.ctx().get(), OptComputeOut);
+
+ isl::union_map Dep =
+ D->getDependences(Dependences::TYPE_RAW | Dependences::TYPE_RED);
+
+ isl::space DomainSpace = Schedule.get_space().domain();
+ isl::space Space = DomainSpace.map_from_domain_and_range(DomainSpace);
+ isl::map DepsForStmt = Dep.extract_map(Space);
+ isl::set DepDeltas = DepsForStmt.deltas();
+ isl::size DeltasDimNum = DepDeltas.dim(isl::dim::set);
+ isl::pw_multi_aff LowerBound = Domain.lexmin_pw_multi_aff();
+ isl::pw_multi_aff UpperBound = Domain.lexmax_pw_multi_aff();
+ isl::pw_multi_aff BoundDeltas = UpperBound.sub(LowerBound);
+
+ for (int i : reverse(rangeIslSize(0, DeltasDimNum))) {
+ // In the case of the tensor contraction, the difference between image
+ // elements and domain elements lies on a hyperplane where a dimension
+ // has the fixed value one.
+ isl::set Intersection = DepDeltas.fix_si(isl::dim::set, i, 1);
+ if (Intersection.is_empty())
+ continue;
+
+ if (!isReductionCarriedOverDim(Intersection, i, BoundDeltas, IndexSet))
+ return false;
+
+ IndexSet.insert(i);
+ DepDeltas = DepDeltas.subtract(Intersection);
+ }
+
+ // In the case of the tensor contraction, all dependencies should have
+ // the previously described form.
+ if ((unsignedFromIslSize(DeltasDimNum) == 0) || !DepDeltas.is_empty())
+ return false;
+
+ return areDepsOverCompleteDomain(Domain, DepsForStmt, UpperBound, IndexSet);
+}
+
+/// Check if the SCoP statement could probably be optimized with analytical
+/// modeling.
+///
+/// containsTCInfoTy tries to determine whether the following conditions
+/// are true:
+///
+/// 1. The last memory access modeling an array, MA1, represents writing to
+/// memory and has the form S(..., I, ..., J, ...) -> M(shuffle(I, J)),
+/// where S is the SCoP statement under consideration and shuffle(I, J)
+/// is a permutation of indexes of sets I and J.
+/// 2. There are only true dependencies of the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement represented by @p Schedule and ki are indexes of the set P.
+/// 3. SCoP contains an arbitrary number of reads from constants and only three
+/// access relations, MA2, MA3, and MA4 that represent reading from memory
+/// and have the form
+/// S(..., I, ..., P, ...) -> M(shuffle(I, P)),
+/// S(..., P, ..., J, ...) -> M(shuffle(J, P)),
+/// S(...) -> M(shuffle(I, J)), respectively.
+///
+/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
+/// to check.
+/// @param D The SCoP dependencies.
+/// @param TCI Parameters of the tensor contraction operands.
+/// @param Domain The domain of the statement.
+/// @return True if dependencies and memory accesses correspond to the tensor
+/// contraction and false, otherwise.
+static bool containsTCInfoTy(isl::map PartialSchedule, const Dependences *D,
+ TCInfoTy &TCI, isl::set Domain) {
+ if (!containsOnlyTcDeps(PartialSchedule, D, TCI.P, Domain))
+ return false;
+
+ // TODO: handle cases of scalar multiplication if needed.
+ if (TCI.P.size() == 0)
+ return false;
+
+ if (!containsOnlyTCAcc(Domain, PartialSchedule, TCI))
+ return false;
+
+ // TODO: handle cases of GEMV if needed.
+ if ((TCI.I.size() == 0) || (TCI.J.size() == 0))
+ return false;
+
+ return true;
+}
+
+/// Check if this node contains a partial schedule that could
+/// probably be optimized with analytical modeling.
+///
+/// isTCPattern is used to determine whether the SCoP represents a TC-like
+/// kernel [1], which is a perfectly nested set of loops, with a data usage
+/// pattern that is similar to that produced by the tensor contraction.
+///
+/// A TC-like kernel can be defined as follows:
+///
+/// 1. It satisfies the requirements of the polyhedral model.
+/// 2. Without loss of generality, it contains three nonempty bundles of
+/// one-dimensional for-loops with induction variables that are grouped into
+/// bundles I = i0...i(r-1), J = j0..j(s-1), and P = p0...p(t-1), and they
+/// are incremented by one.
+/// 3. The innermost loop body can be represented as a statement of the form
+/// C(shuffle(I, J)) = E(A(shuffle(I, P)), B(shuffle(P, J)),
+/// C(shuffle(I, J))), where A(shuffle(I, P)), B(shuffle(P, J)),
+/// C(shuffle(I, J)) are accesses to tensors A, B, C, respectively,
+/// shuffle(I, J), shuffle(I, P), and shuffle(P, J) are permutations of the
+/// enclosed indices, and E is an expression that contains reads from
+/// the tensors A, B, C, and an arbitrary number of reads from constants
+/// with respect to bundles I, J, and P.
+///
+/// TC can be considered as a particular case of a TC-like kernel.
+///
+/// The order of loops with indexes from P should be preserved. Otherwise,
+/// isTCPattern should check if a commutative operation is used.
+///
+/// isTCPattern performs the following steps to check whether the SCoP
+/// corresponds to a definition of a TC-like kernel:
+///
+/// 1. Checks that the node is the innermost band node.
+/// 2. Checks that the partial schedule contains only one statement.
+/// 3. Check that all ancestors of the node contain all band nodes for
+/// the statement and only mark nodes interleave such band nodes. This
+/// corresponds to a straightforward implementation of TC.
+/// 4. Analyses the dependencies to determine contraction dimensions.
+/// 5. Check that the last memory access modeling an array, represents writing
+/// to the result of the TC-like kernel.
+/// 6. Check that SCoP contains only three access relations that represent
+/// reading of the operands of the TC-like kernel and an arbitrary number of
+/// reads from constants.
+///
+/// [1] - Gareev R., Grosser T., Kruse M. High-Performance Generalized Tensor
+/// Operations: A Compiler-Oriented Approach // ACM Transactions
+/// Architecture and Code Optimization (TACO). 2018.
+/// Vol. 15, no. 3. P. 34:1–34:27. DOI: 10.1145/3235029.
+///
+/// If this is the case, we could logically represent tensors as matrices and
+/// apply algorithms, which are used to get close-to-peak performance of
+/// matrix multiplications in manually tuned BLAS libraries (e.g., BLIS).
+///
+/// @param Node The node to check.
+/// @param D The SCoP dependencies.
+/// @param TCI Parameters of the tensor contraction operands.
+static bool isTCPattern(isl::schedule_node Node, const Dependences *D,
+ TCInfoTy &TCI) {
+ Node = Node.child(0);
+ isl::union_map PartialSchedule = Node.get_prefix_schedule_union_map();
+ isl::union_set Domain = Node.domain();
+ Node = Node.parent();
+
+ // The partial schedule should contain only one statement.
+ // TODO: This constraint should not be intrinsic to the algorithm.
+ if (isl_union_set_n_set(Domain.get()) != 1)
+ return false;
+
+ isl_schedule_node_type NodeType = isl_schedule_node_get_type(Node.get());
+
+ // Check that all ancestors of the node contain all band nodes for
+ // the statement, which represents the TC-like kernel, and only mark nodes
+ // interleave such band nodes. This corresponds to a straightforward
+ // implementation of TC with/without DeLICM applied.
+ //
+ // For example, this covers the matrix multiplication pattern after a full
+ // run of -polly-optree and -polly-delicm, where the write access is not
+ // through the original memory access, but trough a PHI node that was
+ // delicmed. Subsequently, such band nodes will be replaced by a single band
+ // node.
+ //
+ // The corresponding schedule can be the following, where Stmt_for_body8
+ // contains the matrix multiplication:
+ //
+ // domain: "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1599 and
+ // 0 <= i1 <= 1799 and
+ // 0 <= i2 <= 2199;
+ // Stmt_for_body3[i0, i1] : 0 <= i0 <= 1599 and
+ // 0 <= i1 <= 1799;
+ // Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
+ // 0 <= i1 <= 1799 }"
+ // child:
+ // sequence:
+ // - filter: "{ Stmt_for_body3[i0, i1] }"
+ // child:
+ // schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] },
+ // { Stmt_for_body3[i0, i1] -> [(i1)] }]"
+ // permutable: 1
+ // coincident: [ 1, 1 ]
+ // - filter: "{ Stmt_for_body3_last[i0, i1] }"
+ // child:
+ // schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] },
+ // { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
+ // permutable: 1
+ // coincident: [ 1, 1 ]
+ // - filter: "{ Stmt_for_body8[i0, i1, i2] }"
+ // child:
+ // schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
+ // { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
+ // { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
+ // permutable: 1
+ // coincident: [ 1, 1, 0 ]
+ //
+ while (NodeType != isl_schedule_node_domain) {
+ if (NodeType == isl_schedule_node_filter) {
+ if (!Node.parent().isa<isl::schedule_node_sequence>() ||
+ !Node.parent().parent().isa<isl::schedule_node_domain>())
+ return false;
+ break;
+ }
+
+ if ((NodeType != isl_schedule_node_band) &&
+ (NodeType != isl_schedule_node_mark))
+ return false;
+
+ Node = Node.parent();
+ NodeType = isl_schedule_node_get_type(Node.get());
+ }
+
+ isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule);
+ if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain)))
+ return true;
+
+ return false;
+}
+
} // namespace
isl::schedule_node
polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
const llvm::TargetTransformInfo *TTI,
const Dependences *D) {
+ TCInfoTy TCI;
+ if (PMBasedTCOpts && isTCPattern(Node, D, TCI))
+ LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
MatMulInfoTy MMI;
- if (isMatrMultPattern(Node, D, MMI)) {
+ if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
return optimizeMatMulPattern(Node, TTI, MMI);
}
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 99645d0ab3dd..94d96d9a9dfd 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -296,6 +296,16 @@ private:
/// @param Node The node to check.
static bool isTileableBandNode(isl::schedule_node Node);
+ /// Check if this node is a band node we want to transform using pattern
+ /// matching.
+ ///
+ /// We look for innermost band nodes where individual dimensions are marked as
+ /// permutable. There is no restriction on the number of individual
+ /// dimensions.
+ ///
+ /// @param Node The node to check.
+ static bool isPMOptimizableBandNode(isl::schedule_node Node);
+
/// Pre-vectorizes one scheduling dimension of a schedule band.
///
/// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
@@ -456,13 +466,23 @@ static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
return true;
}
-bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
+/// Check if this node is a band node, which has only one child.
+///
+/// @param Node The node to check.
+static bool isOneTimeParentBandNode(isl::schedule_node Node) {
if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
return false;
if (isl_schedule_node_n_children(Node.get()) != 1)
return false;
+ return true;
+}
+
+bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
+ if (!isOneTimeParentBandNode(Node))
+ return false;
+
if (!isl_schedule_node_band_get_permutable(Node.get()))
return false;
@@ -474,6 +494,13 @@ bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
return isSimpleInnermostBand(Node);
}
+bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) {
+ if (!isOneTimeParentBandNode(Node))
+ return false;
+
+ return Node.child(0).isa<isl::schedule_node_leaf>();
+}
+
__isl_give isl::schedule_node
ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
if (FirstLevelTiling) {
@@ -519,10 +546,8 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
assert(OAI && "Expecting optimization options");
isl::schedule_node Node = isl::manage(NodeArg);
- if (!isTileableBandNode(Node))
- return Node.release();
- if (OAI->PatternOpts) {
+ if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) {
isl::schedule_node PatternOptimizedSchedule =
tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
if (!PatternOptimizedSchedule.is_null()) {
@@ -532,6 +557,9 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
}
}
+ if (!isTileableBandNode(Node))
+ return Node.release();
+
if (OAI->Postopts)
Node = applyTileBandOpt(Node);
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index 2903e6e0b4a4..a05c796b3a20 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,7 +1,7 @@
; RUN: opt %loadPolly \
; RUN: -polly-pattern-matching-based-opts=true \
; RUN: -polly-optree -polly-delicm -polly-simplify \
-; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: -polly-opt-isl -polly-tc-opt=true -debug < %s 2>&1 \
; RUN: | FileCheck %s
; REQUIRES: asserts
@@ -10,6 +10,36 @@
; is not through the original memory access, but trough a PHI node that was
; delicmed. This test covers the polybench 2mm and 3mm cases.
;
+; This test case generates the following schedule, which contains filters:
+;
+; domain: "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1599 and
+; 0 <= i1 <= 1799 and
+; 0 <= i2 <= 2199;
+; Stmt_for_body3[i0, i1] : 0 <= i0 <= 1599 and
+; 0 <= i1 <= 1799;
+; Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
+; 0 <= i1 <= 1799 }"
+; child:
+; sequence:
+; - filter: "{ Stmt_for_body3[i0, i1] }"
+; child:
+; schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] }, { Stmt_for_body3[i0, i1] -> [(i1)] }]"
+; permutable: 1
+; coincident: [ 1, 1 ]
+; - filter: "{ Stmt_for_body3_last[i0, i1] }"
+; child:
+; schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] }, { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
+; permutable: 1
+; coincident: [ 1, 1 ]
+; - filter: "{ Stmt_for_body8[i0, i1, i2] }"
+; child:
+; schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
+; { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
+; { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
+; permutable: 1
+; coincident: [ 1, 1, 0 ]
+;
+; CHECK: The tensor contraction pattern was detected
; CHECK: The matrix multiplication pattern was detected
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
new file mode 100644
index 000000000000..f81c5bf8d81b
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -0,0 +1,108 @@
+; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that the pattern matching detects the tensor contraction pattern
+; after a full run of -polly-delicm. This test case generates the following
+; schedule, which contans two band nodes. Without DeLICM two statement are
+; generated.
+;
+; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and
+; 0 <= i2 <= 31 and 0 <= i3 <= 31 and
+; 0 <= i4 <= 31 and 0 <= i5 <= 31 }"
+; child:
+; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i0)] },
+; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i1)] },
+; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i2)] },
+; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i4)] },
+; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i3)] }]"
+; permutable: 1
+; coincident: [ 1, 1, 1, 1, 0 ]
+; child:
+; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i5)] }]"
+; permutable: 1
+; child:
+; leaf
+;
+; for (i = 0; i < 32; i++)
+; for (j = 0; j < 32; j++)
+; for (k = 0; k < 32; ++k)
+; for (l = 0; l < 32; ++l)
+; for (w = 0; w < 32; ++w)
+; for (q = 0; q < 32; ++q)
+; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal fastcc void @kernel_tc([32 x [32 x [32 x double]]]* nocapture %C, [32 x [32 x [32 x double]]]* nocapture readonly %A, [32 x [32 x [32 x double]]]* nocapture readonly %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc50, %entry
+ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc50 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader
+ %indvars.iv16 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next17, %for.inc47 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader
+ %indvars.iv13 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next14, %for.inc44 ]
+ br label %for.cond10.preheader
+
+for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader
+ %indvars.iv10 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next11, %for.inc41 ]
+ br label %for.cond13.preheader
+
+for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader
+ %indvars.iv7 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next8, %for.inc38 ]
+ %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv19, i64 %indvars.iv16, i64 %indvars.iv13, i64 %indvars.iv7
+ %.pre = load double, double* %arrayidx37, align 8
+ br label %for.body15
+
+for.body15: ; preds = %for.body15, %for.cond13.preheader
+ %i = phi double [ %.pre, %for.cond13.preheader ], [ %add, %for.body15 ]
+ %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+ %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv19, i64 %indvars.iv10, i64 %indvars.iv16, i64 %indvars.iv
+ %i1 = load double, double* %arrayidx21, align 8
+ %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv7, i64 %indvars.iv10, i64 %indvars.iv13
+ %i2 = load double, double* %arrayidx29, align 8
+ %mul = fmul fast double %i2, %i1
+ %add = fadd fast double %i, %mul
+ store double %add, double* %arrayidx37, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 32
+ br i1 %exitcond.not, label %for.inc38, label %for.body15
+
+for.inc38: ; preds = %for.body15
+ %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+ %exitcond9.not = icmp eq i64 %indvars.iv.next8, 32
+ br i1 %exitcond9.not, label %for.inc41, label %for.cond13.preheader
+
+for.inc41: ; preds = %for.inc38
+ %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
+ %exitcond12.not = icmp eq i64 %indvars.iv.next11, 32
+ br i1 %exitcond12.not, label %for.inc44, label %for.cond10.preheader
+
+for.inc44: ; preds = %for.inc41
+ %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
+ %exitcond15.not = icmp eq i64 %indvars.iv.next14, 32
+ br i1 %exitcond15.not, label %for.inc47, label %for.cond7.preheader
+
+for.inc47: ; preds = %for.inc44
+ %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+ %exitcond18.not = icmp eq i64 %indvars.iv.next17, 32
+ br i1 %exitcond18.not, label %for.inc50, label %for.cond4.preheader
+
+for.inc50: ; preds = %for.inc47
+ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+ %exitcond21.not = icmp eq i64 %indvars.iv.next20, 32
+ br i1 %exitcond21.not, label %for.end52, label %for.cond1.preheader
+
+for.end52: ; preds = %for.inc50
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 69f43d1f43de..1df94df148f1 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,6 +1,7 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false -debug < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \
+; RUN: -debug -polly-tc-opt < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
; REQUIRES: asserts
;
@@ -14,6 +15,8 @@
; }
;
; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
+; PATTERN-MATCHING-OPTS: The tensor contraction pattern was detected
; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
; PARALLEL-AST-NOT: #pragma known-parallel
; STATS: 1 polly-opt-isl - Number of matrix multiplication patterns detected and optimized
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
index a2d11fb3229f..443632504495 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -8,7 +8,8 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: -polly-opt-isl -debug \
+; RUN: -polly-tc-opt=true < %s 2>&1 \
; RUN: | FileCheck %s
; REQUIRES: asserts
;
@@ -16,6 +17,7 @@
; in case scalar memory accesses were replaced by accesses to newly created
; arrays.
;
+; CHECK: The tensor contraction pattern was detected
; CHECK: The matrix multiplication pattern was detected
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
index d472e9e2d60e..fa8bcab84de8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,5 +1,6 @@
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: -debug-only=polly-opt-isl -disable-output \
+; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
; for (i = 0; i < _PB_NI; i++)
@@ -14,6 +15,7 @@
; }
;
; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
new file mode 100644
index 000000000000..d48e6296c403
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (i = 0; i < 1024; i++)
+; for (j = 0; j < 1024; j++)
+; for (l = 0; l < 64; ++l)
+; for (w = 0; w < 64; ++w)
+; C[i][j] += A[i][l][w] * B[w][j][l];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc30, %entry
+ %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader
+ %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader
+ %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+ br label %for.body9
+
+for.body9: ; preds = %for.body9, %for.cond7.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+ %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+ %i = load double, double* %arrayidx13, align 8
+ %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37
+ %i1 = load double, double* %arrayidx19, align 8
+ %mul = fmul fast double %i1, %i
+ %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40
+ %i2 = load double, double* %arrayidx23, align 8
+ %add = fadd fast double %i2, %mul
+ store double %add, double* %arrayidx23, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24: ; preds = %for.body9
+ %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+ %exitcond39 = icmp ne i64 %indvars.iv.next38, 64
+ br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27: ; preds = %for.inc24
+ %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+ %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+ br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30: ; preds = %for.inc27
+ %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+ %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024
+ br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32: ; preds = %for.inc30
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
new file mode 100644
index 000000000000..904ff4c98753
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (i = 0; i < 32; i++)
+; for (j = 0; j < 1024; j++)
+; for (k = 0; k < 32; ++k)
+; for (l = 0; l < 1024; ++l)
+; C[i][j][k] += A[i][k][l] * B[l][j];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1024 x [32 x double]]* %C, [32 x [1024 x double]]* %A, [1024 x double]* %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc30, %entry
+ %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader
+ %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader
+ %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+ br label %for.body9
+
+for.body9: ; preds = %for.body9, %for.cond7.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+ %arrayidx13 = getelementptr inbounds [32 x [1024 x double]], [32 x [1024 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+ %i = load double, double* %arrayidx13, align 8
+ %arrayidx17 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv40
+ %i1 = load double, double* %arrayidx17, align 8
+ %mul = fmul fast double %i1, %i
+ %arrayidx23 = getelementptr inbounds [1024 x [32 x double]], [1024 x [32 x double]]* %C, i64 %indvars.iv43, i64 %indvars.iv40, i64 %indvars.iv37
+ %i2 = load double, double* %arrayidx23, align 8
+ %add = fadd fast double %i2, %mul
+ store double %add, double* %arrayidx23, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24: ; preds = %for.body9
+ %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+ %exitcond39 = icmp ne i64 %indvars.iv.next38, 32
+ br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27: ; preds = %for.inc24
+ %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+ %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+ br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30: ; preds = %for.inc27
+ %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+ %exitcond45 = icmp ne i64 %indvars.iv.next44, 32
+ br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32: ; preds = %for.inc30
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
new file mode 100644
index 000000000000..c7057a42e448
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -0,0 +1,84 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (i = 0; i < 32; i++)
+; for (j = 0; j < 32; j++)
+; for (k = 0; k < 32; ++k)
+; for (l = 0; l < 32; ++l)
+; for (w = 0; w < 32; ++w)
+; for (q = 0; q < 32; ++q)
+; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [32 x [32 x [32 x double]]]* %C, [32 x [32 x [32 x double]]]* %A, [32 x [32 x [32 x double]]]* %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc50, %entry
+ %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader
+ %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader
+ %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
+ br label %for.cond10.preheader
+
+for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader
+ %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
+ br label %for.cond13.preheader
+
+for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader
+ %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
+ br label %for.body15
+
+for.body15: ; preds = %for.body15, %for.cond13.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+ %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv71, i64 %indvars.iv62, i64 %indvars.iv68, i64 %indvars.iv
+ %i = load double, double* %arrayidx21, align 8
+ %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv59, i64 %indvars.iv62, i64 %indvars.iv65
+ %i1 = load double, double* %arrayidx29, align 8
+ %mul = fmul fast double %i1, %i
+ %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59
+ %i2 = load double, double* %arrayidx37, align 8
+ %add = fadd fast double %i2, %mul
+ store double %add, double* %arrayidx37, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 32
+ br i1 %exitcond, label %for.body15, label %for.inc38
+
+for.inc38: ; preds = %for.body15
+ %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
+ %exitcond61 = icmp ne i64 %indvars.iv.next60, 32
+ br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
+
+for.inc41: ; preds = %for.inc38
+ %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
+ %exitcond64 = icmp ne i64 %indvars.iv.next63, 32
+ br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
+
+for.inc44: ; preds = %for.inc41
+ %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
+ %exitcond67 = icmp ne i64 %indvars.iv.next66, 32
+ br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
+
+for.inc47: ; preds = %for.inc44
+ %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
+ %exitcond70 = icmp ne i64 %indvars.iv.next69, 32
+ br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
+
+for.inc50: ; preds = %for.inc47
+ %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
+ %exitcond73 = icmp ne i64 %indvars.iv.next72, 32
+ br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
+
+for.end52: ; preds = %for.inc50
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
new file mode 100644
index 000000000000..53a03112e28b
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -0,0 +1,84 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (i = 0; i < 8; i++)
+; for (j = 0; j < 8; j++)
+; for (k = 0; k < 4; ++k)
+; for (l = 0; l < 1024; ++l)
+; for (w = 0; w < 1024; ++w)
+; for (q = 0; q < 4; ++q)
+; C[i][j][k][w][q] += A[q][k][j][l][i] * B[l][w];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc([8 x [4 x [1024 x [4 x double]]]]* %C, [4 x [8 x [1024 x [8 x double]]]]* %A, [1024 x double]* %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc50, %entry
+ %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader
+ %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader
+ %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
+ br label %for.cond10.preheader
+
+for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader
+ %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
+ br label %for.cond13.preheader
+
+for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader
+ %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
+ br label %for.body15
+
+for.body15: ; preds = %for.body15, %for.cond13.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+ %arrayidx23 = getelementptr inbounds [4 x [8 x [1024 x [8 x double]]]], [4 x [8 x [1024 x [8 x double]]]]* %A, i64 %indvars.iv, i64 %indvars.iv65, i64 %indvars.iv68, i64 %indvars.iv62, i64 %indvars.iv71
+ %i = load double, double* %arrayidx23, align 8
+ %arrayidx27 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv62, i64 %indvars.iv59
+ %i1 = load double, double* %arrayidx27, align 8
+ %mul = fmul fast double %i1, %i
+ %arrayidx37 = getelementptr inbounds [8 x [4 x [1024 x [4 x double]]]], [8 x [4 x [1024 x [4 x double]]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59, i64 %indvars.iv
+ %i2 = load double, double* %arrayidx37, align 8
+ %add = fadd fast double %i2, %mul
+ store double %add, double* %arrayidx37, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 4
+ br i1 %exitcond, label %for.body15, label %for.inc38
+
+for.inc38: ; preds = %for.body15
+ %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
+ %exitcond61 = icmp ne i64 %indvars.iv.next60, 1024
+ br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
+
+for.inc41: ; preds = %for.inc38
+ %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
+ %exitcond64 = icmp ne i64 %indvars.iv.next63, 1024
+ br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
+
+for.inc44: ; preds = %for.inc41
+ %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
+ %exitcond67 = icmp ne i64 %indvars.iv.next66, 4
+ br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
+
+for.inc47: ; preds = %for.inc44
+ %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
+ %exitcond70 = icmp ne i64 %indvars.iv.next69, 8
+ br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
+
+for.inc50: ; preds = %for.inc47
+ %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
+ %exitcond73 = icmp ne i64 %indvars.iv.next72, 8
+ br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
+
+for.end52: ; preds = %for.inc50
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index 5498f8476875..100604d40c11 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
; /* C := alpha*A*B + beta*C */
@@ -15,6 +16,7 @@
; after the interchanging of loops.
;
; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
new file mode 100644
index 000000000000..7e5d0ca4aa4b
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -0,0 +1,94 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (i = 0; i < 16; i++)
+; for (j = 0; j < 16; j++)
+; for (k = 0; k < 8; ++k)
+; for (l = 0; l < 1024; ++l)
+; for (w = 0; w < 8; ++w)
+; for (q = 0; q < 8; ++q)
+; for (x = 0; x < 8; ++x)
+; C[i][j][k][w][q][x] += A[l][x][j][k] * B[w][q][l][i];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc([16 x [8 x [8 x [8 x [8 x double]]]]]* %C, [8 x [16 x [8 x double]]]* %A, [8 x [1024 x [16 x double]]]* %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc60, %entry
+ %indvars.iv85 = phi i64 [ 0, %entry ], [ %indvars.iv.next86, %for.inc60 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc57, %for.cond1.preheader
+ %indvars.iv82 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next83, %for.inc57 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc54, %for.cond4.preheader
+ %indvars.iv79 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next80, %for.inc54 ]
+ br label %for.cond10.preheader
+
+for.cond10.preheader: ; preds = %for.inc51, %for.cond7.preheader
+ %indvars.iv76 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next77, %for.inc51 ]
+ br label %for.cond13.preheader
+
+for.cond13.preheader: ; preds = %for.inc48, %for.cond10.preheader
+ %indvars.iv73 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next74, %for.inc48 ]
+ br label %for.cond16.preheader
+
+for.cond16.preheader: ; preds = %for.inc45, %for.cond13.preheader
+ %indvars.iv70 = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next71, %for.inc45 ]
+ br label %for.body18
+
+for.body18: ; preds = %for.body18, %for.cond16.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond16.preheader ], [ %indvars.iv.next, %for.body18 ]
+ %arrayidx24 = getelementptr inbounds [8 x [16 x [8 x double]]], [8 x [16 x [8 x double]]]* %A, i64 %indvars.iv76, i64 %indvars.iv, i64 %indvars.iv82, i64 %indvars.iv79
+ %i = load double, double* %arrayidx24, align 8
+ %arrayidx32 = getelementptr inbounds [8 x [1024 x [16 x double]]], [8 x [1024 x [16 x double]]]* %B, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv76, i64 %indvars.iv85
+ %i1 = load double, double* %arrayidx32, align 8
+ %mul = fmul fast double %i1, %i
+ %arrayidx44 = getelementptr inbounds [16 x [8 x [8 x [8 x [8 x double]]]]], [16 x [8 x [8 x [8 x [8 x double]]]]]* %C, i64 %indvars.iv85, i64 %indvars.iv82, i64 %indvars.iv79, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv
+ %i2 = load double, double* %arrayidx44, align 8
+ %add = fadd fast double %i2, %mul
+ store double %add, double* %arrayidx44, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.body18, label %for.inc45
+
+for.inc45: ; preds = %for.body18
+ %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 1
+ %exitcond72 = icmp ne i64 %indvars.iv.next71, 8
+ br i1 %exitcond72, label %for.cond16.preheader, label %for.inc48
+
+for.inc48: ; preds = %for.inc45
+ %indvars.iv.next74 = add nuw nsw i64 %indvars.iv73, 1
+ %exitcond75 = icmp ne i64 %indvars.iv.next74, 8
+ br i1 %exitcond75, label %for.cond13.preheader, label %for.inc51
+
+for.inc51: ; preds = %for.inc48
+ %indvars.iv.next77 = add nuw nsw i64 %indvars.iv76, 1
+ %exitcond78 = icmp ne i64 %indvars.iv.next77, 1024
+ br i1 %exitcond78, label %for.cond10.preheader, label %for.inc54
+
+for.inc54: ; preds = %for.inc51
+ %indvars.iv.next80 = add nuw nsw i64 %indvars.iv79, 1
+ %exitcond81 = icmp ne i64 %indvars.iv.next80, 8
+ br i1 %exitcond81, label %for.cond7.preheader, label %for.inc57
+
+for.inc57: ; preds = %for.inc54
+ %indvars.iv.next83 = add nuw nsw i64 %indvars.iv82, 1
+ %exitcond84 = icmp ne i64 %indvars.iv.next83, 16
+ br i1 %exitcond84, label %for.cond4.preheader, label %for.inc60
+
+for.inc60: ; preds = %for.inc57
+ %indvars.iv.next86 = add nuw nsw i64 %indvars.iv85, 1
+ %exitcond87 = icmp ne i64 %indvars.iv.next86, 16
+ br i1 %exitcond87, label %for.cond1.preheader, label %for.end62
+
+for.end62: ; preds = %for.inc60
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
new file mode 100644
index 000000000000..a255abacfaba
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (int i = 0; i < 32; i++)
+; for (int j = 0; j < 32; j++)
+; for (int l = 0; l < 32; l++)
+; for (int w = 0; w < 32; w++)
+; C[i][j] += A[i][l][w] * B[w][j][i];
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc33, %entry
+ %indvars.iv49 = phi i64 [ 0, %entry ], [ %indvars.iv.next50, %for.inc33 ]
+ br label %for.cond5.preheader
+
+for.cond5.preheader: ; preds = %for.inc30, %for.cond1.preheader
+ %indvars.iv45 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next46, %for.inc30 ]
+ br label %for.cond9.preheader
+
+for.cond9.preheader: ; preds = %for.inc27, %for.cond5.preheader
+ %indvars.iv41 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next42, %for.inc27 ]
+ br label %for.body12
+
+for.body12: ; preds = %for.body12, %for.cond9.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
+ %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv49, i64 %indvars.iv41, i64 %indvars.iv
+ %i = load double, double* %arrayidx16, align 8
+ %arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv45, i64 %indvars.iv49
+ %i1 = load double, double* %arrayidx22, align 8
+ %mul = fmul fast double %i1, %i
+ %arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv49, i64 %indvars.iv45
+ %i2 = load double, double* %arrayidx26, align 8
+ %add = fadd fast double %i2, %mul
+ store double %add, double* %arrayidx26, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 32
+ br i1 %exitcond, label %for.body12, label %for.inc27
+
+for.inc27: ; preds = %for.body12
+ %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+ %exitcond44 = icmp ne i64 %indvars.iv.next42, 32
+ br i1 %exitcond44, label %for.cond9.preheader, label %for.inc30
+
+for.inc30: ; preds = %for.inc27
+ %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
+ %exitcond48 = icmp ne i64 %indvars.iv.next46, 32
+ br i1 %exitcond48, label %for.cond5.preheader, label %for.inc33
+
+for.inc33: ; preds = %for.inc30
+ %indvars.iv.next50 = add nuw nsw i64 %indvars.iv49, 1
+ %exitcond52 = icmp ne i64 %indvars.iv.next50, 32
+ br i1 %exitcond52, label %for.cond1.preheader, label %for.end35
+
+for.end35: ; preds = %for.inc33
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
new file mode 100644
index 000000000000..1d36d49a4529
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (int i = 0; i < 32; i++)
+; for (int j = 0; j < 32; j++)
+; for (int l = 0; l < 32; l++)
+; for (int w = 0; w < 32; w++)
+; C[i][j] += A[i][l][w] * B[w][j][i+3];
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc34, %entry
+ %indvars.iv50 = phi i64 [ 0, %entry ], [ %indvars.iv.next51, %for.inc34 ]
+ br label %for.cond5.preheader
+
+for.cond5.preheader: ; preds = %for.inc31, %for.cond1.preheader
+ %indvars.iv46 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next47, %for.inc31 ]
+ br label %for.cond9.preheader
+
+for.cond9.preheader: ; preds = %for.inc28, %for.cond5.preheader
+ %indvars.iv42 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next43, %for.inc28 ]
+ br label %for.body12
+
+for.body12: ; preds = %for.body12, %for.cond9.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
+ %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv50, i64 %indvars.iv42, i64 %indvars.iv
+ %i = load double, double* %arrayidx16, align 8
+ %i1 = add nuw nsw i64 %indvars.iv50, 3
+ %arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv46, i64 %i1
+ %i2 = load double, double* %arrayidx22, align 8
+ %mul = fmul fast double %i2, %i
+ %arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv50, i64 %indvars.iv46
+ %i3 = load double, double* %arrayidx26, align 8
+ %add27 = fadd fast double %i3, %mul
+ store double %add27, double* %arrayidx26, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 32
+ br i1 %exitcond, label %for.body12, label %for.inc28
+
+for.inc28: ; preds = %for.body12
+ %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+ %exitcond45 = icmp ne i64 %indvars.iv.next43, 32
+ br i1 %exitcond45, label %for.cond9.preheader, label %for.inc31
+
+for.inc31: ; preds = %for.inc28
+ %indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
+ %exitcond49 = icmp ne i64 %indvars.iv.next47, 32
+ br i1 %exitcond49, label %for.cond5.preheader, label %for.inc34
+
+for.inc34: ; preds = %for.inc31
+ %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1
+ %exitcond54 = icmp ne i64 %indvars.iv.next51, 32
+ br i1 %exitcond54, label %for.cond1.preheader, label %for.end36
+
+for.end36: ; preds = %for.inc34
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll
new file mode 100644
index 000000000000..2ca48156d3e3
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll
@@ -0,0 +1,79 @@
+; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a region statement, which has the correct order of accesses, is not
+; detected.
+;
+; for (int i = 0; i < 32; i++)
+; for (int j = 0; j < 32; j++)
+; for (int k = 0; k < 32; k++) {
+; int c = C[i][j];
+; if (i*j*k < 10) {
+; C[i][j] = A[i][k] + B[k][j];
+; } else {
+; C[i][j] = c;
+; }
+; }
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo([64 x double]* noundef %C, [64 x double]* noundef %A, [64 x double]* noundef %B) #0 {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc34
+ %indvars.iv48 = phi i64 [ 0, %entry ], [ %indvars.iv.next49, %for.inc34 ]
+ br label %for.cond5.preheader
+
+for.cond5.preheader: ; preds = %for.cond1.preheader, %for.inc31
+ %indvars.iv43 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next44, %for.inc31 ]
+ br label %for.body8
+
+for.body8: ; preds = %for.cond5.preheader, %if.end
+ %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %if.end ]
+ %arrayidx10 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv48, i64 %indvars.iv43
+ %0 = mul nuw nsw i64 %indvars.iv43, %indvars.iv48
+ %1 = mul nuw nsw i64 %0, %indvars.iv
+ %cmp12 = icmp ult i64 %1, 10
+ br i1 %cmp12, label %if.then, label %if.else
+
+if.then: ; preds = %for.body8
+ %arrayidx17 = getelementptr inbounds [64 x double], [64 x double]* %A, i64 %indvars.iv48, i64 %indvars.iv
+ %2 = load double, double* %arrayidx17, align 8
+ %arrayidx21 = getelementptr inbounds [64 x double], [64 x double]* %B, i64 %indvars.iv, i64 %indvars.iv43
+ %3 = load double, double* %arrayidx21, align 8
+ %add = fadd fast double %3, %2
+ br label %if.end
+
+if.else: ; preds = %for.body8
+ %4 = load double, double* %arrayidx10, align 8
+ %conv = fptosi double %4 to i32
+ %conv26 = sitofp i32 %conv to double
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %storemerge = phi double [ %conv26, %if.else ], [ %add, %if.then ]
+ store double %storemerge, double* %arrayidx10, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 32
+ br i1 %exitcond, label %for.body8, label %for.inc31
+
+for.inc31: ; preds = %if.end
+ %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+ %exitcond47 = icmp ne i64 %indvars.iv.next44, 32
+ br i1 %exitcond47, label %for.cond5.preheader, label %for.inc34
+
+for.inc34: ; preds = %for.inc31
+ %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1
+ %exitcond51 = icmp ne i64 %indvars.iv.next49, 32
+ br i1 %exitcond51, label %for.cond1.preheader, label %for.end36
+
+for.end36: ; preds = %for.inc34
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
new file mode 100644
index 000000000000..82226e37cc19
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
+; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (i = 0; i < 1024; i++)
+; for (j = 0; j < 1024; j++)
+; for (l = 0; l < 64; ++l)
+; for (w = 0; w < 64; ++w)
+; C[i][j] += A[i][l][w] * B[w][j][l];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc30, %entry
+ %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader
+ %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader
+ %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+ br label %for.body9
+
+for.body9: ; preds = %for.body9, %for.cond7.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+ %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+ %i = load double, double* %arrayidx13, align 8
+ %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37
+ %i1 = load double, double* %arrayidx19, align 8
+ %mul = fmul fast double %i1, %i
+ %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40
+ %i2 = load double, double* %arrayidx23, align 8
+ %add = fadd fast double %i2, %mul
+ store double %add, double* %arrayidx23, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24: ; preds = %for.body9
+ %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+ %exitcond39 = icmp ne i64 %indvars.iv.next38, 64
+ br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27: ; preds = %for.inc24
+ %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+ %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+ br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30: ; preds = %for.inc27
+ %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+ %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024
+ br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32: ; preds = %for.inc30
+ ret void
+}
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
new file mode 100644
index 000000000000..71cbe45657d8
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@@ -0,0 +1,56 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; for (int i = 0; i < 32; i++)
+; for (int j = 0; j < 32; j++)
+; for (int w = 0; w < 32; w++)
+; C[i][j] += A[i][w] * B[w][j][i];
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx12.0.0"
+
+define void @foo([64 x double]* noundef %C, [64 x double]* noundef %A, [64 x [64 x double]]* noundef %B) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond.cleanup3, %entry
+ %indvars.iv45 = phi i64 [ 0, %entry ], [ %indvars.iv.next46, %for.cond.cleanup3 ]
+ br label %for.cond5.preheader
+
+for.cond.cleanup: ; preds = %for.cond.cleanup3
+ ret void
+
+for.cond5.preheader: ; preds = %for.cond.cleanup7, %for.cond1.preheader
+ %indvars.iv41 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next42, %for.cond.cleanup7 ]
+ %arrayidx20 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv45, i64 %indvars.iv41
+ %.pre = load double, double* %arrayidx20, align 8
+ br label %for.body8
+
+for.cond.cleanup3: ; preds = %for.cond.cleanup7
+ %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
+ %exitcond48.not = icmp eq i64 %indvars.iv.next46, 32
+ br i1 %exitcond48.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup7: ; preds = %for.body8
+ %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+ %exitcond44.not = icmp eq i64 %indvars.iv.next42, 32
+ br i1 %exitcond44.not, label %for.cond.cleanup3, label %for.cond5.preheader
+
+for.body8: ; preds = %for.body8, %for.cond5.preheader
+ %i = phi double [ %.pre, %for.cond5.preheader ], [ %i3, %for.body8 ]
+ %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+ %arrayidx10 = getelementptr inbounds [64 x double], [64 x double]* %A, i64 %indvars.iv45, i64 %indvars.iv
+ %i1 = load double, double* %arrayidx10, align 8
+ %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv41, i64 %indvars.iv45
+ %i2 = load double, double* %arrayidx16, align 8
+ %i3 = tail call double @llvm.fmuladd.f64(double %i1, double %i2, double %i)
+ store double %i3, double* %arrayidx20, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 32
+ br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8
+}
+
+declare double @llvm.fmuladd.f64(double, double, double) \ No newline at end of file
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
index 6cc4fda84115..4bc8928cd3bd 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,12 +1,13 @@
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: -polly-target-2nd-cache-level-size=262144 -polly-print-ast \
+; RUN: -polly-tc-opt=true -disable-output -polly-opt-isl < %s | \
+; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
; REQUIRES: asserts
;
; C := A * B + C
@@ -20,6 +21,7 @@
; for (j = 0; j < _PB_NJ; j++)
; C[i][j] += A[i][k] * B[k][j];
;
+; CHECK: The tensor contraction pattern was detected
; CHECK: The matrix multiplication pattern was detected
;
; PATTERN-MATCHING-OPTS: // 1st level tiling - Tiles