From d7ee1644d27469fed4e2b91b70eddf0c1124e6a3 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@intel.com>
Date: Tue, 8 Sep 2020 19:22:12 +0300
Subject: [PATCH] Fixes vectorizer and extends SVML support

The patch was updated to fix SVML calling convention issues uncovered by LLVM 10.
In previous versions of the patch the SVML calling convention was selected based
on compilation settings, so calling a 256-bit vector function from AVX-512 code
would use the AVX-512 calling convention, which is incorrect. To fix this, the
SVML calling convention was split into three separate conventions for 128-, 256-
and 512-bit vector lengths, selected based on the actual vector length of the call.
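
For illustration, the vectorizer now emits calls like the following (taken from
the updated tests below; value names are illustrative, and the convention is
chosen from the bit width of the vector arguments):

  ; <4 x double> is 256 bits wide, so the 256-bit convention is used
  %call256 = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> %x)
  ; <4 x float> is 128 bits wide, so the 128-bit convention is used
  %call128 = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> %y)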

Original patch merged several fixes:

1. https://reviews.llvm.org/D47188 fixes improper calls to the SVML library,
which uses non-standard calling conventions. Accordingly, it adds SVML calling
convention definitions and code that sets the calling convention on the
vectorized calls. Since SVML provides several implementations of each math
function, the fast-math attribute is also taken into account to select the
faster implementation when allowed (see the example after this item). This
work is based on Matt Masten's original work.
Author: Denis Nagorny
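
As an illustration of the accuracy selection (a sketch based on the svmlMangle
helper added in TargetLibraryInfo.cpp below; value names are illustrative):

  ; without fast-math the high-accuracy "_ha" variant is selected
  %r0 = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> %x)
  ; with fast-math flags on the scalar call the default (faster) variant is kept
  %r1 = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> %x)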

2. https://reviews.llvm.org/D53035 implements legalization of SVML calls by
breaking an illegal vector call instruction down into multiple legal vector
call instructions during code generation. Currently the vectorizer does not
check the legality of the generated SVML (or any VECLIB) call instructions,
which can lead to problems even during vector type legalization. This patch
adds a legality check during code generation and replaces an illegal SVML call
with the corresponding legalized instructions (see the sketch after this item).
(RFC: http://lists.llvm.org/pipermail/llvm-dev/2018-June/124357.html)
Author: Karthik Senthil
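
For example (a sketch consistent with the legalization logic added to
LoopVectorize.cpp and the checks in the new svml-legal-calls.ll test; value
names are illustrative): with VF=8 on AVX a <8 x double> sin call is not legal,
so it is broken into two <4 x double> calls whose results are recombined:

  %lo = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r0 = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> %lo)
  %r1 = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> %hi)
  %res = shufflevector <4 x double> %r0, <4 x double> %r1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>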

3. A functional merge of the patches above, which also fixes the calling
convention handling.
---
 include/llvm/Analysis/TargetLibraryInfo.h          |  21 +-
 include/llvm/IR/CMakeLists.txt                     |   4 +
 include/llvm/IR/CallingConv.h                      |   5 +
 include/llvm/IR/SVML.td                            |  62 +++
 lib/Analysis/CMakeLists.txt                        |   1 +
 lib/Analysis/TargetLibraryInfo.cpp                 |  55 ++-
 lib/AsmParser/LLLexer.cpp                          |   3 +
 lib/AsmParser/LLParser.cpp                         |   6 +
 lib/AsmParser/LLToken.h                            |   3 +
 lib/IR/AsmWriter.cpp                               |   3 +
 lib/IR/Verifier.cpp                                |   3 +
 lib/Target/X86/X86CallingConv.td                   |  71 ++-
 lib/Target/X86/X86ISelLowering.cpp                 |   3 +-
 lib/Target/X86/X86RegisterInfo.cpp                 |  46 ++
 lib/Target/X86/X86Subtarget.h                      |   3 +
 lib/Transforms/Utils/InjectTLIMappings.cpp         |   3 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 273 ++++++++++-
 .../LoopVectorize/X86/svml-calls-finite.ll         |  15 +-
 test/Transforms/LoopVectorize/X86/svml-calls.ll    |  88 +++-
 .../LoopVectorize/X86/svml-legal-calls.ll          | 513 +++++++++++++++++++++
 .../LoopVectorize/X86/svml-legal-codegen.ll        |  61 +++
 utils/TableGen/CMakeLists.txt                      |   1 +
 utils/TableGen/SVMLEmitter.cpp                     | 110 +++++
 utils/TableGen/TableGen.cpp                        |   8 +-
 utils/TableGen/TableGenBackends.h                  |   1 +
 utils/vim/syntax/llvm.vim                          |   1 +
 26 files changed, 1315 insertions(+), 48 deletions(-)
 create mode 100644 llvm/include/llvm/IR/SVML.td
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
 create mode 100644 llvm/utils/TableGen/SVMLEmitter.cpp

diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h
index 3a7c26e..4d37b34 100644
--- a/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/include/llvm/Analysis/TargetLibraryInfo.h
@@ -39,6 +39,12 @@ struct VecDesc {
     NotLibFunc
   };
 
+enum SVMLAccuracy {
+  SVML_DEFAULT,
+  SVML_HA,
+  SVML_EP
+};
+
 /// Implementation of the target library information.
 ///
 /// This class constructs tables that hold the target library information and
@@ -152,7 +158,7 @@ public:
   /// Return true if the function F has a vector equivalent with vectorization
   /// factor VF.
   bool isFunctionVectorizable(StringRef F, unsigned VF) const {
-    return !getVectorizedFunction(F, VF).empty();
+    return !getVectorizedFunction(F, VF, false).empty();
   }
 
   /// Return true if the function F has a vector equivalent with any
@@ -161,7 +167,10 @@ public:
 
   /// Return the name of the equivalent of F, vectorized with factor VF. If no
   /// such mapping exists, return the empty string.
-  StringRef getVectorizedFunction(StringRef F, unsigned VF) const;
+  std::string getVectorizedFunction(StringRef F, unsigned VF, bool IsFast) const;
+
+  Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
+    StringRef F, const FunctionType &FTy, const DataLayout &DL) const;
 
   /// Return true if the function F has a scalar equivalent, and set VF to be
   /// the vectorization factor.
@@ -322,8 +331,12 @@ public:
   bool isFunctionVectorizable(StringRef F) const {
     return Impl->isFunctionVectorizable(F);
   }
-  StringRef getVectorizedFunction(StringRef F, unsigned VF) const {
-    return Impl->getVectorizedFunction(F, VF);
+  std::string getVectorizedFunction(StringRef F, unsigned VF, bool IsFast) const {
+    return Impl->getVectorizedFunction(F, VF, IsFast);
+  }
+  Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
+    StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
+    return Impl->getVectorizedFunctionCallingConv(F, FTy, DL);
   }
 
   /// Tests if the function is both available and a candidate for optimized code
diff --git a/include/llvm/IR/CMakeLists.txt b/include/llvm/IR/CMakeLists.txt
index c8edc29..e532ce0 100644
--- a/include/llvm/IR/CMakeLists.txt
+++ b/include/llvm/IR/CMakeLists.txt
@@ -19,3 +19,7 @@ tablegen(LLVM IntrinsicsWebAssembly.h -gen-intrinsic-enums -intrinsic-prefix=was
 tablegen(LLVM IntrinsicsX86.h -gen-intrinsic-enums -intrinsic-prefix=x86)
 tablegen(LLVM IntrinsicsXCore.h -gen-intrinsic-enums -intrinsic-prefix=xcore)
 add_public_tablegen_target(intrinsics_gen)
+
+set(LLVM_TARGET_DEFINITIONS SVML.td)
+tablegen(LLVM SVML.inc -gen-svml)
+add_public_tablegen_target(svml_gen)
diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h
index d0906de3..ab1e07f 100644
--- a/include/llvm/IR/CallingConv.h
+++ b/include/llvm/IR/CallingConv.h
@@ -241,6 +241,11 @@ namespace CallingConv {
     /// The remainder matches the regular calling convention.
     WASM_EmscriptenInvoke = 99,
 
+    /// Intel_SVML - Calling conventions for Intel Short Math Vector Library
+    Intel_SVML128 = 100,
+    Intel_SVML256 = 101,
+    Intel_SVML512 = 102,
+
     /// The highest possible calling convention ID. Must be some 2^k - 1.
     MaxID = 1023
   };
diff --git a/include/llvm/IR/SVML.td b/include/llvm/IR/SVML.td
new file mode 100644
index 0000000..5af7104
--- /dev/null
+++ b/include/llvm/IR/SVML.td
@@ -0,0 +1,62 @@
+//===-- SVML.td - Defines SVML call variants ----------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is used by TableGen to define the different types of SVML function
+// variants used with -fveclib=SVML.
+//
+//===----------------------------------------------------------------------===//
+
+class SvmlVariant;
+
+def sin        : SvmlVariant;
+def cos        : SvmlVariant;
+def pow        : SvmlVariant;
+def exp        : SvmlVariant;
+def log        : SvmlVariant;
+def acos       : SvmlVariant;
+def acosh      : SvmlVariant;
+def asin       : SvmlVariant;
+def asinh      : SvmlVariant;
+def atan2      : SvmlVariant;
+def atan       : SvmlVariant;
+def atanh      : SvmlVariant;
+def cbrt       : SvmlVariant;
+def cdfnorm    : SvmlVariant;
+def cdfnorminv : SvmlVariant;
+def cosd       : SvmlVariant;
+def cosh       : SvmlVariant;
+def erf        : SvmlVariant;
+def erfc       : SvmlVariant;
+def erfcinv    : SvmlVariant;
+def erfinv     : SvmlVariant;
+def exp10      : SvmlVariant;
+def exp2       : SvmlVariant;
+def expm1      : SvmlVariant;
+def hypot      : SvmlVariant;
+def invsqrt    : SvmlVariant;
+def log10      : SvmlVariant;
+def log1p      : SvmlVariant;
+def log2       : SvmlVariant;
+def sind       : SvmlVariant;
+def sinh       : SvmlVariant;
+def sqrt       : SvmlVariant;
+def tan        : SvmlVariant;
+def tanh       : SvmlVariant;
+
+// TODO: SVML does not currently provide _ha and _ep variants of these functions.
+// We should call the default variant of these functions in all cases instead.
+
+// def nearbyint  : SvmlVariant;
+// def logb       : SvmlVariant;
+// def floor      : SvmlVariant;
+// def fmod       : SvmlVariant;
+// def ceil       : SvmlVariant;
+// def trunc      : SvmlVariant;
+// def rint       : SvmlVariant;
+// def round      : SvmlVariant;
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 7036233..015e350 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -143,6 +143,7 @@ add_llvm_component_library(LLVMAnalysis
 
   DEPENDS
   intrinsics_gen
+  svml_gen
 
   LINK_LIBS
   ${MLLinkDeps}
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 60cfb04..1417550 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -63,6 +63,11 @@ static bool hasBcmp(const Triple &TT) {
   return TT.isOSFreeBSD() || TT.isOSSolaris();
 }
 
+std::string svmlMangle(StringRef FnName, const bool IsFast) {
+  std::string FullName = FnName.str();
+  return IsFast ? FullName : FullName + "_ha";
+}
+
 /// Initialize the set of available library functions based on the specified
 /// target triple. This should be carefully written so that a missing target
 /// triple gets a sane set of defaults.
@@ -1559,8 +1564,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
   }
   case SVML: {
     const VecDesc VecFuncs[] = {
-    #define TLI_DEFINE_SVML_VECFUNCS
-    #include "llvm/Analysis/VecFuncs.def"
+    #define GET_SVML_VARIANTS
+    #include "llvm/IR/SVML.inc"
+    #undef GET_SVML_VARIANTS
     };
     addVectorizableFunctions(VecFuncs);
     break;
@@ -1580,19 +1586,52 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const {
   return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
 }
 
-StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
-                                                       unsigned VF) const {
+std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
+                                                         unsigned VF,
+                                                         bool IsFast) const {
+  bool FromSVML = ClVectorLibrary == SVML;
   F = sanitizeFunctionName(F);
   if (F.empty())
-    return F;
+    return F.str();
   std::vector<VecDesc>::const_iterator I =
       llvm::lower_bound(VectorDescs, F, compareWithScalarFnName);
   while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
-    if (I->VectorizationFactor == VF)
-      return I->VectorFnName;
+    if (I->VectorizationFactor == VF) {
+      if (FromSVML) {
+        return svmlMangle(I->VectorFnName, IsFast);
+      }
+      return I->VectorFnName.str();
+    }
     ++I;
   }
-  return StringRef();
+  return std::string();
+}
+
+static CallingConv::ID getSVMLCallingConv(const DataLayout &DL, const FunctionType &FType)
+{
+  assert(isa<VectorType>(FType.getReturnType()));
+  auto *VecCallRetType = cast<VectorType>(FType.getReturnType());
+  auto TypeBitWidth = DL.getTypeSizeInBits(VecCallRetType);
+  if (TypeBitWidth == 128) {
+    return CallingConv::Intel_SVML128;
+  } else if (TypeBitWidth == 256) {
+    return CallingConv::Intel_SVML256;
+  } else if (TypeBitWidth == 512) {
+    return CallingConv::Intel_SVML512;
+  } else {
+    llvm_unreachable("Invalid vector width");
+  }
+  return 0; // not reachable
+}
+
+Optional<CallingConv::ID>
+TargetLibraryInfoImpl::getVectorizedFunctionCallingConv(
+    StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
+  if (ClVectorLibrary == SVML) {
+    assert(F.startswith("__svml"));
+    return getSVMLCallingConv(DL, FTy);
+  }
+  return {};
 }
 
 StringRef TargetLibraryInfoImpl::getScalarizedFunction(StringRef F,
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 777ce3a..9a92418 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -603,6 +603,9 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(spir_kernel);
   KEYWORD(spir_func);
   KEYWORD(intel_ocl_bicc);
+  KEYWORD(intel_svmlcc128);
+  KEYWORD(intel_svmlcc256);
+  KEYWORD(intel_svmlcc512);
   KEYWORD(x86_64_sysvcc);
   KEYWORD(win64cc);
   KEYWORD(x86_regcallcc);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index c9f21ee..6dac503 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -1978,6 +1978,9 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'ccc'
 ///   ::= 'fastcc'
 ///   ::= 'intel_ocl_bicc'
+///   ::= 'intel_svmlcc128'
+///   ::= 'intel_svmlcc256'
+///   ::= 'intel_svmlcc512'
 ///   ::= 'coldcc'
 ///   ::= 'cfguard_checkcc'
 ///   ::= 'x86_stdcallcc'
@@ -2046,6 +2049,9 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_spir_kernel:    CC = CallingConv::SPIR_KERNEL; break;
   case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
   case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
+  case lltok::kw_intel_svmlcc128:CC = CallingConv::Intel_SVML128; break;
+  case lltok::kw_intel_svmlcc256:CC = CallingConv::Intel_SVML256; break;
+  case lltok::kw_intel_svmlcc512:CC = CallingConv::Intel_SVML512; break;
   case lltok::kw_x86_64_sysvcc:  CC = CallingConv::X86_64_SysV; break;
   case lltok::kw_win64cc:        CC = CallingConv::Win64; break;
   case lltok::kw_webkit_jscc:    CC = CallingConv::WebKit_JS; break;
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 0fb3bae..b2dabda 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -132,6 +132,9 @@ enum Kind {
   kw_fastcc,
   kw_coldcc,
   kw_intel_ocl_bicc,
+  kw_intel_svmlcc128,
+  kw_intel_svmlcc256,
+  kw_intel_svmlcc512,
   kw_cfguard_checkcc,
   kw_x86_stdcallcc,
   kw_x86_fastcallcc,
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index fd08310..f1afb8e 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -360,6 +360,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::X86_RegCall:   Out << "x86_regcallcc"; break;
   case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
   case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
+  case CallingConv::Intel_SVML128: Out << "intel_svmlcc128"; break;
+  case CallingConv::Intel_SVML256: Out << "intel_svmlcc256"; break;
+  case CallingConv::Intel_SVML512: Out << "intel_svmlcc512"; break;
   case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
   case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
   case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index c518ae8..c4bb0fb 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -2296,6 +2296,9 @@ void Verifier::visitFunction(const Function &F) {
   case CallingConv::Fast:
   case CallingConv::Cold:
   case CallingConv::Intel_OCL_BI:
+  case CallingConv::Intel_SVML128:
+  case CallingConv::Intel_SVML256:
+  case CallingConv::Intel_SVML512:
   case CallingConv::PTX_Kernel:
   case CallingConv::PTX_Device:
     Assert(!F.isVarArg(), "Calling convention does not support varargs or "
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 802e694..2b8855e 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -482,6 +482,21 @@ def RetCC_X86_64 : CallingConv<[
   CCDelegateTo<RetCC_X86_64_C>
 ]>;
 
+// Intel_SVML return-value convention.
+def RetCC_Intel_SVML : CallingConv<[
+  // Vector types are returned in XMM0,XMM1
+  CCIfType<[v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1]>>,
+
+  // 256-bit FP vectors
+  CCIfType<[v8f32, v4f64],
+            CCAssignToReg<[YMM0,YMM1]>>,
+
+  // 512-bit FP vectors
+  CCIfType<[v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1]>>
+]>;
+
 // This is the return-value convention used for the entire X86 backend.
 let Entry = 1 in
 def RetCC_X86 : CallingConv<[
@@ -489,6 +504,10 @@ def RetCC_X86 : CallingConv<[
   // Check if this is the Intel OpenCL built-ins calling convention
   CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
 
+  CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<RetCC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<RetCC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<RetCC_Intel_SVML>>,
+
   CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
   CCDelegateTo<RetCC_X86_32>
 ]>;
@@ -1006,6 +1025,30 @@ def CC_Intel_OCL_BI : CallingConv<[
   CCDelegateTo<CC_X86_32_C>
 ]>;
 
+// X86-64 Intel Short Vector Math Library calling convention.
+def CC_Intel_SVML : CallingConv<[
+
+  // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2]>>,
+
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2]>>,
+
+  // The 512-bit vector arguments are passed in ZMM registers.
+  CCIfType<[v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>
+]>;
+
+def CC_X86_32_Intr : CallingConv<[
+  CCAssignToStack<4, 4>
+]>;
+
+def CC_X86_64_Intr : CallingConv<[
+  CCAssignToStack<8, 8>
+]>;
+
 //===----------------------------------------------------------------------===//
 // X86 Root Argument Calling Conventions
 //===----------------------------------------------------------------------===//
@@ -1057,6 +1100,9 @@ def CC_X86_64 : CallingConv<[
 let Entry = 1 in
 def CC_X86 : CallingConv<[
   CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+  CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<CC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<CC_Intel_SVML>>,
+  CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<CC_Intel_SVML>>,
   CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
   CCDelegateTo<CC_X86_32>
 ]>;
@@ -1167,4 +1213,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
                                                (sequence "R%u", 12, 15))>;
 def CSR_SysV64_RegCall       : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,               
                                                (sequence "XMM%u", 8, 15))>;
-                                               
+
+// SVML calling convention
+def CSR_32_Intel_SVML        : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>;
+def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML,
+                                                K4, K5, K6, K7)>;
+
+def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>;
+
+def CSR_64_Intel_SVML       : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                               (sequence "XMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML    : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                               (sequence "XMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX        : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                    (sequence "YMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML_AVX     : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                    (sequence "YMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX512     : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                    (sequence "ZMM%u", 16, 31),
+                                                    K4, K5, K6, K7)>;
+def CSR_Win64_Intel_SVML_AVX512  : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+                                                    (sequence "ZMM%u", 6, 21),
+                                                    K4, K5, K6, K7)>;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 16719171..f78b0f7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3496,7 +3496,8 @@ void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
   // FIXME: Only some x86_32 calling conventions support AVX512.
   if (Subtarget.useAVX512Regs() &&
       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
-                     CallConv == CallingConv::Intel_OCL_BI)))
+                     CallConv == CallingConv::Intel_OCL_BI   ||
+                     CallConv == CallingConv::Intel_SVML512)))
     VecVT = MVT::v16f32;
   else if (Subtarget.hasAVX())
     VecVT = MVT::v8f32;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index f456728..48e3ae6 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -270,6 +270,42 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
+namespace {
+std::pair<const uint32_t *, const MCPhysReg *> getSVMLRegMaskAndSaveList(
+  bool Is64Bit, bool IsWin64, CallingConv::ID CC) {
+  assert(CC >= CallingConv::Intel_SVML128 && CC <= CallingConv::Intel_SVML512);
+  unsigned Abi = CC - CallingConv::Intel_SVML128; // 0 - 128, 1 - 256, 2 - 512
+
+  const std::pair<const uint32_t *, const MCPhysReg *> Abi64[] = {
+    std::make_pair(CSR_64_Intel_SVML_RegMask,        CSR_64_Intel_SVML_SaveList),
+    std::make_pair(CSR_64_Intel_SVML_AVX_RegMask,    CSR_64_Intel_SVML_AVX_SaveList),
+    std::make_pair(CSR_64_Intel_SVML_AVX512_RegMask, CSR_64_Intel_SVML_AVX512_SaveList),
+  };
+
+  const std::pair<const uint32_t *, const MCPhysReg *> AbiWin64[] = {
+    std::make_pair(CSR_Win64_Intel_SVML_RegMask,        CSR_Win64_Intel_SVML_SaveList),
+    std::make_pair(CSR_Win64_Intel_SVML_AVX_RegMask,    CSR_Win64_Intel_SVML_AVX_SaveList),
+    std::make_pair(CSR_Win64_Intel_SVML_AVX512_RegMask, CSR_Win64_Intel_SVML_AVX512_SaveList),
+  };
+
+  const std::pair<const uint32_t *, const MCPhysReg *> Abi32[] = {
+    std::make_pair(CSR_32_Intel_SVML_RegMask,        CSR_32_Intel_SVML_SaveList),
+    std::make_pair(CSR_32_Intel_SVML_RegMask,        CSR_32_Intel_SVML_SaveList),
+    std::make_pair(CSR_32_Intel_SVML_AVX512_RegMask, CSR_32_Intel_SVML_AVX512_SaveList),
+  };
+
+  if (Is64Bit) {
+    if (IsWin64) {
+      return AbiWin64[Abi];
+    } else {
+      return Abi64[Abi];
+    }
+  } else {
+    return Abi32[Abi];
+  }
+}
+}
+
 const MCPhysReg *
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   assert(MF && "MachineFunction required");
@@ -320,6 +356,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
       return CSR_64_Intel_OCL_BI_SaveList;
     break;
   }
+  case CallingConv::Intel_SVML128:
+  case CallingConv::Intel_SVML256:
+  case CallingConv::Intel_SVML512: {
+    return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).second;
+  }
   case CallingConv::HHVM:
     return CSR_64_HHVM_SaveList;
   case CallingConv::X86_RegCall:
@@ -438,6 +479,11 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
       return CSR_64_Intel_OCL_BI_RegMask;
     break;
   }
+  case CallingConv::Intel_SVML128:
+  case CallingConv::Intel_SVML256:
+  case CallingConv::Intel_SVML512: {
+    return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).first;
+  }
   case CallingConv::HHVM:
     return CSR_64_HHVM_RegMask;
   case CallingConv::X86_RegCall:
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de45d35..6acc768 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -876,6 +876,9 @@ public:
     case CallingConv::X86_ThisCall:
     case CallingConv::X86_VectorCall:
     case CallingConv::Intel_OCL_BI:
+    case CallingConv::Intel_SVML128:
+    case CallingConv::Intel_SVML256:
+    case CallingConv::Intel_SVML512:
       return isTargetWin64();
     // This convention allows using the Win64 convention on other targets.
     case CallingConv::Win64:
diff --git a/lib/Transforms/Utils/InjectTLIMappings.cpp b/lib/Transforms/Utils/InjectTLIMappings.cpp
index 9d8f59d..87cf764 100644
--- a/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -90,8 +90,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
   //  All VFs in the TLI are powers of 2.
   for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF;
        VF *= 2) {
-    const std::string TLIName =
-        std::string(TLI.getVectorizedFunction(ScalarName, VF));
+    const std::string TLIName = TLI.getVectorizedFunction(ScalarName, VF, CI.getFastMathFlags().isFast());
     if (!TLIName.empty()) {
       std::string MangledName = VFABI::mangleTLIVectorName(
           TLIName, ScalarName, CI.getNumArgOperands(), VF);
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35af8e4..d4305fe 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -672,6 +672,27 @@ protected:
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
+  /// Check legality of given SVML call instruction \p VecCall generated for
+  /// scalar call \p Call. If illegal then the appropriate legal instruction
+  /// is returned.
+  Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call);
+
+  /// Returns the legal VF for a call instruction \p CI using TTI information
+  /// and vector type.
+  unsigned getLegalVFForCall(CallInst *CI);
+
+  /// Partially vectorize a given call \p Call by breaking it down into multiple
+  /// calls of \p LegalCall, decided by the variant VF \p LegalVF.
+  Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall,
+                              unsigned LegalVF);
+
+  /// Generate shufflevector instruction for a vector value \p V based on the
+  /// current \p Part and a smaller VF \p LegalVF.
+  Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part);
+
+  /// Combine partially vectorized calls stored in \p CallResults.
+  Value *combinePartialVecCalls(SmallVectorImpl<Value *> &CallResults);
+
   /// The original loop.
   Loop *OrigLoop;
 
@@ -4362,6 +4383,17 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
   } // end of switch.
 }
 
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
+                                         const TargetLibraryInfo &TLI) {
+  Function *VectorF = CI.getCalledFunction();
+  FunctionType *FTy = VectorF->getFunctionType();
+  StringRef VFName = VectorF->getName();
+  auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
+  if (CC) {
+    CI.setCallingConv(*CC);
+  }
+}
+
 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                                                VPTransformState &State) {
   assert(!isa<DbgInfoIntrinsic>(I) &&
@@ -4426,8 +4458,24 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
       if (isa<FPMathOperator>(V))
         V->copyFastMathFlags(CI);
 
-      VectorLoopValueMap.setVectorValue(&I, Part, V);
-      addMetadata(V, &I);
+      const DataLayout &DL = V->getModule()->getDataLayout();
+      setVectorFunctionCallingConv(*V, DL, *TLI);
+
+      // Perform legalization of the SVML call instruction only if the
+      // original call was not an intrinsic.
+      if (!UseVectorIntrinsic &&
+          (V->getCalledFunction()->getName()).startswith("__svml")) {
+        // assert((V->getCalledFunction()->getName()).startswith("__svml"));
+        LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump());
+        auto *LegalV = cast<Instruction>(legalizeSVMLCall(V, CI));
+        LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: ";
+                   LegalV->dump());
+        VectorLoopValueMap.setVectorValue(&I, Part, LegalV);
+        addMetadata(LegalV, &I);
+      } else {
+        VectorLoopValueMap.setVectorValue(&I, Part, V);
+        addMetadata(V, &I);
+      }
   }
 }
 
@@ -4455,6 +4503,227 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
   }
 }
 
+//===----------------------------------------------------------------------===//
+// Implementation of functions for SVML vector call legalization.
+//===----------------------------------------------------------------------===//
+//
+// Unlike other VECLIBs, SVML needs to be used with target-legal
+// vector types. Otherwise, link failures and/or runtime failures
+// will occur. A motivating example could be -
+//
+//   double *a;
+//   float *b;
+//   #pragma clang loop vectorize_width(8)
+//   for(i = 0; i < N; ++i) {
+//     a[i] = sin(i);   // Legal SVML VF must be 4 or below on AVX
+//     b[i] = cosf(i);  // VF can be 8 on AVX since 8 floats can fit in YMM
+//    }
+//
+// Current implementation of vector code generation in LV is
+// driven based on a single VF (in InnerLoopVectorizer::VF). This
+// inhibits the flexibility of adjusting/choosing different VF
+// for different instructions.
+//
+// Due to this limitation it is much more straightforward to
+// first generate the illegal sin8 (svml_sin8 for SVML vector
+// library) call and then legalize it than trying to avoid
+// generating illegal code from the beginning.
+//
+// A solution for this problem is to check legality of the
+// call instruction right after generating it in vectorizer and
+// if it is illegal we split the call arguments and issue multiple
+// calls to match the legal VF. This is demonstrated currently for
+// the SVML vector library calls (non-intrinsic version only).
+//
+// Future directions and extensions:
+// 1) This legalization example shows us that a good direction
+//    for the VPlan framework would be to model the vector call
+//    instructions in a way that legal VF for each call is chosen
+//    correctly within vectorizer and illegal code generation is
+//    avoided.
+// 2) This logic can also be extended to general vector functions
+//    i.e. legalization of OpenMP declare simd functions. The
+//    requirements for this will be documented soon.
+
+Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall,
+                                             CallInst *Call) {
+  unsigned LegalVF = getLegalVFForCall(VecCall);
+
+  assert(LegalVF > 1 &&
+         "Legal VF for SVML call must be greater than 1 to vectorize");
+
+  if (LegalVF == VF)
+    return VecCall;
+  else if (LegalVF > VF)
+    // TODO: handle case when we are underfilling vectors
+    return VecCall;
+
+  // Legal VF for this SVML call is smaller than chosen VF, break it down into
+  // smaller call instructions
+
+  // Convert args, types and return type to match legal VF
+  SmallVector<Type *, 4> NewTys;
+  SmallVector<Value *, 4> NewArgs;
+
+  for (Value *ArgOperand : Call->arg_operands()) {
+    Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF);
+    NewTys.push_back(Ty);
+    NewArgs.push_back(UndefValue::get(Ty));
+  }
+
+  // Construct legal vector function
+  const VFShape Shape =
+    VFShape::get(*Call, {LegalVF, false} /*EC*/, false /*HasGlobalPred*/);
+  Function *LegalVectorF = VFDatabase(*Call).getVectorizedFunction(Shape);
+  assert(LegalVectorF != nullptr && "Can't create legal vector function.");
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump());
+
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  Call->getOperandBundlesAsDefs(OpBundles);
+  auto LegalV = std::unique_ptr<CallInst>(CallInst::Create(LegalVectorF, NewArgs, OpBundles));
+
+  if (isa<FPMathOperator>(LegalV))
+    LegalV->copyFastMathFlags(Call);
+
+  const DataLayout &DL = VecCall->getModule()->getDataLayout();
+  // Set SVML calling conventions
+  setVectorFunctionCallingConv(*LegalV, DL, *TLI);
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump());
+
+  Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV.get(), LegalVF);
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump());
+
+  // Remove the illegal call from Builder
+  VecCall->eraseFromParent();
+
+  return LegalizedCall;
+}
+
+unsigned InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) {
+  const DataLayout DL = CI->getModule()->getDataLayout();
+  FunctionType *CallFT = CI->getFunctionType();
+  // All functions that need legalization should have a vector return type.
+  // This is true for all SVML functions that are currently supported.
+  assert(isa<VectorType>(CallFT->getReturnType()) &&
+         "Return type of call that needs legalization is not a vector.");
+  auto *VecCallRetType = cast<VectorType>(CallFT->getReturnType());
+  Type *ElemType = VecCallRetType->getElementType();
+
+  unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType);
+  unsigned VectorBitWidth = TTI->getRegisterBitWidth(true);
+  unsigned LegalVF = VectorBitWidth / TypeBitWidth;
+
+  LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth
+                    << "\n");
+  LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n");
+
+  return LegalVF;
+}
+
+// Partial vectorization of a call instruction is achieved by making clones of
+// \p LegalCall and overwriting its argument operands with shufflevector
+// equivalent decided based on \p LegalVF and current Part being filled.
+Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call,
+                                                 CallInst *LegalCall,
+                                                 unsigned LegalVF) {
+  unsigned NumParts = VF / LegalVF;
+  LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n");
+  SmallVector<Value *, 8> CallResults;
+
+  for (unsigned Part = 0; Part < NumParts; ++Part) {
+    auto *ClonedCall = cast<CallInst>(LegalCall->clone());
+
+    // Update the arg operand of cloned call to shufflevector
+    for (unsigned i = 0, ie = Call->getNumArgOperands(); i != ie; ++i) {
+      auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part);
+      ClonedCall->setArgOperand(i, NewOp);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump());
+
+    auto *PartialVecCall = Builder.Insert(ClonedCall);
+    CallResults.push_back(PartialVecCall);
+  }
+
+  return combinePartialVecCalls(CallResults);
+}
+
+Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF,
+                                                 unsigned Part) {
+  // Example:
+  // Consider the following vector code -
+  // %1 = sitofp <4 x i32> %0 to <4 x double>
+  // %2 = call <4 x double> @__svml_sin4(<4 x double> %1)
+  //
+  // If the LegalVF is 2, we partially vectorize the sin4 call by invoking
+  // generateShuffleValue on the operand %1
+  // If Part = 0, the output value is -
+  // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32><i32 0, i32 1>
+  // and if Part = 1, the output is -
+  // %shuffle7 = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32><i32 2, i32 3>
+
+  assert(isa<VectorType>(V->getType()) &&
+         "Cannot generate shuffles for non-vector values.");
+  SmallVector<int, 4> ShuffleMask;
+  Value *Undef = UndefValue::get(V->getType());
+
+  unsigned ElemIdx = Part * LegalVF;
+
+  for (unsigned K = 0; K < LegalVF; K++)
+    ShuffleMask.push_back(static_cast<int>(ElemIdx + K));
+
+  auto *ShuffleInst =
+      Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle");
+
+  return ShuffleInst;
+}
+
+// Results of the calls executed by smaller legal call instructions must be
+// combined to match the original VF for later use. This is done by constructing
+// shufflevector instructions in a cumulative fashion.
+Value *InnerLoopVectorizer::combinePartialVecCalls(
+    SmallVectorImpl<Value *> &CallResults) {
+  assert(isa<VectorType>(CallResults[0]->getType()) &&
+         "Cannot combine calls with non-vector results.");
+  auto *CallType = cast<VectorType>(CallResults[0]->getType());
+
+  Value *CombinedShuffle;
+  unsigned NumElems = CallType->getNumElements() * 2;
+  unsigned NumRegs = CallResults.size();
+
+  assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) &&
+         "Number of partial vector calls to combine must be a power of 2 "
+         "(atleast 2^1)");
+
+  while (NumRegs > 1) {
+    for (unsigned I = 0; I < NumRegs; I += 2) {
+      SmallVector<int, 4> ShuffleMask;
+      for (unsigned J = 0; J < NumElems; J++)
+        ShuffleMask.push_back(static_cast<int>(J));
+
+      CombinedShuffle = Builder.CreateShuffleVector(
+          CallResults[I], CallResults[I + 1], ShuffleMask, "combined");
+      LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:";
+                 CombinedShuffle->dump());
+      CallResults.push_back(CombinedShuffle);
+    }
+
+    SmallVector<Value *, 2>::iterator Start = CallResults.begin();
+    SmallVector<Value *, 2>::iterator End = Start + NumRegs;
+    CallResults.erase(Start, End);
+
+    NumElems *= 2;
+    NumRegs /= 2;
+  }
+
+  return CombinedShuffle;
+}
+
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   // We should not collect Scalars more than once per VF. Right now, this
   // function is called from collectUniformsAndScalars(), which already does
diff --git a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
index 1e55e7d..5cf1f9d 100644
--- a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
+++ b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -39,7 +39,8 @@ for.end:                                          ; preds = %for.body
 declare double @__exp_finite(double) #0
 
 ; CHECK-LABEL: @exp_f64
-; CHECK: <4 x double> @__svml_exp4
+; CHECK: <2 x double> @__svml_exp2
+; CHECK: <2 x double> @__svml_exp2
 ; CHECK: ret
 define void @exp_f64(double* nocapture %varray) {
 entry:
@@ -99,7 +100,8 @@ for.end:                                          ; preds = %for.body
 declare double @__log_finite(double) #0
 
 ; CHECK-LABEL: @log_f64
-; CHECK: <4 x double> @__svml_log4
+; CHECK: <2 x double> @__svml_log2
+; CHECK: <2 x double> @__svml_log2
 ; CHECK: ret
 define void @log_f64(double* nocapture %varray) {
 entry:
@@ -159,7 +161,8 @@ for.end:                                          ; preds = %for.body
 declare double @__pow_finite(double, double) #0
 
 ; CHECK-LABEL: @pow_f64
-; CHECK: <4 x double> @__svml_pow4
+; CHECK: <2 x double> @__svml_pow2
+; CHECK: <2 x double> @__svml_pow2
 ; CHECK: ret
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
 entry:
@@ -190,7 +193,8 @@ declare float @__exp2f_finite(float) #0
 
 define void @exp2f_finite(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2f_finite(
-; CHECK:    call <4 x float> @__svml_exp2f4(<4 x float> %{{.*}})
+; CHECK:    call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
+; CHECK:    call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
 ; CHECK:    ret void
 ;
 entry:
@@ -219,7 +223,8 @@ declare double @__exp2_finite(double) #0
 
 define void @exp2_finite(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_finite(
-; CHECK:    call <4 x double> @__svml_exp24(<4 x double> {{.*}})
+; CHECK:    call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
+; CHECK:    call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
 ; CHECK:    ret void
 ;
 entry:
diff --git a/test/Transforms/LoopVectorize/X86/svml-calls.ll b/test/Transforms/LoopVectorize/X86/svml-calls.ll
index 1f2b71f..e84d9ae 100644
--- a/test/Transforms/LoopVectorize/X86/svml-calls.ll
+++ b/test/Transforms/LoopVectorize/X86/svml-calls.ll
@@ -35,7 +35,7 @@ declare float @llvm.exp2.f32(float) #0
 
 define void @sin_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -58,7 +58,7 @@ for.end:
 
 define void @sin_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -81,7 +81,7 @@ for.end:
 
 define void @sin_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -104,7 +104,7 @@ for.end:
 
 define void @sin_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -127,7 +127,7 @@ for.end:
 
 define void @cos_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -150,7 +150,7 @@ for.end:
 
 define void @cos_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -173,7 +173,7 @@ for.end:
 
 define void @cos_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -196,7 +196,7 @@ for.end:
 
 define void @cos_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -219,7 +219,7 @@ for.end:
 
 define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64(
-; CHECK:    [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -244,7 +244,7 @@ for.end:
 
 define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64_intrinsic(
-; CHECK:    [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -269,7 +269,7 @@ for.end:
 
 define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32(
-; CHECK:    [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -294,7 +294,7 @@ for.end:
 
 define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f32_intrinsic(
-; CHECK:    [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -319,7 +319,7 @@ for.end:
 
 define void @exp_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -342,7 +342,7 @@ for.end:
 
 define void @exp_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -365,7 +365,7 @@ for.end:
 
 define void @exp_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @exp_f64_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -388,7 +388,7 @@ for.end:
 
 define void @exp_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @exp_f32_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -411,7 +411,7 @@ for.end:
 
 define void @log_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -434,7 +434,7 @@ for.end:
 
 define void @log_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -457,7 +457,7 @@ for.end:
 
 define void @log_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @log_f64_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -480,7 +480,7 @@ for.end:
 
 define void @log_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @log_f32_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -503,7 +503,7 @@ for.end:
 
 define void @exp2_f64(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f64(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -526,7 +526,7 @@ for.end:
 
 define void @exp2_f32(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f32(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -549,7 +549,7 @@ for.end:
 
 define void @exp2_f64_intrinsic(double* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f64_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -572,7 +572,7 @@ for.end:
 
 define void @exp2_f32_intrinsic(float* nocapture %varray) {
 ; CHECK-LABEL: @exp2_f32_intrinsic(
-; CHECK:    [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK:    [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
 entry:
@@ -593,4 +593,44 @@ for.end:
   ret void
 }
 
+; CHECK-LABEL: @atan2_finite
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24
+; CHECK: ret
+
+declare double @__atan2_finite(double, double) local_unnamed_addr #0
+
+define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc7, %entry
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ]
+  %0 = trunc i64 %indvars.iv19 to i32
+  %conv = sitofp i32 %0 to double
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %1 = trunc i64 %indvars.iv.next to i32
+  %conv4 = sitofp i32 %1 to double
+  %call = tail call fast double @__atan2_finite(double %conv, double %conv4)
+  %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv
+  store double %call, double* %arrayidx6, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5
+
+for.inc7:                                         ; preds = %for.body3
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond21 = icmp eq i64 %indvars.iv.next20, 100
+  br i1 %exitcond21, label %for.end9, label %for.cond1.preheader
+
+for.end9:                                         ; preds = %for.inc7
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 8}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll b/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
new file mode 100644
index 0000000..6e4267c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
@@ -0,0 +1,513 @@
+; Check legalization of SVML calls, including intrinsic versions (like @llvm.<fn_name>.<type>).
+
+; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare double @sin(double) #0
+declare float @sinf(float) #0
+declare double @llvm.sin.f64(double) #0
+declare float @llvm.sin.f32(float) #0
+
+declare double @cos(double) #0
+declare float @cosf(float) #0
+declare double @llvm.cos.f64(double) #0
+declare float @llvm.cos.f32(float) #0
+
+declare double @pow(double, double) #0
+declare float @powf(float, float) #0
+declare double @llvm.pow.f64(double, double) #0
+declare float @llvm.pow.f32(float, float) #0
+
+declare double @exp(double) #0
+declare float @expf(float) #0
+declare double @llvm.exp.f64(double) #0
+declare float @llvm.exp.f32(float) #0
+
+declare double @log(double) #0
+declare float @logf(float) #0
+declare double @llvm.log.f64(double) #0
+declare float @llvm.log.f32(float) #0
+
+
+define void @sin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @sin(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @sin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @sinf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @sin_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.sin.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @sin_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.sin.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @cos(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @cosf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.cos.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @cos_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.cos.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
+; CHECK:    [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+  %tmp1 = load double, double* %arrayidx, align 4
+  %tmp2 = tail call double @pow(double %conv, double %tmp1)
+  %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %tmp2, double* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
+; CHECK:    [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+  %tmp1 = load double, double* %arrayidx, align 4
+  %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1)
+  %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %tmp2, double* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+  %tmp1 = load float, float* %arrayidx, align 4
+  %tmp2 = tail call float @powf(float %conv, float %tmp1)
+  %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %tmp2, float* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+  %tmp1 = load float, float* %arrayidx, align 4
+  %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1)
+  %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %tmp2, float* %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @expf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.exp.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.exp.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @log(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @logf(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK:    [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.log.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+  store double %call, double* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32_intrinsic(
+; CHECK:    [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.log.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+  store float %call, float* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
new file mode 100644
index 0000000..f9e9170
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
@@ -0,0 +1,61 @@
+; Check that vector codegen splits an illegal sin8 call into two sin4 calls on AVX for the double datatype.
+; The C code used to generate this test:
+
+; #include <math.h>
+;
+; void foo(double *a, int N){
+;   int i;
+; #pragma clang loop vectorize_width(8)
+;   for (i=0;i<N;i++){
+;     a[i] = sin(i);
+;   }
+; }
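+;
+; A command along these lines can regenerate the scalar IR below (a sketch;
+; the exact clang invocation is an assumption, not part of the original test):
+;   clang -O1 -fno-discard-value-names -emit-llvm -S foo.c -o foo.ll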
+
+; RUN: opt -O2 -vector-library=SVML -loop-vectorize -force-vector-width=8 -mattr=avx -S < %s | FileCheck %s
+
+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S1]])
+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S2]])
+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp5 = icmp sgt i32 %N, 0
+  br i1 %cmp5, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %0 to double
+  %call = tail call fast double @sin(double %conv) #2
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %call, double* %arrayidx, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare dso_local double @sin(double) local_unnamed_addr #1
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 8673a25..dcf2430 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -46,6 +46,7 @@ add_tablegen(llvm-tblgen LLVM
   SearchableTableEmitter.cpp
   SubtargetEmitter.cpp
   SubtargetFeatureInfo.cpp
+  SVMLEmitter.cpp
   TableGen.cpp
   Types.cpp
   X86DisassemblerTables.cpp
diff --git a/utils/TableGen/SVMLEmitter.cpp b/utils/TableGen/SVMLEmitter.cpp
new file mode 100644
index 0000000..8800ca8
--- /dev/null
+++ b/utils/TableGen/SVMLEmitter.cpp
@@ -0,0 +1,110 @@
+//===-- SVMLEmitter.cpp - Generate SVML function variants ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This TableGen backend emits the scalar-to-SVML function map used by TLI.
+//
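+// The emitted entries are wrapped in a GET_SVML_VARIANTS guard and are meant
+// to be consumed from C++ as initializers of a TargetLibraryInfo VecDesc
+// table, roughly as follows (a sketch; the generated .inc file name below is
+// an assumption, not taken from this patch):
+//
+//   static const VecDesc VecFuncs[] = {
+//   #define GET_SVML_VARIANTS
+//   #include "llvm/IR/SVML.inc"
+//   #undef GET_SVML_VARIANTS
+//   };
+//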
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenTarget.h"
+#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "SVMLVariants"
+#include "llvm/Support/Debug.h"
+
+namespace {
+
+class SVMLVariantsEmitter {
+
+  RecordKeeper &Records;
+
+private:
+  void emitSVMLVariants(raw_ostream &OS);
+
+public:
+  SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {}
+
+  void run(raw_ostream &OS);
+};
+} // End anonymous namespace
+
+/// Emit the set of SVML variant function names.
+///
+/// The default is to emit the high-accuracy SVML variants until a mechanism
+/// is introduced that lets users select other variants through precision
+/// requirements. The generated table maps scalar LLVM intrinsics, math
+/// library calls, and the finite variants of math library calls to their
+/// SVML equivalents.
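+///
+/// For example, assuming SVML.td defines a SvmlVariant record named "sin",
+/// the emitted entries would look roughly like this (a sketch of the emitted
+/// text, not verbatim output):
+///
+///   {"sinf", "__svml_sinf4", 4},          // scalar libm call, VL = 4..16
+///   {"llvm.sin.f32", "__svml_sinf4", 4},  // scalar intrinsic
+///   {"__sinf_finite", "__svml_sinf4", 4}, // finite libm variant
+///   {"sin", "__svml_sin2", 2},            // double precision, VL = 2..8
+///   {"llvm.sin.f64", "__svml_sin2", 2},
+///   {"__sin_finite", "__svml_sin2", 2},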
+void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) {
+
+  const unsigned MinSinglePrecVL = 4;
+  const unsigned MaxSinglePrecVL = 16;
+  const unsigned MinDoublePrecVL = 2;
+  const unsigned MaxDoublePrecVL = 8;
+
+  OS << "#ifdef GET_SVML_VARIANTS\n";
+
+  for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) {
+    StringRef SvmlVariantNameStr = D->getName();
+    // Single Precision SVML
+    for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) {
+      // Emit the scalar math library function to svml function entry.
+      OS << "{\"" << SvmlVariantNameStr << "f" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+
+      // Emit the scalar intrinsic to svml function entry.
+      OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+
+      // Emit the finite math library function to svml function entry.
+      OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+    }
+
+    // Double Precision SVML
+    for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) {
+      // Emit the scalar math library function to svml function entry.
+      OS << "{\"" << SvmlVariantNameStr << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL
+         << "},\n";
+
+      // Emit the scalar intrinsic to svml function entry.
+      OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL
+         << "},\n";
+
+      // Emit the finite math library function to svml function entry.
+      OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", "
+         << VL << "},\n";
+    }
+  }
+
+  OS << "#endif // GET_SVML_VARIANTS\n\n";
+}
+
+void SVMLVariantsEmitter::run(raw_ostream &OS) {
+  emitSVMLVariants(OS);
+}
+
+namespace llvm {
+
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) {
+  SVMLVariantsEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 8015a58..a61a7b3 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -57,6 +57,7 @@ enum ActionType {
   GenDirectivesEnumDecl,
   GenDirectivesEnumImpl,
   GenDirectivesEnumGen,
+  GenSVMLVariants,
 };
 
 namespace llvm {
@@ -137,7 +138,9 @@ cl::opt<ActionType> Action(
         clEnumValN(GenDirectivesEnumImpl, "gen-directive-impl",
                    "Generate directive related implementation code"),
         clEnumValN(GenDirectivesEnumGen, "gen-directive-gen",
-                   "Generate directive related implementation code part")));
+                   "Generate directive related implementation code part"),
+        clEnumValN(GenSVMLVariants, "gen-svml",
+                   "Generate SVML variant function names")));
 
 cl::OptionCategory PrintEnumsCat("Options for -print-enums");
 cl::opt<std::string> Class("class", cl::desc("Print Enum list for this class"),
@@ -271,6 +274,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenDirectivesEnumGen:
     EmitDirectivesGen(Records, OS);
     break;
+  case GenSVMLVariants:
+    EmitSVMLVariants(Records, OS);
+    break;
   }
 
   return false;
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 92204f3..c1bcadc 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -93,6 +93,7 @@ void EmitAutomata(RecordKeeper &RK, raw_ostream &OS);
 void EmitDirectivesDecl(RecordKeeper &RK, raw_ostream &OS);
 void EmitDirectivesImpl(RecordKeeper &RK, raw_ostream &OS);
 void EmitDirectivesGen(RecordKeeper &RK, raw_ostream &OS);
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
 
diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim
index ce36b76..da3a4a9 100644
--- a/utils/vim/syntax/llvm.vim
+++ b/utils/vim/syntax/llvm.vim
@@ -96,6 +96,7 @@ syn keyword llvmKeyword
       \ inreg
       \ inteldialect
       \ intel_ocl_bicc
+      \ intel_svmlcc128
+      \ intel_svmlcc256
+      \ intel_svmlcc512
       \ internal
       \ linkonce
       \ linkonce_odr
-- 
2.7.4

