git clone git@github.com:llvm-mirror/clang
cd clang
git remote add hcc https://github.com/RadeonOpenCompute/hcc-clang-upgrade
git fetch hcc
git reset --hard roc-hcc-2.7.0
git revert 0d3577fd8da3929e4b5729ba83d7bb925c8db0a5
git merge -X theirs origin/release_90

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 60937aa9db..4d6ee2fe8d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,9 +194,9 @@ Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
   endif()
 
   set( CLANG_BUILT_STANDALONE 1 )
-  set(BACKEND_PACKAGE_STRING "LLVM ${LLVM_PACKAGE_VERSION}")
+  set(BACKEND_PACKAGE_STRING "HCC ${HCC_VERSION_STRING} LLVM ${LLVM_PACKAGE_VERSION}")
 else()
-  set(BACKEND_PACKAGE_STRING "${PACKAGE_STRING}")
+  set(BACKEND_PACKAGE_STRING "HCC ${HCC_VERSION_STRING} ${LLVM_PACKAGE_VERSION}")
 endif()
 
 # Make sure that our source directory is on the current cmake module path so that
@@ -302,8 +302,7 @@ if (NOT DEFINED MATCHED_ARCH OR "${CMAKE_MATCH_1}" LESS 35)
     "Default architecture for OpenMP offloading to Nvidia GPUs." FORCE)
 endif()
 
-set(CLANG_VENDOR ${PACKAGE_VENDOR} CACHE STRING
-  "Vendor-specific text for showing with version information.")
+set(CLANG_VENDOR "HCC")
 
 if( CLANG_VENDOR )
   add_definitions( -DCLANG_VENDOR="${CLANG_VENDOR} " )
@@ -447,6 +446,12 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
 endif()
 
 add_definitions( -D_GNU_SOURCE )
+add_definitions(-DHCC_AMDGPU_TARGET="${AMDGPU_TARGET}")
+if (HCC_TOOLCHAIN_RHEL)
+ add_definitions(-DHCC_TOOLCHAIN_RHEL=true)
+else()
+ add_definitions(-DHCC_TOOLCHAIN_RHEL=false)
+endif()
 
 option(CLANG_BUILD_TOOLS
   "Build the Clang tools. If OFF, just generate build targets." ON)
diff --git a/README.md b/README.md
new file mode 100755
index 0000000000..78f306f17d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,38 @@
+ToT HCC Clang
+=============
+
+This repository hosts ToT HCC Clang, which is synchronized with upstream Clang.
+
+Branches
+========
+- master : holds production-ready code
+
+- upstream : holds commits from upstream clang
+  This branch always has the latest vanilla clang.
+
+- clang_tot_upgrade : holds HCC-specific code
+  Development is always conducted here.
+
+- release_YYWW : release branches for week WW of year YY
+  Release branches are created periodically to merge the latest commits from
+  the upstream and development branches. Once tested, a release is promoted to master.
+
+How to Build It
+===============
+This is how I build it now. The commands assume:
+- ROCm stack is already installed
+- ROCm-Device-Libs is built and installed at ~/hcc/ROCm-Device-Libs/build/dist
+- N is the number of threads available for make
+
+```bash
+git clone --recursive -b clang_tot_upgrade git@github.com:RadeonOpenCompute/hcc.git hcc_upstream
+mkdir build_upstream
+cd build_upstream
+cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DHSA_AMDGPU_GPU_TARGET=gfx803 \
+    -DROCM_DEVICE_LIB_DIR=~/hcc/ROCm-Device-Libs/build/dist/lib \
+    ../hcc_upstream
+make -jN world
+make -jN
+```
diff --git a/include/clang/AST/ASTContext.h b/include/clang/AST/ASTContext.h
index 1d1aaf4fb1..c73ed1fb4e 100644
--- a/include/clang/AST/ASTContext.h
+++ b/include/clang/AST/ASTContext.h
@@ -2529,13 +2529,9 @@ public:
   QualType getFloatingTypeOfSizeWithinDomain(QualType typeSize,
                                              QualType typeDomain) const;
 
-  unsigned getTargetAddressSpace(QualType T) const {
-    return getTargetAddressSpace(T.getQualifiers());
-  }
+  unsigned getTargetAddressSpace(QualType T) const;
 
-  unsigned getTargetAddressSpace(Qualifiers Q) const {
-    return getTargetAddressSpace(Q.getAddressSpace());
-  }
+  unsigned getTargetAddressSpace(Qualifiers Q) const;
 
   unsigned getTargetAddressSpace(LangAS AS) const;
 
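Moving the bodies out of the header lets lib/AST/ASTContext.cpp adjust the address-space mapping (e.g. for the HCC address spaces added below) without a header change. Assuming the relocated definitions keep the logic removed here, they would read:

```cpp
// Sketch of the out-of-line definitions in lib/AST/ASTContext.cpp; the HCC
// tree may insert address-space fix-ups before the final LangAS lookup.
unsigned ASTContext::getTargetAddressSpace(QualType T) const {
  return getTargetAddressSpace(T.getQualifiers());
}

unsigned ASTContext::getTargetAddressSpace(Qualifiers Q) const {
  return getTargetAddressSpace(Q.getAddressSpace());
}
```
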
diff --git a/include/clang/AST/DeclCXX.h b/include/clang/AST/DeclCXX.h
index 7add83f896..14c2f7a603 100644
--- a/include/clang/AST/DeclCXX.h
+++ b/include/clang/AST/DeclCXX.h
@@ -975,10 +975,22 @@ public:
   /// This value is used for lazy creation of default constructors.
   bool needsImplicitDefaultConstructor() const {
     return !data().UserDeclaredConstructor &&
-           !(data().DeclaredSpecialMembers & SMF_DefaultConstructor) &&
+           !(data().DeclaredSpecialMembers & SMF_DefaultConstructor)
+           // UPGRADE_TBD: workaround to avoid "no matching constructor" issue
+#if 1
+           ;
+#else
+           &&
+           // C++14 [expr.prim.lambda]p20:
+           //   The closure type associated with a lambda-expression has no
+           //   default constructor.
            (!isLambda() || lambdaIsDefaultConstructibleAndAssignable());
+#endif
   }
 
+  /// Returns the deserialization constructor for this class.
+  CXXMethodDecl *getCXXAMPDeserializationConstructor() const;
+
   /// Determine whether this class has any user-declared constructors.
   ///
   /// When true, a default constructor will not be implicitly declared.
diff --git a/include/clang/AST/Type.h b/include/clang/AST/Type.h
index 584655fe78..58df93777c 100644
--- a/include/clang/AST/Type.h
+++ b/include/clang/AST/Type.h
@@ -472,7 +472,8 @@ public:
     return A == B ||
            // Otherwise in OpenCLC v2.0 s6.5.5: every address space except
            // for __constant can be used as __generic.
-           (A == LangAS::opencl_generic && B != LangAS::opencl_constant);
+           (A == LangAS::opencl_generic && B != LangAS::opencl_constant) ||
+           (A == LangAS::Default && B == LangAS::hcc_tilestatic);
   }
 
   /// Returns true if the address space in these qualifiers is equal to or
@@ -2391,6 +2392,12 @@ public:
   CanQualType getCanonicalTypeUnqualified() const; // in CanonicalType.h
   void dump() const;
   void dump(llvm::raw_ostream &OS) const;
+
+  friend class ASTReader;
+  friend class ASTWriter;
+
+  /// \brief True if the object is of hc::array or Concurrency::array type
+  bool isGPUArrayType() const;
 };
 
 /// This will check for a TypedefType by removing any existing sugar
diff --git a/include/clang/Basic/AddressSpaces.h b/include/clang/Basic/AddressSpaces.h
index 2cc67474c1..96ddd63bce 100644
--- a/include/clang/Basic/AddressSpaces.h
+++ b/include/clang/Basic/AddressSpaces.h
@@ -42,6 +42,11 @@ enum class LangAS : unsigned {
   cuda_constant,
   cuda_shared,
 
+  // HCC specific address spaces.
+  hcc_tilestatic,
+  hcc_generic,
+  hcc_global,
+
   // This denotes the count of language-specific address spaces and also
   // the offset added to the target-specific address spaces, which are usually
   // specified by address space attributes __attribute__(address_space(n))).
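
The three new enumerators sit before `FirstTargetAddressSpace`, the offset added to `__attribute__((address_space(n)))` spaces, so the language-level and target-level ranges stay disjoint (the matching `FakeAddrSpaceMap` update appears in lib/AST/ASTContext.cpp below). A small sanity check, sketched with helpers that already live in this header:

```cpp
#include "clang/Basic/AddressSpaces.h"
#include <cassert>

void checkHCCAddressSpaces() {
  using clang::LangAS;
  // The HCC spaces are language-level, below the target offset...
  assert(!clang::isTargetAddressSpace(LangAS::hcc_tilestatic));
  // ...and target AS 0 maps to the first slot past all language spaces.
  assert(clang::getLangASFromTargetAS(0) == LangAS::FirstTargetAddressSpace);
}
```
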
diff --git a/include/clang/Basic/Attr.td b/include/clang/Basic/Attr.td
index d39b16e62b..d4abd21145 100644
--- a/include/clang/Basic/Attr.td
+++ b/include/clang/Basic/Attr.td
@@ -300,6 +300,7 @@ def COnly : LangOpt<"COnly", "!LangOpts.CPlusPlus">;
 def CPlusPlus : LangOpt<"CPlusPlus">;
 def OpenCL : LangOpt<"OpenCL">;
 def RenderScript : LangOpt<"RenderScript">;
+def CPlusPlusAMP : LangOpt<"CPlusPlusAMP">;
 def ObjC : LangOpt<"ObjC">;
 def BlocksSupported : LangOpt<"Blocks">;
 def ObjCAutoRefCount : LangOpt<"ObjCAutoRefCount">;
@@ -911,6 +912,18 @@ def Const : InheritableAttr {
   let Documentation = [Undocumented];
 }
 
+def HC_HC : InheritableAttr {
+  let Spellings = [CXX11<"","hc", 201511>];
+  let Subjects = SubjectList<[Var, Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
+def HC_CPU : InheritableAttr {
+  let Spellings = [CXX11<"","cpu", 201511>];
+  let Subjects = SubjectList<[Function], ErrorDiag>;
+  let Documentation = [Undocumented];
+}
+
 def Constructor : InheritableAttr {
   let Spellings = [GCC<"constructor">];
   let Args = [DefaultIntArgument<"Priority", 65535>];
@@ -1517,31 +1530,50 @@ def RISCVInterrupt : InheritableAttr, TargetSpecificAttr<TargetRISCV> {
 // this should be rejected on non-kernels.
 
 def AMDGPUFlatWorkGroupSize : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_flat_work_group_size", 0>];
-  let Args = [ExprArgument<"Min">, ExprArgument<"Max">];
+  let Spellings = [Clang<"amdgpu_flat_work_group_size",0>,
+                   CXX11<"","hc_flat_workgroup_size", 201511>];
+  let Args = [ExprArgument<"Min">,
+              ExprArgument<"Max", 1>];
   let Documentation = [AMDGPUFlatWorkGroupSizeDocs];
+  let TemplateDependent = 1;
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
 def AMDGPUWavesPerEU : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_waves_per_eu", 0>];
-  let Args = [ExprArgument<"Min">, ExprArgument<"Max", 1>];
+  let Spellings = [Clang<"amdgpu_waves_per_eu",0>,
+                   CXX11<"", "hc_waves_per_eu", 201511>];
+  let Args = [ExprArgument<"Min">,
+              ExprArgument<"Max", 1>];
   let Documentation = [AMDGPUWavesPerEUDocs];
+  let TemplateDependent = 1;
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
 def AMDGPUNumSGPR : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_num_sgpr", 0>];
-  let Args = [UnsignedArgument<"NumSGPR">];
+  let Spellings = [Clang<"amdgpu_num_sgpr">];
+  let Args = [ExprArgument<"NumSGPR">];
   let Documentation = [AMDGPUNumSGPRNumVGPRDocs];
+  let TemplateDependent = 1;
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
 def AMDGPUNumVGPR : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_num_vgpr", 0>];
-  let Args = [UnsignedArgument<"NumVGPR">];
+  let Spellings = [Clang<"amdgpu_num_vgpr">];
+  let Args = [ExprArgument<"NumVGPR">];
   let Documentation = [AMDGPUNumSGPRNumVGPRDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+  let TemplateDependent = 1;
+}
+
+def AMDGPUMaxWorkGroupDim : InheritableAttr {
+  let Spellings = [CXX11<"","hc_max_workgroup_dim", 201511>];
+  let Args = [ExprArgument<"X">,
+              ExprArgument<"Y">,
+              ExprArgument<"Z">,
+              StringArgument<"ISA", 1>];
+  let Documentation = [AMDGPUMaxWorkGroupDimDocs];
+  let TemplateDependent = 1;
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
 def WebAssemblyImportModule : InheritableAttr,
@@ -3225,6 +3258,30 @@ def InternalLinkage : InheritableAttr {
   let Documentation = [InternalLinkageDocs];
 }
 
+// C++AMP attributes
+
+def CXXAMPRestrictAMP : InheritableAttr {
+  let Spellings = [GNU<"amp">, GNU<"hc">];
+  let Documentation = [Undocumented];
+}
+
+def CXXAMPRestrictAUTO : InheritableAttr {
+  let Spellings = [GNU<"auto">];
+  let Documentation = [Undocumented];
+}
+
+def CXXAMPRestrictCPU : InheritableAttr {
+  let Spellings = [GNU<"cpu">];
+  let Documentation = [Undocumented];
+}
+
+def HCCTileStatic : InheritableAttr {
+  let Spellings = [GNU<"tile_static">];
+  let Subjects = SubjectList<[Var]>;
+  let LangOpts = [CPlusPlusAMP];
+  let Documentation = [Undocumented];
+}
+
 def ExcludeFromExplicitInstantiation : InheritableAttr {
   let Spellings = [Clang<"exclude_from_explicit_instantiation">];
   let Subjects = SubjectList<[Var, Function, CXXRecord]>;
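
Taken together, these spellings cover both dialects: `restrict(amp)`/`restrict(cpu)` plus `tile_static` for C++AMP, and `[[hc]]`/`[[cpu]]` for HC. A minimal HC kernel exercising `[[hc]]` and `tile_static`; a sketch assuming the usual hc.hpp runtime headers:

```cpp
#include <hc.hpp>

int main() {
  hc::array_view<int, 1> av(64);
  // The lambda carries [[hc]] (HC_HC above); tile_static storage is
  // group-local (HCCTileStatic above) and requires a tiled dispatch.
  hc::parallel_for_each(av.get_extent().tile(64),
                        [=](hc::tiled_index<1> tidx) [[hc]] {
    tile_static int lds[64];
    lds[tidx.local[0]] = tidx.global[0];
    tidx.barrier.wait();
    av[tidx.global] = lds[63 - tidx.local[0]];
  }).wait();
  return 0;
}
```
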
diff --git a/include/clang/Basic/AttrDocs.td b/include/clang/Basic/AttrDocs.td
index fac6116057..e6aef02596 100644
--- a/include/clang/Basic/AttrDocs.td
+++ b/include/clang/Basic/AttrDocs.td
@@ -1810,14 +1810,18 @@ specified when the kernel is dispatched. It is the product of the sizes of the
 x, y, and z dimension of the work-group.
 
 Clang supports the
-``__attribute__((amdgpu_flat_work_group_size(<min>, <max>)))`` attribute for the
+``__attribute__((amdgpu_flat_work_group_size(<min>[, <max>][, <ISA>])))`` and
+``[[hc_flat_workgroup_size(<min>[, <max>][, <ISA>])]]`` attributes for the
 AMDGPU target. This attribute may be attached to a kernel function definition
-and is an optimization hint.
+in OpenCL or a lambda function definition in HCC and is an optimization hint.
 
 ``<min>`` parameter specifies the minimum flat work-group size, and ``<max>``
 parameter specifies the maximum flat work-group size (must be greater than
-``<min>``) to which all dispatches of the kernel will conform. Passing ``0, 0``
-as ``<min>, <max>`` implies the default behavior (``128, 256``).
+``<min>``) to which all dispatches of the kernel will conform. ``<min>`` and
+``<max>`` must be 32-bit integer constants. A negative value is treated as the
+32-bit two's complement of a positive value. Passing ``0`` or ``0, 0`` as
+``<min>[, <max>]`` implies the default behavior (``128, 256``). If ``<max>``
+is unspecified, the platform's maximum allowed value is assumed.
 
 If specified, the AMDGPU target backend might be able to produce better machine
 code for barriers and perform scratch promotion by estimating available group
@@ -1843,16 +1847,21 @@ resources used by a single wavefront have to be limited. For example, the number
 of SGPRs and VGPRs. Limiting such resources can allow greater latency hiding,
 but can result in having to spill some register state to memory.
 
-Clang supports the ``__attribute__((amdgpu_waves_per_eu(<min>[, <max>])))``
-attribute for the AMDGPU target. This attribute may be attached to a kernel
-function definition and is an optimization hint.
+Clang supports the
+``__attribute__((amdgpu_waves_per_eu(<min>[, <max>][, <ISA>])))``
+and ``[[hc_waves_per_eu(<min>[, <max>][, <ISA>])]]``
+attributes for the AMDGPU target. These attributes may be attached to a kernel
+function definition in OpenCL or a lambda function definition in HCC and are an
+optimization hint.
 
 ``<min>`` parameter specifies the requested minimum number of waves per EU, and
 *optional* ``<max>`` parameter specifies the requested maximum number of waves
 per EU (must be greater than ``<min>`` if specified). If ``<max>`` is omitted,
 then there is no restriction on the maximum number of waves per EU other than
 the one dictated by the hardware for which the kernel is compiled. Passing
-``0, 0`` as ``<min>, <max>`` implies the default behavior (no limits).
+``0, 0`` as ``<min>, <max>`` implies the default behavior (no limits). ``<min>``
+and ``<max>`` must be 32-bit integer constants. A negative value is treated as
+the 32-bit two's complement of a positive value.
 
 If specified, this attribute allows an advanced developer to tune the number of
 wavefronts that are capable of fitting within the resources of an EU. The AMDGPU
@@ -1906,6 +1915,36 @@ An error will be given if:
   }];
 }
 
+def AMDGPUMaxWorkGroupDimDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+Clang supports the
+``[[hc_max_workgroup_dim(<x>, <y>, <z>[, <ISA>])]]`` attribute for the
+AMDGPU target. This attribute may be attached to a lambda function definition
+in HCC and is an optimization hint.
+
+``<x>``, ``<y>``, and ``<z>`` parameters specify the maximum work-group
+dimension in the x, y, and z directions. They must be 32-bit integer constants.
+0 indicates the platform's maximum allowed value. A negative value is treated
+as the 32-bit two's complement of a positive value.
+
+``<ISA>`` parameter specifies the target ISA version to which this attribute
+applies. It is a three-digit ISA version string prefixed by "gfx",
+e.g. "gfx810". If this parameter is not specified, the attribute is always
+applied. If this parameter is specified but does not match the ISA version of
+the target GPU, the attribute is not applied. Multiple attributes with
+different ``<ISA>`` parameter can be used for the same function.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code for array index calculations.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
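
Per the documentation above, the sizing hints attach directly to the kernel lambda. A sketch using the spellings registered in Attr.td; the values are illustrative only:

```cpp
#include <hc.hpp>

void scale(hc::array_view<float, 1> av) {
  hc::parallel_for_each(av.get_extent(),
                        [=](hc::index<1> idx)
                        [[hc]]
                        [[hc_flat_workgroup_size(64, 256)]]
                        [[hc_waves_per_eu(2, 4)]] {
    av[idx] *= 2.0f;  // trivial body; the attributes are the point here
  });
}
```
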
diff --git a/include/clang/Basic/CodeGenOptions.def b/include/clang/Basic/CodeGenOptions.def
index cd7a845487..aaa524047b 100644
--- a/include/clang/Basic/CodeGenOptions.def
+++ b/include/clang/Basic/CodeGenOptions.def
@@ -41,6 +41,8 @@ CODEGENOPT(ControlFlowGuard  , 1, 0) ///< -cfguard
 CODEGENOPT(CoverageExtraChecksum, 1, 0) ///< Whether we need a second checksum for functions in GCNO files.
 CODEGENOPT(CoverageNoFunctionNamesInData, 1, 0) ///< Do not include function names in GCDA files.
 CODEGENOPT(CoverageExitBlockBeforeBody, 1, 0) ///< Whether to emit the exit block before the body blocks in GCNO files.
+CODEGENOPT(AMPIsDevice       , 1, 0) ///< Set when compiling for C++AMP kernels.
+CODEGENOPT(AMPCPU            , 1, 0) ///< Set when compiling for C++AMP kernels on CPU.
 CODEGENOPT(CXAAtExit         , 1, 1) ///< Use __cxa_atexit for calling destructors.
 CODEGENOPT(RegisterGlobalDtorsWithAtExit, 1, 1) ///< Use atexit or __cxa_atexit to register global destructors.
 CODEGENOPT(CXXCtorDtorAliases, 1, 0) ///< Emit complete ctors/dtors as linker
diff --git a/include/clang/Basic/DiagnosticCommonKinds.td b/include/clang/Basic/DiagnosticCommonKinds.td
index ca2faf59d7..8e0ef71c1e 100644
--- a/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/include/clang/Basic/DiagnosticCommonKinds.td
@@ -299,6 +299,10 @@ def err_openclcxx_not_supported : Error<
 def err_omp_more_one_clause : Error<
   "directive '#pragma omp %0' cannot contain more than one '%1' clause%select{| with '%3' name modifier| with 'source' dependence}2">;
 
+// C++AMP
+def err_amp_ill_formed_functor : Error<
+  "Ill-formed functor class. Check restriction specifier of kernel.">;
+
 // Static Analyzer Core
 def err_unknown_analyzer_checker : Error<
     "no analyzer checkers or packages are associated with '%0'">;
diff --git a/include/clang/Basic/DiagnosticDriverKinds.td b/include/clang/Basic/DiagnosticDriverKinds.td
index 12f1a7f6c4..72d22da30b 100644
--- a/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/include/clang/Basic/DiagnosticDriverKinds.td
@@ -251,6 +251,22 @@ def err_drv_unsupported_embed_bitcode
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 
+def warn_amdgpu_agent_detector_failed : Warning<
+    "ROCm agent detector failed to execute correctly; using default "
+    "-amdgpu-target instead">, InGroup<Fallback>;
+def warn_amdgpu_target_deprecated : Warning<
+    "-amdgpu-target argument '%0' identifies an architecture for which support "
+    "is deprecated; please consider upgrading">, InGroup<Deprecated>;
+def warn_amdgpu_target_auto_nonsingular : Warning<
+    "using auto as argument to -amdgpu-target in conjunction with other "
+    "arguments is disallowed; ignoring auto and using explicit targets "
+    "instead">, InGroup<Fallback>;
+def err_amdgpu_no_agent_available : Error<
+    "ROCm agent detector could not identify any valid targets; please specify "
+    "the target explicitly by passing a valid value to -amdgpu-target">;
+def warn_amdgpu_target_invalid : Warning<"-amdgpu-target argument '%0' is not recognized; using gfx803 instead">, InGroup<InvalidCommandLineArgument>;
+def warn_drv_O0_ignored_for_GPU : Warning<"-O0 is ignored in GPU compilation path">, InGroup<InvalidCommandLineArgument>;
+def warn_drv_O1_ignored_for_GPU : Warning<"-O1 is ignored in GPU compilation path">, InGroup<InvalidCommandLineArgument>;
 def warn_O4_is_O3 : Warning<"-O4 is equivalent to -O3">, InGroup<Deprecated>;
 def warn_drv_optimization_value : Warning<"optimization level '%0' is not supported; using '%1%2' instead">,
   InGroup<InvalidCommandLineArgument>;
diff --git a/include/clang/Basic/DiagnosticParseKinds.td b/include/clang/Basic/DiagnosticParseKinds.td
index 8e6ced0dea..0886614670 100644
--- a/include/clang/Basic/DiagnosticParseKinds.td
+++ b/include/clang/Basic/DiagnosticParseKinds.td
@@ -1240,6 +1240,52 @@
 def err_pragma_cannot_end_force_cuda_host_device : Error<
   "force_cuda_host_device end pragma without matching "
   "force_cuda_host_device begin">;
+
+// C++AMP support.
+def err_expected_restrict : Error<
+  "expected 'restrict' specifier">;
+def err_expected_lparen_after_restriction : Error<
+  "expected '(' for restriction specifier">;
+def err_amp_unrecognized_restriction : Error<
+  "'%0' : unrecognized restriction specifier">;
+def err_amp_empty_restriction : Error<
+  "empty restriction sepcifier is not allowed">;
+def err_amp_expected_auto_restriction_on_definition : Error<
+  "'auto' restriction specifier is only allowed on function definition">;
+def err_amp_auto_restricted_function_has_other_declaration : Error<
+  "'%0':  expected no other declaration since it is auto restricted">;
+def note_auto_restricted_prev_declaration : Note<
+  "previous declaration is here">;
+def err_amp_no_throw : Error<
+  "exception specifier is not allowed in C++AMP context">;
+def err_tile_static_no_init : Error<
+  "tile_static variables can't be initialized">;
+def err_amp_illegal_function_parameter : Error<
+  "incompatible type for function parameter in AMP-restricted functions">;
+def err_amp_illegal_function_parameter_char : Error<
+  "char type can't be used as a function parameter type in AMP-restricted functions">;
+def err_amp_illegal_function_parameter_short : Error<
+  "short type can't be used as a function parameter type in AMP-restricted functions">;
+def err_amp_illegal_function_parameter_volatile : Error<
+  "volatile type qualifier can't be used as a function parameter type qualifier in AMP-restricted functions">;
+def err_amp_illegal_function_return_char : Error<
+  "char type can't be used as function return type in AMP-restricted functions">;
+def err_amp_illegal_function_return_short : Error<
+  "short type can't be used as function return type in AMP-restricted functions">;
+def err_amp_illegal_function_return_volatile : Error<
+  "volatile type qualifier can't be used as function return type qualifier in AMP-restricted functions">;
+def err_amp_illegal_keyword_dynamiccast : Error<
+  "dynamic_cast is prohibited in AMP-restricted functions">;
+def err_amp_illegal_keyword_typeid : Error<
+  "typeid is prohibited in AMP-restricted functions">;
+def err_amp_illegal_keyword_goto : Error<
+  "goto is prohibited in AMP-restricted functions">;
+def err_amp_illegal_keyword_throw : Error<
+  "throw is prohibited in AMP-restricted functions">;
+def err_amp_illegal_keyword_trycatch : Error<
+  "try/catch is prohibited in AMP-restricted functions">;
+def err_amp_illegal_keyword_asm : Error<
+  "asm is prohibited in AMP-restricted functions">;
 } // end of Parse Issue category.
 
 let CategoryName = "Modules Issue" in {
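
For reference, the kind of C++AMP source these parse diagnostics reject; a sketch, with `restrict(amp)` marking the device-side restriction:

```cpp
// Rejected: throw is prohibited in AMP-restricted functions
// (err_amp_illegal_keyword_throw).
int f(int x) restrict(amp) {
  if (x < 0)
    throw x;
  return x;
}

// Rejected: tile_static variables can't be initialized
// (err_tile_static_no_init).
void g() restrict(amp) {
  tile_static int lds = 1;
}
```
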
diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index 275c4e4365..5a79c901c9 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -2661,6 +2661,8 @@ def warn_attribute_address_multiple_identical_qualifiers : Warning<
   InGroup<DuplicateDeclSpecifier>;
 def err_attribute_address_function_type : Error<
   "function type may not be qualified with an address space">;
+def err_attribute_amdgpu_invalid_isa_version : Error<
+  "invalid AMD GPU ISA version parameter '%0'">;
 def err_as_qualified_auto_decl : Error<
   "automatic variable qualified with an%select{| invalid}0 address space">;
 def err_arg_with_address_space : Error<
@@ -7300,8 +7302,9 @@ def err_cuda_device_exceptions : Error<
 def err_dynamic_var_init : Error<
     "dynamic initialization is not supported for "
     "__device__, __constant__, and __shared__ variables.">;
-def err_shared_var_init : Error<
-    "initialization is not supported for __shared__ variables.">;
+def warn_shared_var_init : Warning<
+    "initialization is not supported for __shared__ variables.">,
+    InGroup<DiagGroup<"cuda-shared-init">>, DefaultError;
 def err_device_static_local_var : Error<
     "within a %select{__device__|__global__|__host__|__host__ __device__}0 "
     "function, only __shared__ variables or const variables without device "
@@ -8808,6 +8811,84 @@ def err_opencl_builtin_to_addr_arg_num : Error<
 def err_opencl_builtin_to_addr_invalid_arg : Error<
   "invalid argument %0 to function: %1, expecting a generic pointer argument">;
 
+// C++AMP support.
+def err_amp_data_member_offset_not_natural_alignment : Error<
+"data member offset not in natural alignment">;
+def err_amp_function_redefinition : Error<
+"'%0': redefinition; overlapping restriction specifiers">;
+def err_amp_using_static_or_global_variables : Error<
+"'%0': using global or static variables is unsupported in amp restricted code">;
+def err_amp_function_conversion : Error<
+"conversion to a function with incompatible restriction specifiers">;
+def err_amp_virtual_member_function : Error<
+"virtual member function is not supported">;
+def err_amp_arithmetic_operation_on_pointer_to_bool : Error<
+"'%0': cannot perform pointer arithmetic on pointer to bool in amp restricted code">;
+def err_amp_bad_reinterpret_cast_from_pointer_to_int : Error<
+"cast from pointer to %2 is unsupported in amp restricted code">;
+def err_amp_bad_reinterpret_cast_from_pointer_to_functionptr : Error<
+"pointer to member functions, function pointers, references to functions with 'amp' restriction specifier are not allowed">;
+def err_amp_memory_operation : Error<
+"operator is not supported in amp-compatible code">;
+def err_amp_unsupported_reference_or_pointer : Error<
+"pointer or reference is not allowed as pointed to type, array element type or data member type (except reference to concurrency::array/texture)">;
+def err_amp_incompatible : Error<
+"the field type is not amp-compatible">;
+def err_amp_type_unsupported : Error<
+"'%0': unsupported type in amp restricted code">;
+def err_amp_tile_static_on_function_return_result : Error<
+"tile_static can only be applied to a variable declaration">;
+def err_amp_destructor_overloading : Error<
+"destructors are not allowed to be overloaded">;
+def err_amp_ellipsis_param_on_function_declarator : Error<
+"ellipsis ... is not supported on an amp-restricted function declarator">;
+def err_amp_tile_static_unsupported_usage : Error<
+"tile_static cannot be used in a non-amp restricted scope">;
+def err_amp_using_nullptr_in_tile_static : Error<
+"'0' is not allowed on tile_static">;
+def err_amp_tile_static_pointer_or_reference : Error<
+"tile_static variables cannot contain pointer or reference types">;
+def err_amp_constant_out_of_supported_range : Error<
+"constant value is out of supported range in amp restricted code">;
+def err_amp_constant_too_big : Error<
+"constant too big">;
+def err_amp_float_overflow : Error<
+"magnitude of floating-point constant too large for type %0; maximum is %1">;
+def err_amp_unsupported_string_literals : Error<
+"'%0': unsupported usage of string literals in amp restricted code">;
+def err_amp_int_to_pointer_cast : Error<
+"cast to %1 from %0 is unsupported in amp restricted code">;
+def err_amp_virtual_base_class_unsupported : Error<
+"'%0' has a virtual base class, which is not supported">;
+def err_amp_call_from_cpu_to_amp : Error<
+"call from CPU-restricted function to AMP-restricted function">;
+def err_amp_call_from_amp_to_cpu : Error<
+"call from AMP-restricted function to CPU-restricted function">;
+def err_amp_call_from_both_amp_and_cpu_to_disctint : Error<
+"call from both amp and cpu restricted function to distinct restricted function">;
+def err_amp_need_4_byte_aligned : Error<
+"variables in AMP-restricted functions shall be 4-byte aligned">;
+def err_amp_captured_variable_type : Error<
+"'%0': variable captured by lambda has unsupported type in amp restricted code">;
+def err_amp_c_linkage_function_has_multiple_restrictions : Error<
+"'%0': multiple restriction specifiers are not supported on C linkage functions">;
+def err_amp_has_second_c_linkage_overloaded_function : Error<
+"'%0': second C linkage of overloaded function not allowed">;
+def err_amp_captured_array_type_by_value : Error<
+"%0 is not allowed to be captured by value if the lambda is amp restricted">;
+def err_amp_captured_by_reference_for_variables : Error<
+"'%0': by-reference capture or 'this' capture is unsupported if the lambda is amp restricted">;
+def err_amp_has_no_copy_constructor : Error<
+"'%0': no copy constructor available or copy constructor is declared 'explicit'">;
+def err_amp_has_no_copy_assign_or_move_assign : Error<
+"'%0' is unavailable in '%1'">;
+def err_amp_overloaded_member_function : Error<
+"'%0': no overloaded function has restriction specifiers that are compatible with the ambient context '%1'">;
+def err_amp_has_no_default_ctor : Error<
+"'%0': no appropriate default constructor available">;
+def err_amp_dtor_rest_cover_all_ctor : Error<
+"destructor's restriction specifiers must cover the union of restrictions on all constructors">;
+
 // OpenCL v2.0 s6.13.17 Enqueue kernel restrictions.
 def err_opencl_enqueue_kernel_incorrect_args : Error<
   "illegal call to enqueue_kernel, incorrect argument types">;
diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def
index 31aca2b0d6..0bee503cf1 100644
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -222,6 +222,13 @@ LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr function
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 
+LANGOPT(CPlusPlusAMP      , 1, 0, "C++AMP")
+LANGOPT(DevicePath        , 1, 0, "C++AMP Device Path")
+LANGOPT(AMPCPU            , 1, 0, "C++AMP CPU Path")
+LANGOPT(HSAExtension      , 1, 0, "C++AMP Extension for HSA")
+LANGOPT(AutoAuto          , 1, 0, "Enable auto-auto")
+LANGOPT(AutoCompileForAccelerator, 1, 0, "Enable auto-compile-for-accelerator")
+
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
 
 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
diff --git a/include/clang/Basic/LangOptions.h b/include/clang/Basic/LangOptions.h
index 8099eed28c..c176d0b7d8 100644
--- a/include/clang/Basic/LangOptions.h
+++ b/include/clang/Basic/LangOptions.h
@@ -296,7 +296,7 @@ public:
   }
 
   bool assumeFunctionsAreConvergent() const {
-    return (CUDA && CUDAIsDevice) || OpenCL;
+    return (CUDA && CUDAIsDevice) || (CPlusPlusAMP && DevicePath) || OpenCL;
   }
 
   /// Return the OpenCL C or C++ version as a VersionTuple.
diff --git a/include/clang/Basic/Specifiers.h b/include/clang/Basic/Specifiers.h
index d1236e798e..2ee3d7accf 100644
--- a/include/clang/Basic/Specifiers.h
+++ b/include/clang/Basic/Specifiers.h
@@ -354,6 +354,14 @@ namespace clang {
   };
 
   llvm::StringRef getParameterABISpelling(ParameterABI kind);
+
+  /// \brief C++AMP restriction specifiers
+  enum CPPAMPSpecifier {
+    CPPAMP_None = 0x0,
+    CPPAMP_CPU  = 0x1,
+    CPPAMP_AMP  = 0x2,
+    CPPAMP_AUTO = 0x4
+  };
 } // end namespace clang
 
 #endif // LLVM_CLANG_BASIC_SPECIFIERS_H
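
The enumerators are powers of two, so a declaration's combined restrictions form a bitmask; `restrict(cpu, amp)` becomes `CPPAMP_CPU | CPPAMP_AMP`. A small sketch; the helper name is hypothetical:

```cpp
#include "clang/Basic/Specifiers.h"

// Fold the parsed restriction keywords into a CPPAMPSpecifier mask.
unsigned combineRestrictions(bool CPU, bool AMP, bool AUTO) {
  unsigned R = clang::CPPAMP_None;
  if (CPU)
    R |= clang::CPPAMP_CPU;
  if (AMP)
    R |= clang::CPPAMP_AMP;
  if (AUTO)
    R |= clang::CPPAMP_AUTO;
  return R;
}
```
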
diff --git a/include/clang/Basic/TargetInfo.h b/include/clang/Basic/TargetInfo.h
index c6c966dfbe..3f10faa50f 100644
--- a/include/clang/Basic/TargetInfo.h
+++ b/include/clang/Basic/TargetInfo.h
@@ -362,6 +362,13 @@ public:
     return AddrSpace == 0 ? PointerAlign : getPointerAlignV(AddrSpace);
   }
 
+  /// Return the "preferred" width of pointers on this target, for the
+  /// specified address space.  This can be different from "getPointerWidth" in
+  /// cases where the final address space is not yet known.
+  virtual uint64_t getPreferredPointerWidth(unsigned AddrSpace) const {
+    return getPointerWidth(AddrSpace);
+  }
+
   /// Return the maximum width of pointers on this target.
   virtual uint64_t getMaxPointerWidth() const {
     return PointerWidth;
diff --git a/include/clang/Basic/Version.inc.in b/include/clang/Basic/Version.inc.in
index fd80af4b51..a4579b1897 100644
--- a/include/clang/Basic/Version.inc.in
+++ b/include/clang/Basic/Version.inc.in
@@ -3,3 +3,11 @@
 #define CLANG_VERSION_MAJOR @CLANG_VERSION_MAJOR@
 #define CLANG_VERSION_MINOR @CLANG_VERSION_MINOR@
 #define CLANG_VERSION_PATCHLEVEL @CLANG_VERSION_PATCHLEVEL@
+
+#define HCC_VERSION_STRING "@HCC_VERSION_STRING@"
+#define HCC_VERSION_MAJOR @HCC_VERSION_MAJOR@
+#define HCC_VERSION_MINOR @HCC_VERSION_MINOR@
+#define HCC_VERSION_PATCH "@HCC_VERSION_PATCH@-@KALMAR_SDK_COMMIT@-@KALMAR_FRONTEND_COMMIT@-@KALMAR_BACKEND_COMMIT@"
+#define HCC_VERSION_WORKWEEK @HCC_VERSION_PATCH@
+
+#define KALMAR_BACKEND @KALMAR_BACKEND@
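
After CMake substitution these become ordinary macros, visible wherever clang/Basic/Version.h is included, which is how the "HCC ... LLVM ..." package strings set up in CMakeLists.txt get their values. A sketch of a version banner built from them; `printHCCVersion` is illustrative:

```cpp
#include "clang/Basic/Version.h" // pulls in the generated Version.inc
#include <cstdio>

void printHCCVersion() {
  std::printf("HCC %s (based on Clang %d.%d.%d)\n", HCC_VERSION_STRING,
              CLANG_VERSION_MAJOR, CLANG_VERSION_MINOR,
              CLANG_VERSION_PATCHLEVEL);
}
```
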
diff --git a/include/clang/CodeGen/BackendUtil.h b/include/clang/CodeGen/BackendUtil.h
index 01b1f5bbd6..5b9f61c104 100644
--- a/include/clang/CodeGen/BackendUtil.h
+++ b/include/clang/CodeGen/BackendUtil.h
@@ -41,11 +41,20 @@ namespace clang {
                          const TargetOptions &TOpts, const LangOptions &LOpts,
                          const llvm::DataLayout &TDesc, llvm::Module *M,
                          BackendAction Action,
-                         std::unique_ptr<raw_pwrite_stream> OS);
+                         std::unique_ptr<raw_pwrite_stream> OS,
+                         bool SetLLVMOpts = true);
 
   void EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
                     llvm::MemoryBufferRef Buf);
 
+  void PerformPrelinkPasses(DiagnosticsEngine &Diags,
+                            const HeaderSearchOptions &HeaderSearchOpts,
+                            const CodeGenOptions &CGOpts,
+                            const TargetOptions &TOpts,
+                            const LangOptions &LOpts,
+                            const llvm::DataLayout &TDesc, llvm::Module *M,
+                            BackendAction Action);
+
   llvm::Expected<llvm::BitcodeModule>
   FindThinLTOModule(llvm::MemoryBufferRef MBRef);
   llvm::BitcodeModule *
diff --git a/include/clang/CodeGen/CodeGenABITypes.h b/include/clang/CodeGen/CodeGenABITypes.h
index 31f0cea572..48dae23355 100644
--- a/include/clang/CodeGen/CodeGenABITypes.h
+++ b/include/clang/CodeGen/CodeGenABITypes.h
@@ -55,7 +55,8 @@ const CGFunctionInfo &arrangeObjCMessageSendSignature(CodeGenModule &CGM,
                                                       QualType receiverType);
 
 const CGFunctionInfo &arrangeFreeFunctionType(CodeGenModule &CGM,
-                                              CanQual<FunctionProtoType> Ty);
+                                              CanQual<FunctionProtoType> Ty,
+                                              const FunctionDecl *FD);
 
 const CGFunctionInfo &arrangeFreeFunctionType(CodeGenModule &CGM,
                                               CanQual<FunctionNoProtoType> Ty);
diff --git a/include/clang/Driver/Action.h b/include/clang/Driver/Action.h
index c1ff0b1a60..d1ae19b023 100644
--- a/include/clang/Driver/Action.h
+++ b/include/clang/Driver/Action.h
@@ -89,6 +89,7 @@ public:
     OFK_Cuda = 0x02,
     OFK_OpenMP = 0x04,
     OFK_HIP = 0x08,
+    OFK_HCC = 0x10,
   };
 
   static const char *getClassName(ActionClass AC);
@@ -207,6 +208,13 @@ public:
   bool isOffloading(OffloadKind OKind) const {
     return isHostOffloading(OKind) || isDeviceOffloading(OKind);
   }
+
+  /// Check whether the given input tree starts with or contains an action of
+  /// the kind kind and actions of the type typesID.
+  bool ContainsActions(ActionClass kind,
+                       types::ID typesID,
+                       bool singleInputActionsOnly = true,
+                       bool startsWithActionKind = true) const;
 };
 
 class InputAction : public Action {
diff --git a/include/clang/Driver/CC1Options.td b/include/clang/Driver/CC1Options.td
index 1f6c000ecf..62530c39f4 100644
--- a/include/clang/Driver/CC1Options.td
+++ b/include/clang/Driver/CC1Options.td
@@ -872,6 +872,28 @@ def fopenmp_is_device : Flag<["-"], "fopenmp-is-device">,
   HelpText<"Generate code only for an OpenMP target device.">;
 def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
   HelpText<"Path to the IR file produced by the frontend for the host.">;
+  
+//===----------------------------------------------------------------------===//
+// C++AMP Options
+//===----------------------------------------------------------------------===//
+
+def famp_is_device : Flag<["-"], "famp-is-device">,
+  HelpText<"Generate code for AMP kernels">;
+
+def famp_cpu : Flag<["-"], "famp-cpu">,
+  HelpText<"Generate code for AMP CPU kernels">;
+
+def fhsa_extension : Flag<["-"], "fhsa-ext">,
+  HelpText<"Enable HSA-specific rules for C++AMP kernels">;
+
+def fno_auto_auto : Flag<["-"], "fno-auto-auto">,
+  HelpText<"Disable auto-auto feature (Obsolete, use -fauto-auto to explicit enable it instead)">;
+
+def fauto_auto : Flag<["-"], "fauto-auto">,
+  HelpText<"Enable auto-auto feature">;
+
+def fauto_compile_for_accelerator : Flag<["-"], "fauto-compile-for-accelerator">,
+  HelpText<"Enable auto-compile-for-accelerator feature">;
 
 //===----------------------------------------------------------------------===//
 // SYCL Options
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td
index 4ea8bfff09..21d1361210 100644
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -186,6 +186,8 @@ def gfortran_Group : OptionGroup<"<gfortran group>">,
 Flags that will be passed onto the ``gfortran`` compiler when Clang is given
 a Fortran input.}]>;
 
+def cxxamp_Group          : OptionGroup<"<C++AMP group>">;
+
 def Link_Group : OptionGroup<"<T/e/s/t/u group>">, DocName<"Linker flags">,
                  DocBrief<[{Flags that are passed on to the linker}]>;
 def T_Group : OptionGroup<"<T group>">, Group<Link_Group>, DocFlatten;
@@ -497,6 +499,20 @@ def bind__at__load : Flag<["-"], "bind_at_load">;
 def bundle__loader : Separate<["-"], "bundle_loader">;
 def bundle : Flag<["-"], "bundle">;
 def b : JoinedOrSeparate<["-"], "b">, Flags<[Unsupported]>;
+def hc_mode : Flag<["-"], "hc">, Flags<[DriverOption]>,
+  HelpText<"Enable Heterogeneous C++ (HC) mode">;
+def hc_function_calls : Flag<["-"], "hc-function-calls">, Flags<[DriverOption]>,
+  HelpText<"Enable support for direct function calls in [[hc]] code">;
+def cxxamp_kernel_mode : Flag<["-"], "gpu">, Flags<[DriverOption]>, Group<cxxamp_Group>,
+  HelpText<"C++AMP only. This option allows the compiler to emit kernel-specific artifacts">;
+def cxxamp_cpu_mode : Flag<["-"], "cpu">, Flags<[DriverOption]>, Group<cxxamp_Group>,
+  HelpText<"C++AMP only. This option allows the compiler to emit CPU kernel-specific artifacts">;
+def amdgpu_target_EQ : Joined<["-", "--"], "amdgpu-target=">, Flags<[DriverOption]>,
+  HelpText<"Specify the AMDGPU ISA version (e.g. gfx803). Can be specified only once.">;
+def hcc_path_EQ : Joined<["--"], "hcc-path=">, Group<i_Group>,
+  HelpText<"HCC installation path">;
+def hcc_extra_libs_EQ : Joined<["-", "--"], "hcc-extra-libs=">, Flags<[DriverOption]>,
+  HelpText<"Specify extra HCC libraries to be linked in.">;
 def cfguard : Flag<["-"], "cfguard">, Flags<[CC1Option]>,
   HelpText<"Emit tables required for Windows Control Flow Guard.">;
 def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
@@ -1560,6 +1576,8 @@ def fno_objc_nonfragile_abi : Flag<["-"], "fno-objc-nonfragile-abi">, Group<f_Gr
 
 def fobjc_sender_dependent_dispatch : Flag<["-"], "fobjc-sender-dependent-dispatch">, Group<f_Group>;
 def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group<f_Group>;
+def famp : Flag<["-"], "famp">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
+  HelpText<"Enable C++AMP extensions.">;
 def fopenmp : Flag<["-"], "fopenmp">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>,
   HelpText<"Parse OpenMP pragmas and generate parallel code.">;
 def fno_openmp : Flag<["-"], "fno-openmp">, Group<f_Group>, Flags<[NoArgumentUnused]>;
diff --git a/include/clang/Driver/ToolChain.h b/include/clang/Driver/ToolChain.h
index 7dd3db376c..b7699f7176 100644
--- a/include/clang/Driver/ToolChain.h
+++ b/include/clang/Driver/ToolChain.h
@@ -49,12 +49,13 @@
 class ObjCRuntime;
 
 namespace driver {
 
+class HCCInstallationDetector;
 class Driver;
 class InputInfo;
 class SanitizerArgs;
 class Tool;
 class XRayArgs;
 
 /// Helper structure used to pass information extracted from clang executable
 /// name such as `i686-linux-android-g++`.
@@ -576,6 +576,9 @@ public:
   virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args) const;
 
+  virtual void AddHCCIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                                 llvm::opt::ArgStringList &CC1Args) const;
+
   /// Add arguments to use MCU GCC toolchain includes.
   virtual void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args) const;
diff --git a/include/clang/Driver/Types.def b/include/clang/Driver/Types.def
index b45789d4b3..c64c7162fc 100644
--- a/include/clang/Driver/Types.def
+++ b/include/clang/Driver/Types.def
@@ -52,6 +52,13 @@ TYPE("objective-c-cpp-output",   PP_ObjC,      INVALID,         "mi",    "u")
 TYPE("objc-cpp-output",          PP_ObjC_Alias, INVALID,        "mi",    "u")
 TYPE("objective-c",              ObjC,         PP_ObjC,         "m",     "u")
 TYPE("c++-cpp-output",           PP_CXX,       INVALID,         "ii",    "u")
+TYPE("c++amp-kernel-cpu-output", PP_CXX_AMP_CPU, INVALID,       "ii",    "u")
+TYPE("c++amp-kernel-cpu",        CXX_AMP_CPU,  PP_CXX_AMP_CPU,  "ii",    "u")
+TYPE("c++amp-kernel-cpp-output", PP_CXX_AMP,   INVALID,         "ii",    "u")
+TYPE("c++amp-kernel",            CXX_AMP,      PP_CXX_AMP,      "cpp",   "u")
+TYPE("hc-host-cpp-output",       PP_HC_HOST,   INVALID,         "ii",    "u")
+TYPE("hc-kernel",                HC_KERNEL,    PP_CXX_AMP,      "cpp",   "u")
+TYPE("hc-host",                  HC_HOST,      PP_HC_HOST,      "cpp",   "u")
 TYPE("c++",                      CXX,          PP_CXX,          "cpp",   "u")
 TYPE("objective-c++-cpp-output", PP_ObjCXX,    INVALID,         "mii",   "u")
 TYPE("objc++-cpp-output",        PP_ObjCXX_Alias, INVALID,      "mii",   "u")
diff --git a/include/clang/Driver/Types.h b/include/clang/Driver/Types.h
index 53afada7ab..4ded429a6e 100644
--- a/include/clang/Driver/Types.h
+++ b/include/clang/Driver/Types.h
@@ -79,6 +79,9 @@ namespace types {
   /// isHIP - Is this a HIP input.
   bool isHIP(ID Id);
 
+  /// isHCC - Is this a HCC input.
+  bool isHCC(ID Id);
+
   /// isObjC - Is this an "ObjC" input (Obj-C and Obj-C++ sources and headers).
   bool isObjC(ID Id);
 
diff --git a/include/clang/Frontend/FrontendOptions.h b/include/clang/Frontend/FrontendOptions.h
index a0acb1f066..8527cb81d5 100644
--- a/include/clang/Frontend/FrontendOptions.h
+++ b/include/clang/Frontend/FrontendOptions.h
@@ -163,6 +163,7 @@ public:
    ///@{ Languages that the frontend can parse and compile.
    C,
    CXX,
+    CXXAMP,
    ObjC,
    ObjCXX,
    OpenCL,
diff --git a/include/clang/Frontend/LangStandard.h b/include/clang/Frontend/LangStandard.h
index 244f14c793..0ada0ab97e 100644
--- a/include/clang/Frontend/LangStandard.h
+++ b/include/clang/Frontend/LangStandard.h
@@ -32,7 +32,8 @@ enum LangFeatures {
   GNUMode = (1 << 11),
   HexFloat = (1 << 12),
   ImplicitInt = (1 << 13),
-  OpenCL = (1 << 14)
+  OpenCL = (1 << 14),
+  CPlusPlusAMP = (1 << 15)
 };
 
 }
@@ -108,6 +109,9 @@ public:
   /// isOpenCL - Language is a OpenCL variant.
   bool isOpenCL() const { return Flags & frontend::OpenCL; }
 
+  /// isCPlusPlusAMP - Language is a C++AMP standard.
+  bool isCPlusPlusAMP() const { return Flags & frontend::CPlusPlusAMP; }
+
   static const LangStandard &getLangStandardForKind(Kind K);
   static const LangStandard *getLangStandardForName(StringRef Name);
 };
diff --git a/include/clang/Frontend/LangStandards.def b/include/clang/Frontend/LangStandards.def
index fef7d4dd9a..5fde3e2f0c 100644
--- a/include/clang/Frontend/LangStandards.def
+++ b/include/clang/Frontend/LangStandards.def
@@ -184,6 +184,11 @@ LANGSTANDARD(cuda, "cuda", CUDA, "NVIDIA CUDA(tm)",
 LANGSTANDARD(hip, "hip", HIP, "HIP",
              LineComment | CPlusPlus | Digraphs)
 
+// C++AMP
+LANGSTANDARD(cxxamp, "c++amp", CXXAMP,
+             "ECMA C++AMP Standard",
+             LineComment | CPlusPlus | CPlusPlus11 | CPlusPlusAMP | Digraphs)
+
 #undef LANGSTANDARD
 #undef LANGSTANDARD_ALIAS
 #undef LANGSTANDARD_ALIAS_DEPR
diff --git a/include/clang/Parse/Parser.h b/include/clang/Parse/Parser.h
index 7c67c35f61..da7249ecbe 100644
--- a/include/clang/Parse/Parser.h
+++ b/include/clang/Parse/Parser.h
@@ -749,6 +749,10 @@ private:
     return PP.LookAhead(N-1);
   }
 
+  /// C++ AMP-specific
+  /// check if the given scope is AMP-restricted
+  bool IsInAMPFunction(Scope *);
+
 public:
   /// NextToken - This peeks ahead one token and returns it without
   /// consuming it.
@@ -1444,6 +1448,9 @@ private:
                             bool StopAtSemi = true,
                             bool ConsumeFinalToken = true);
 
+  // C++AMP
+  bool CXXAMPFindRestrictionSeq(CachedTokens &Toks, bool ConsumeFinalToken);
+
   //===--------------------------------------------------------------------===//
   // C99 6.9: External Definitions.
   struct ParsedAttributesWithRange : ParsedAttributes {
@@ -1771,10 +1778,13 @@ private:
   // [...] () -> type {...}
   ExprResult ParseLambdaExpression();
   ExprResult TryParseLambdaExpression();
+  bool TryParseLambdaIntroducer(LambdaIntroducer &Intro, ParsedAttributes &AttrIntro);
+
   bool
   ParseLambdaIntroducer(LambdaIntroducer &Intro,
+                        ParsedAttributes &AttrIntro,
                         LambdaIntroducerTentativeParse *Tentative = nullptr);
-  ExprResult ParseLambdaExpressionAfterIntroducer(LambdaIntroducer &Intro);
+  ExprResult ParseLambdaExpressionAfterIntroducer(LambdaIntroducer &Intro, ParsedAttributes &AttrIntro);
 
   //===--------------------------------------------------------------------===//
   // C++ 5.2p1: C++ Casts
@@ -2538,6 +2548,7 @@ private:
   SourceLocation SkipExtendedMicrosoftTypeAttributes();
   void ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs);
   void ParseBorlandTypeAttributes(ParsedAttributes &attrs);
+  void ParseHCCQualifiers(ParsedAttributes &Attrs);
   void ParseOpenCLKernelAttributes(ParsedAttributes &attrs);
   void ParseOpenCLQualifiers(ParsedAttributes &Attrs);
   /// Parses opencl_unroll_hint attribute if language is OpenCL v2.0
@@ -2696,6 +2707,11 @@ private:
   void ParseBracketDeclarator(Declarator &D);
   void ParseMisplacedBracketDeclarator(Declarator &D);
 
+  // C++AMP
+  unsigned ParseRestrictionSpecification(Declarator &D,
+                                         ParsedAttributes &Attrs,
+                                         SourceLocation &DeclEndLoc);
+
   //===--------------------------------------------------------------------===//
   // C++ 7: Declarations [dcl.dcl]
 
diff --git a/include/clang/Sema/Overload.h b/include/clang/Sema/Overload.h
index 96aadeac2b..3ec792b1b3 100644
--- a/include/clang/Sema/Overload.h
+++ b/include/clang/Sema/Overload.h
@@ -25,6 +25,7 @@
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Sema/SemaFixItUtils.h"
 #include "clang/Sema/TemplateDeduction.h"
+#include "clang/Sema/Scope.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/STLExtras.h"
@@ -972,7 +973,9 @@ class Sema;
 
     /// Find the best viable function on this overload set, if it exists.
     OverloadingResult BestViableFunction(Sema &S, SourceLocation Loc,
-                                         OverloadCandidateSet::iterator& Best);
+                                         OverloadCandidateSet::iterator& Best,
+                                         bool UserDefinedConversion = false,
+                                         Scope* SC = 0);
 
     SmallVector<OverloadCandidate *, 32> CompleteCandidates(
         Sema &S, OverloadCandidateDisplayKind OCD, ArrayRef<Expr *> Args,
@@ -1008,7 +1011,9 @@ class Sema;
                                  const OverloadCandidate &Cand1,
                                  const OverloadCandidate &Cand2,
                                  SourceLocation Loc,
-                                 OverloadCandidateSet::CandidateSetKind Kind);
+                                 OverloadCandidateSet::CandidateSetKind Kind,
+                                 bool UserDefinedConversion = false,
+                                 Scope* SC = 0);
 
   struct ConstructorInfo {
     DeclAccessPair FoundDecl;
diff --git a/include/clang/Sema/Scope.h b/include/clang/Sema/Scope.h
index 7848df8f70..fa523c88d6 100644
--- a/include/clang/Sema/Scope.h
+++ b/include/clang/Sema/Scope.h
@@ -204,6 +204,9 @@ private:
   /// Used to determine if errors occurred in this scope.
   DiagnosticErrorTrap ErrorTrap;
 
+  /// C++AMP restriction specifier
+  unsigned short CXXAMPSpecifier;
+
   /// A lattice consisting of undefined, a single NRVO candidate variable in
   /// this scope, or over-defined. The bit is true when over-defined.
   llvm::PointerIntPair<VarDecl *, 1, bool> NRVO;
@@ -385,6 +388,26 @@ public:
     return getFlags() & Scope::FunctionPrototypeScope;
   }
 
+  /// \brief C++AMP restriction specifiers
+  enum CPPAMPSpecifier {
+    CPPAMP_None = 0x0,
+    CPPAMP_CPU  = 0x1,
+    CPPAMP_AMP  = 0x2,
+    CPPAMP_AUTO = 0x4
+  };
+  void setCXXAMPSpecifier(unsigned A) { CXXAMPSpecifier = A; }
+  void setAMPScope() { CXXAMPSpecifier |= CPPAMP_AMP; }
+  void setCPUScope() { CXXAMPSpecifier |= CPPAMP_CPU; }
+  bool isAMPScope() const {
+    return CXXAMPSpecifier & CPPAMP_AMP;
+  }
+  bool isCPUScope() const {
+    return CXXAMPSpecifier & CPPAMP_CPU;
+  }
+  bool isAUTOScope() const {
+    return CXXAMPSpecifier & CPPAMP_AUTO;
+  }
+
   /// isAtCatchScope - Return true if this scope is \@catch.
   bool isAtCatchScope() const {
     return getFlags() & Scope::AtCatchScope;
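
With these bits in place, the parser can record the restrictions it just parsed on the scope it is about to enter, and later Sema checks query `isAMPScope()`/`isCPUScope()`. A sketch; the free function is hypothetical:

```cpp
#include "clang/Sema/Scope.h"

// Tag a freshly pushed function scope with the parsed restriction set.
void tagRestrictionScope(clang::Scope *S, bool CPU, bool AMP) {
  S->setCXXAMPSpecifier(clang::Scope::CPPAMP_None);
  if (CPU)
    S->setCPUScope();
  if (AMP)
    S->setAMPScope();
}
```
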
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index e6c63fd9c0..0ca29ddcf6 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -1966,6 +1966,11 @@ public:
                                   bool IsAddressOfOperand,
                                   CorrectionCandidateCallback *CCC = nullptr);
 
+  // C++AMP declarator diagnostic functions
+  bool DiagnoseCXXAMPDecl(Decl* Dcl, bool CheckContainer = false, bool IsInfer = false);
+  bool IsCXXAMPTileStatic(Declarator &D);
+  void DiagnosticCXXAMPTileStatic(Declarator &D, Decl *Dcl);
+
   /// Describes the detailed kind of a template name. Used in diagnostics.
   enum class TemplateNameKindForDiagnostics {
     ClassTemplate,
@@ -2213,6 +2218,9 @@ public:
   /// \c constexpr in C++11 or has an 'auto' return type in C++14).
   bool canSkipFunctionBody(Decl *D);
 
+  // C++AMP restriction specifier inferring routine
+  void TryCXXAMPRestrictionInferring(Decl *D, Stmt *Body);
+
   void computeNRVO(Stmt *Body, sema::FunctionScopeInfo *Scope);
   Decl *ActOnFinishFunctionBody(Decl *Decl, Stmt *Body);
   Decl *ActOnFinishFunctionBody(Decl *Decl, Stmt *Body, bool IsInstantiation);
@@ -2725,6 +2733,11 @@ public:
     /// non-function.
     Ovl_NonFunction
   };
+
+  // C++AMP diagnostic routine on destructor overload resolution
+  void DiagnoseCXXAMPDtorOverload(FunctionDecl *New,
+                           const LookupResult &Old);
+
   OverloadKind CheckOverload(Scope *S,
                              FunctionDecl *New,
                              const LookupResult &OldDecls,
@@ -3021,6 +3034,13 @@ public:
                                             OverloadCandidateSet& CandidateSet,
                                             bool PartialOverloading = false);
 
+  // C++AMP restriction specifier scope checking routines
+  bool IsInAMPRestricted();
+  // Determine if in CPU and/or AMP restricted codes
+  bool IsInAnyExplicitRestricted();
+  void GetCXXAMPParentRestriction(Scope* SC, bool& ParentCPU,
+    bool& ParentAMP, bool& ParentAUTO);
+
   // Emit as a 'note' the specific overload candidate
   void NoteOverloadCandidate(NamedDecl *Found, FunctionDecl *Fn,
                              QualType DestType = QualType(),
@@ -3133,6 +3153,10 @@ public:
                                            OverloadCandidateSet *CandidateSet,
                                            Expr *Range, ExprResult *CallExpr);
 
+  // C++AMP diagnostic routine on overloaded call expressions
+  void DiagnoseCXXAMPOverloadedCallExpr(SourceLocation LParenLoc,
+                                        FunctionDecl* Callee);
+
   ExprResult BuildOverloadedCallExpr(Scope *S, Expr *Fn,
                                      UnresolvedLookupExpr *ULE,
                                      SourceLocation LParenLoc,
@@ -3326,7 +3350,17 @@ public:
   typedef std::function<ExprResult(Sema &, TypoExpr *, TypoCorrection)>
       TypoRecoveryCallback;
 
+  // C++AMP type checking routine for kernel codes
+public:
+  bool IsIncompatibleType(const Type* Ty, bool CheckContainer = false, bool IsInfer = false);
+
 private:
+  // C++AMP type checking routine for kernel codes
+  bool IsCXXAMPUnsupportedPointerType(const Type* Ty,
+    bool CheckContainer = false, bool IsInfer = false);
+  bool IsCXXAMPUnsupportedReferenceType(const Type* Ty,
+    bool CheckContainer = false, bool IsInfer = false);
+
   bool CppLookupName(LookupResult &R, Scope *S);
 
   struct TypoExprState {
@@ -4596,6 +4630,10 @@ public:
                                 ParmVarDecl *Param,
                                 const Expr *ArgExpr);
 
+  // C++AMP diagnostic routine on C++ method call expressions
+  void DiagnoseCXXAMPMethodCallExpr(SourceLocation LParenLoc,
+                                    CXXMethodDecl *Callee);
+
   /// ActOnCallExpr - Handle a call to Fn with the specified array of arguments.
   /// This provides the location of the left/right parens and a list of comma
   /// locations.
@@ -4651,6 +4689,19 @@ public:
                                         bool GNUSyntax,
                                         ExprResult Init);
 
+  // C++AMP restriction specifier calculation routines for special member function
+  void InheritSMFDtorIntersections(CXXRecordDecl* RDecl,
+                                   bool& CPUAttr, bool& AMPAttr,
+                                   bool& ParentCPUAttr, bool& ParentAMPAttr);
+  void InheritSMFCtorIntersections(CXXRecordDecl* RDecl,
+                                   bool& CPUAttr, bool& AMPAttr,
+                                   bool& ParentCPUAttr, bool& ParentAMPAttr,
+                                   int flag, bool ConstParam = true);
+  void InheritSMFMethodIntersections(CXXRecordDecl* RDecl,
+                                     bool& CPUAttr, bool& AMPAttr,
+                                     bool& ParentCPUAttr, bool& ParentAMPAttr,
+                                     int flag, bool ConstParam = true);
+
 private:
   static BinaryOperatorKind ConvertTokenKindToBinaryOpcode(tok::TokenKind Kind);
 
@@ -5190,6 +5241,22 @@ public:
   void DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
                                     CXXMethodDecl *MethodDecl);
 
+  /// Defines an AMP CPU-side serialize function.
+  void DefineAmpCpuSerializeFunction(SourceLocation CurrentLocation,
+                                     CXXMethodDecl *MethodDecl);
+  /// Defines an AMP GPU-side deserialize function.
+  void DefineAmpGpuDeSerializeFunction(SourceLocation CurrentLocation,
+                                       CXXMethodDecl *MethodDecl);
+  /// Declare trampoline name lookup code for AMP CPU-side
+  void DeclareAMPTrampolineName(CXXRecordDecl *ClassDecl,
+                                DeclarationName Name);
+  /// Declare trampoline code for AMP GPU-side entry
+  void DeclareAMPTrampoline(CXXRecordDecl *ClassDecl,
+                            DeclarationName Name);
+  /// Define trampoline code for AMP GPU-side entry
+  void DefineAMPTrampoline(SourceLocation CurrentLocation,
+                           CXXMethodDecl *OperatorCall);
+
   /// Declare the implicit move assignment operator for the given class.
   ///
   /// \param ClassDecl The Class declaration into which the implicit
@@ -6130,6 +6197,19 @@ public:
   /// \returns true if any work was done, false otherwise.
   bool DefineUsedVTables();
 
+  /// \brief Test if a given class requires a
+  /// C++AMP deserializer declaration
+  bool NeedAMPDeserializer(CXXRecordDecl *ClassDecl);
+  /// \brief Test if a given class has a C++AMP deserializer declaration
+  bool HasDeclaredAMPDeserializer(CXXRecordDecl *ClassDecl);
+
+  // Declare C++AMP serializer and deserializer
+  typedef SmallVector<QualType, 16> AMPDeserializerArgs;
+  void DeclareAMPSerializer(CXXRecordDecl *ClassDecl,
+                            DeclarationName Name);
+  void DeclareAMPDeserializer(CXXRecordDecl *ClassDecl,
+                              AMPDeserializerArgs *Args);
+
   void AddImplicitlyDeclaredMembersToClass(CXXRecordDecl *ClassDecl);
 
   void ActOnMemInitializers(Decl *ConstructorDecl,
@@ -6701,6 +6781,12 @@ public:
     CTAK_DeducedFromArrayBound
   };
 
+  // C++AMP diagnostic routine for template arguments
+  void DiagnoseCXXAMPTemplateArgument(NamedDecl *Param,
+                                      const TemplateArgumentLoc &AL,
+                                      NamedDecl *Template,
+                                      SourceLocation TemplateLoc);
+
   bool CheckTemplateArgument(NamedDecl *Param,
                              TemplateArgumentLoc &Arg,
                              NamedDecl *Template,
@@ -9942,6 +10028,9 @@ public:
     Incompatible
   };
 
+  // C++AMP diagnostic routine for expressions
+  void DiagnoseCXXAMPExpr(Expr* Stripped, ExprResult &HS, bool DiagnoseWhenStatic=false);
+
   /// DiagnoseAssignmentResult - Emit a diagnostic, if required, for the
   /// assignment conversion type specified by ConvTy.  This returns true if the
   /// conversion was invalid or false if the conversion was accepted.
diff --git a/include/clang/Sema/SemaInternal.h b/include/clang/Sema/SemaInternal.h
index dfb34daa14..164098d3d8 100644
--- a/include/clang/Sema/SemaInternal.h
+++ b/include/clang/Sema/SemaInternal.h
@@ -50,6 +50,17 @@ inline bool DeclAttrsMatchCUDAMode(const LangOptions &LangOpts, Decl *D) {
   return isDeviceSideDecl == LangOpts.CUDAIsDevice;
 }
 
+// Helper function to check whether D's attributes match current HCC mode.
+// Decls with mismatched attributes and related diagnostics may have to be
+// ignored during this HCC compilation pass.
+inline bool DeclAttrsMatchHCCMode(const LangOptions &LangOpts, Decl *D) {
+  if (!LangOpts.CPlusPlusAMP || !D)
+    return true;
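+  // A decl is device-side when it carries restrict(amp) or [[hc]]; keep it
+  // only when the current pass compiles for that same side.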
+  bool isDeviceSideDecl = D->hasAttr<CXXAMPRestrictAMPAttr>() ||
+                          D->hasAttr<HC_HCAttr>();
+  return isDeviceSideDecl == LangOpts.DevicePath;
+}
+
 /// Return a DLL attribute from the declaration.
 inline InheritableAttr *getDLLAttr(Decl *D) {
   assert(!(D->hasAttr<DLLImportAttr>() && D->hasAttr<DLLExportAttr>()) &&
diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp
index 93bdaafc2a..886f85e857 100644
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -751,7 +751,10 @@ static const LangASMap *getAddressSpaceMap(const TargetInfo &T,
       4, // opencl_generic
       5, // cuda_device
       6, // cuda_constant
-      7  // cuda_shared
+      7, // cuda_shared
+      8, // hcc_tilestatic
+      9, // hcc_generic
+      10, // hcc_global
     };
     return &FakeAddrSpaceMap;
   } else {
@@ -3098,6 +3101,7 @@ QualType ASTContext::getConstantArrayType(QualType EltTy,
   llvm::APInt ArySize(ArySizeIn);
   ArySize = ArySize.zextOrTrunc(Target->getMaxPointerWidth());
 
+
   llvm::FoldingSetNodeID ID;
   ConstantArrayType::Profile(ID, EltTy, ArySize, ASM, IndexTypeQuals);
 
@@ -9720,6 +9724,8 @@ static GVALinkage adjustGVALinkageForAttributes(const ASTContext &Context,
     // visible externally so they can be launched from host.
     if (L == GVA_DiscardableODR || L == GVA_Internal)
       return GVA_StrongODR;
+  } else if (Context.getLangOpts().CPlusPlusAMP &&
+             Context.getLangOpts().DevicePath &&
+             D->hasAttr<AnnotateAttr>() &&
+             (D->getAttr<AnnotateAttr>()->getAnnotation() ==
+              "__cxxamp_trampoline")) {
+    return GVA_StrongODR;
   }
   return L;
 }
@@ -10456,6 +10462,19 @@ unsigned ASTContext::getTargetAddressSpace(LangAS AS) const {
     return (*AddrSpaceMap)[(unsigned)AS];
 }
 
+unsigned ASTContext::getTargetAddressSpace(QualType T) const {
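+  // Null types and unqualified function types carry no address space of
+  // their own; both map to the default address space 0.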
+  if (T.isNull())
+    return 0;
+  if (T->isFunctionType() &&
+      !T.getQualifiers().hasAddressSpace())
+    return 0;
+  return getTargetAddressSpace(T.getQualifiers());
+}
+
+unsigned ASTContext::getTargetAddressSpace(Qualifiers Q) const {
+  return getTargetAddressSpace(Q.getAddressSpace());
+}
+
 QualType ASTContext::getCorrespondingSaturatedType(QualType Ty) const {
   assert(Ty->isFixedPointType());
 
diff --git a/lib/AST/DeclCXX.cpp b/lib/AST/DeclCXX.cpp
index 59710a5549..eb3520783c 100644
--- a/lib/AST/DeclCXX.cpp
+++ b/lib/AST/DeclCXX.cpp
@@ -9,7 +9,6 @@
 // This file implements the C++ related Decl classes.
 //
 //===----------------------------------------------------------------------===//
-
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTLambda.h"
@@ -546,6 +545,17 @@ bool CXXRecordDecl::isTriviallyCopyable() const {
   return true;
 }
 
+CXXMethodDecl *CXXRecordDecl::getCXXAMPDeserializationConstructor() const {
+  CXXMethodDecl *Deserializer = NULL;
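+  // Scan every constructor; the deserializer is the one whose annotate
+  // attribute contains "deserialize" (the last match wins).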
+  for (ctor_iterator CtorIt = ctor_begin(), CtorE = ctor_end();
+      CtorIt != CtorE; ++CtorIt) {
+    if (CtorIt->hasAttr<AnnotateAttr>() &&
+        CtorIt->getAttr<AnnotateAttr>()->getAnnotation().find("deserialize") !=
+            StringRef::npos)
+      Deserializer = *CtorIt;
+  }
+  return Deserializer;
+}
+
 void CXXRecordDecl::markedVirtualFunctionPure() {
   // C++ [class.abstract]p2:
   //   A class is abstract if it has at least one pure virtual function.
diff --git a/lib/AST/RecordLayoutBuilder.cpp b/lib/AST/RecordLayoutBuilder.cpp
index 2a3419a0ce..f520190204 100644
--- a/lib/AST/RecordLayoutBuilder.cpp
+++ b/lib/AST/RecordLayoutBuilder.cpp
@@ -17,6 +17,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -1759,6 +1760,7 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D,
   // into the field's tail padding.
   CharUnits EffectiveFieldSize;
 
+  const ReferenceType *RT = D->getType()->getAs<ReferenceType>();
   if (D->getType()->isIncompleteArrayType()) {
     // This is a flexible array member; we can't directly
     // query getTypeInfo about these, so we figure it out here.
diff --git a/lib/AST/Type.cpp b/lib/AST/Type.cpp
index ed75a0b5bc..e7207b2442 100644
--- a/lib/AST/Type.cpp
+++ b/lib/AST/Type.cpp
@@ -4077,6 +4077,20 @@ CXXRecordDecl *MemberPointerType::getMostRecentCXXRecordDecl() const {
   return getClass()->getAsCXXRecordDecl()->getMostRecentNonInjectedDecl();
 }
 
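+// Returns true for the C++AMP/HC GPU array class templates, i.e. a class
+// named "array" declared in namespace hc or Concurrency.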
+bool Type::isGPUArrayType() const {
+  if (!isClassType())
+    return false;
+  // Check ClassDecl before using it to reach the enclosing namespace.
+  CXXRecordDecl *ClassDecl = getAsCXXRecordDecl();
+  if (!ClassDecl || ClassDecl->getName() != "array")
+    return false;
+  NamespaceDecl *NSDecl =
+      dyn_cast<NamespaceDecl>(ClassDecl->getEnclosingNamespaceContext());
+  return NSDecl &&
+         (NSDecl->getName() == "hc" || NSDecl->getName() == "Concurrency");
+}
+
 void clang::FixedPointValueToString(SmallVectorImpl<char> &Str,
                                     llvm::APSInt Val, unsigned Scale) {
   FixedPointSemantics FXSema(Val.getBitWidth(), Scale, Val.isSigned(),
diff --git a/lib/Basic/IdentifierTable.cpp b/lib/Basic/IdentifierTable.cpp
index ca9c71287a..4db9d383bb 100644
--- a/lib/Basic/IdentifierTable.cpp
+++ b/lib/Basic/IdentifierTable.cpp
@@ -100,6 +100,7 @@ namespace {
     KEYCXX2A      = 0x200000,
     KEYOPENCLCXX  = 0x400000,
     KEYMSCOMPAT   = 0x800000,
+    KEYCXXAMP     = 0x1000000,
     KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX2A,
     KEYALL = (0xffffff & ~KEYNOMS18 &
               ~KEYNOOPENCL) // KEYNOMS18 and KEYNOOPENCL are used to exclude.
@@ -145,6 +146,7 @@ static KeywordStatus getKeywordStatus(const LangOptions &LangOpts,
   if (LangOpts.ConceptsTS && (Flags & KEYCONCEPTS)) return KS_Enabled;
   if (LangOpts.Coroutines && (Flags & KEYCOROUTINES)) return KS_Enabled;
   if (LangOpts.ModulesTS && (Flags & KEYMODULES)) return KS_Enabled;
+  if (LangOpts.CPlusPlusAMP && (Flags & KEYCXXAMP)) return KS_Enabled;
   if (LangOpts.CPlusPlus && (Flags & KEYALLCXX)) return KS_Future;
   return KS_Disabled;
 }
diff --git a/lib/Basic/Module.cpp b/lib/Basic/Module.cpp
index f394f26e55..1b6300d45d 100644
--- a/lib/Basic/Module.cpp
+++ b/lib/Basic/Module.cpp
@@ -123,6 +123,7 @@ static bool hasFeature(StringRef Feature, const LangOptions &LangOpts,
                         .Case("opencl", LangOpts.OpenCL)
                         .Case("tls", Target.isTLSSupported())
                         .Case("zvector", LangOpts.ZVector)
+                        .Case("cplusplusamp", LangOpts.CPlusPlusAMP)
                         .Default(Target.hasFeature(Feature) ||
                                  isPlatformEnvironment(Target, Feature));
   if (!HasFeature)
diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp
index 12b0305e70..84f70bb0ef 100644
--- a/lib/Basic/SourceManager.cpp
+++ b/lib/Basic/SourceManager.cpp
@@ -750,7 +750,10 @@ FileID SourceManager::getFileIDLocal(unsigned SLocOffset) const {
   // most newly created FileID.
   const SrcMgr::SLocEntry *I;
 
-  if (LastFileIDLookup.ID < 0 ||
+  // FIXME: Handle the file start location.
+  // This is CXXAMP-specific, but should be safe in general. Guard the table
+  // lookup so a negative LastFileIDLookup.ID cannot be used as an index.
+  bool patch = LastFileIDLookup.ID >= 0 &&
+      LocalSLocEntryTable[LastFileIDLookup.ID].getOffset() <= SLocOffset;
+  if (LastFileIDLookup.ID < 0 || patch ||
       LocalSLocEntryTable[LastFileIDLookup.ID].getOffset() < SLocOffset) {
     // Neither loc prunes our search.
     I = LocalSLocEntryTable.end();
diff --git a/lib/Basic/Targets/AMDGPU.cpp b/lib/Basic/Targets/AMDGPU.cpp
index b5c82e2885..f2d49fdc7e 100644
--- a/lib/Basic/Targets/AMDGPU.cpp
+++ b/lib/Basic/Targets/AMDGPU.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "Targets.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Basic/LangOptions.h"
@@ -46,7 +47,10 @@ const LangASMap AMDGPUTargetInfo::AMDGPUDefIsGenMap = {
     Generic,  // opencl_generic
     Global,   // cuda_device
     Constant, // cuda_constant
-    Local     // cuda_shared
+    Local,     // cuda_shared
+    Local,     // hcc_tilestatic
+    Generic,   // hcc_generic
+    Global     // hcc_global
 };
 
 const LangASMap AMDGPUTargetInfo::AMDGPUDefIsPrivMap = {
@@ -58,7 +62,10 @@ const LangASMap AMDGPUTargetInfo::AMDGPUDefIsPrivMap = {
     Generic,  // opencl_generic
     Global,   // cuda_device
     Constant, // cuda_constant
-    Local     // cuda_shared
+    Local,    // cuda_shared
+    Local,    // hcc_tilestatic
+    Generic,  // hcc_generic
+    Global    // hcc_global
 };
 } // namespace targets
 } // namespace clang
@@ -274,6 +281,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
   setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D ||
                      !isAMDGCN(Triple));
   UseAddrSpaceMapMangling = true;
 
   HasLegalHalfType = true;
   HasFloat16 = true;
@@ -288,6 +296,12 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
   }
 
   MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
+  // This is a workaround for HIP to get things going until
+  // https://reviews.llvm.org/D57831 is committed.
+#ifdef _WIN32
+  WCharType = UnsignedShort;
+  WIntType = UnsignedShort;
+#endif
 }
 
 void AMDGPUTargetInfo::adjust(LangOptions &Opts) {
diff --git a/lib/Basic/Targets/NVPTX.h b/lib/Basic/Targets/NVPTX.h
index 2cdd37ca1b..52ce17ef04 100644
--- a/lib/Basic/Targets/NVPTX.h
+++ b/lib/Basic/Targets/NVPTX.h
@@ -33,6 +33,9 @@ static const unsigned NVPTXAddrSpaceMap[] = {
     1, // cuda_device
     4, // cuda_constant
     3, // cuda_shared
+    3, // hcc_tilestatic
+    0, // hcc_generic
+    1, // hcc_global
 };
 
 /// The DWARF address class. Taken from
diff --git a/lib/Basic/Targets/SPIR.h b/lib/Basic/Targets/SPIR.h
index 802ccf8b67..5ab0eb1188 100644
--- a/lib/Basic/Targets/SPIR.h
+++ b/lib/Basic/Targets/SPIR.h
@@ -30,7 +30,10 @@ static const unsigned SPIRAddrSpaceMap[] = {
     4, // opencl_generic
     0, // cuda_device
     0, // cuda_constant
-    0  // cuda_shared
+    0, // cuda_shared
+    3, // hcc_tilestatic
+    4, // hcc_generic
+    1, // hcc_global
 };
 
 class LLVM_LIBRARY_VISIBILITY SPIRTargetInfo : public TargetInfo {
diff --git a/lib/Basic/Targets/TCE.h b/lib/Basic/Targets/TCE.h
index 967ef5c59e..b3f81ac6ae 100644
--- a/lib/Basic/Targets/TCE.h
+++ b/lib/Basic/Targets/TCE.h
@@ -39,7 +39,10 @@ static const unsigned TCEOpenCLAddrSpaceMap[] = {
     0, // opencl_generic
     0, // cuda_device
     0, // cuda_constant
-    0  // cuda_shared
+    0, // cuda_shared
+    4, // hcc_tilestatic
+    0, // hcc_generic
+    3, // hcc_global
 };
 
 class LLVM_LIBRARY_VISIBILITY TCETargetInfo : public TargetInfo {
diff --git a/lib/Basic/Targets/X86.h b/lib/Basic/Targets/X86.h
index dd1e7db6c8..1c5b4324f1 100644
--- a/lib/Basic/Targets/X86.h
+++ b/lib/Basic/Targets/X86.h
@@ -131,6 +131,7 @@ public:
   X86TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
       : TargetInfo(Triple) {
     LongDoubleFormat = &llvm::APFloat::x87DoubleExtended();
+    HasFloat16 = true;
   }
 
   const char *getLongDoubleMangling() const override {
diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp
index 497652e85b..46f892a68a 100644
--- a/lib/CodeGen/BackendUtil.cpp
+++ b/lib/CodeGen/BackendUtil.cpp
@@ -86,8 +86,10 @@ class EmitAssemblyHelper {
   const LangOptions &LangOpts;
   Module *TheModule;
 
+  Timer PreLinkTime;
   Timer CodeGenerationTime;
 
+  mutable legacy::PassManager *PreLinkPasses;
   std::unique_ptr<raw_pwrite_stream> OS;
 
   TargetIRAnalysis getTargetIRAnalysis() const {
@@ -97,6 +99,15 @@ class EmitAssemblyHelper {
     return TargetIRAnalysis();
   }
 
+  legacy::PassManager *getPreLinkPasses() const {
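+    // Lazily construct the pass manager used for target pre-link passes;
+    // see DoPreLinkPasses below.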
+    if (!PreLinkPasses) {
+      PreLinkPasses = new legacy::PassManager();
+      PreLinkPasses->add(
+          createTargetTransformInfoWrapperPass(getTargetIRAnalysis()));
+    }
+    return PreLinkPasses;
+  }
+
   void CreatePasses(legacy::PassManager &MPM, legacy::FunctionPassManager &FPM);
 
   /// Generates the TargetMachine.
@@ -126,6 +137,9 @@ class EmitAssemblyHelper {
     return F;
   }
 
+  /// Add target specific pre-linking passes.
+  void AddPreLinkPasses();
+
 public:
   EmitAssemblyHelper(DiagnosticsEngine &_Diags,
                      const HeaderSearchOptions &HeaderSearchOpts,
@@ -134,9 +148,12 @@ public:
                      const LangOptions &LOpts, Module *M)
       : Diags(_Diags), HSOpts(HeaderSearchOpts), CodeGenOpts(CGOpts),
         TargetOpts(TOpts), LangOpts(LOpts), TheModule(M),
-        CodeGenerationTime("codegen", "Code Generation Time") {}
+        PreLinkTime("prelink", "Pre-Linking Passes Time"),
+        CodeGenerationTime("codegen", "Code Generation Time"),
+        PreLinkPasses(nullptr) {}
 
   ~EmitAssemblyHelper() {
+    delete PreLinkPasses;
     if (CodeGenOpts.DisableFree)
       BuryPointer(std::move(TM));
   }
@@ -148,6 +165,12 @@ public:
 
   void EmitAssemblyWithNewPassManager(BackendAction Action,
                                       std::unique_ptr<raw_pwrite_stream> OS);
+
+  void DoPreLinkPasses();
+
+  /// Set up target for target specific pre-linking passes and LLVM code
+  /// generation.
+  void setTarget(BackendAction Action);
 };
 
 // We need this wrapper to access LangOpts and CGOpts from extension functions
@@ -737,6 +760,14 @@ static void setCommandLineOpts(const CodeGenOptions &CodeGenOpts) {
                                     BackendArgs.data());
 }
 
+void EmitAssemblyHelper::setTarget(BackendAction Action) {
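+  // Mirror EmitAssembly's logic: only a real codegen action strictly needs a
+  // TargetMachine, but create it here so pre-link passes can use it as well.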
+  bool UsesCodeGen = (Action != Backend_EmitNothing &&
+                      Action != Backend_EmitBC &&
+                      Action != Backend_EmitLL);
+  if (!TM)
+    CreateTargetMachine(UsesCodeGen);
+}
+
 void EmitAssemblyHelper::CreateTargetMachine(bool MustCreateTM) {
   // Create the TargetMachine for generating code.
   std::string Error;
@@ -789,6 +820,11 @@ bool EmitAssemblyHelper::AddEmitPasses(legacy::PassManager &CodeGenPasses,
   return true;
 }
 
+void EmitAssemblyHelper::AddPreLinkPasses() {
+  legacy::PassManager *PM = getPreLinkPasses();
+  TM->addPreLinkPasses(*PM);
+}
+
 void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
                                       std::unique_ptr<raw_pwrite_stream> OS) {
   TimeRegion Region(FrontendTimesIsEnabled ? &CodeGenerationTime : nullptr);
@@ -798,7 +834,6 @@ void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
   bool UsesCodeGen = (Action != Backend_EmitNothing &&
                       Action != Backend_EmitBC &&
                       Action != Backend_EmitLL);
-  CreateTargetMachine(UsesCodeGen);
 
   if (UsesCodeGen && !TM)
     return;
@@ -904,6 +939,23 @@ void EmitAssemblyHelper::EmitAssembly(BackendAction Action,
     DwoOS->keep();
 }
 
+void EmitAssemblyHelper::DoPreLinkPasses() {
+  TimeRegion Region(llvm::TimePassesIsEnabled ? &PreLinkTime : nullptr);
+
+  if (!TM)
+    return;
+
+  AddPreLinkPasses();
+
+  // Before executing passes, print the final values of the LLVM options.
+  cl::PrintOptionValues();
+
+  if (PreLinkPasses) {
+    PrettyStackTraceString CrashInfo("Pre-linking passes");
+    PreLinkPasses->run(*TheModule);
+  }
+}
+
 static PassBuilder::OptimizationLevel mapToLevel(const CodeGenOptions &Opts) {
   switch (Opts.OptimizationLevel) {
   default:
@@ -1449,7 +1501,8 @@ void clang::EmitBackendOutput(DiagnosticsEngine &Diags,
                               const LangOptions &LOpts,
                               const llvm::DataLayout &TDesc, Module *M,
                               BackendAction Action,
-                              std::unique_ptr<raw_pwrite_stream> OS) {
+                              std::unique_ptr<raw_pwrite_stream> OS,
+                              bool SetLLVMOpts) {
 
   llvm::TimeTraceScope TimeScope("Backend", StringRef(""));
 
@@ -1492,6 +1545,10 @@ void clang::EmitBackendOutput(DiagnosticsEngine &Diags,
 
   EmitAssemblyHelper AsmHelper(Diags, HeaderOpts, CGOpts, TOpts, LOpts, M);
 
+  if (SetLLVMOpts)
+    setCommandLineOpts(CGOpts);
+  AsmHelper.setTarget(Action);
+
   if (CGOpts.ExperimentalNewPassManager)
     AsmHelper.EmitAssemblyWithNewPassManager(Action, std::move(OS));
   else
@@ -1510,6 +1567,20 @@ void clang::EmitBackendOutput(DiagnosticsEngine &Diags,
   }
 }
 
+void clang::PerformPrelinkPasses(DiagnosticsEngine &Diags,
+                                 const HeaderSearchOptions &HeaderSearchOpts,
+                                 const CodeGenOptions &CGOpts,
+                                 const clang::TargetOptions &TOpts,
+                                 const LangOptions &LOpts, const llvm::DataLayout &TDesc,
+                                 Module *M, BackendAction Action) {
+  EmitAssemblyHelper AsmHelper(Diags, HeaderSearchOpts, CGOpts, TOpts, LOpts,
+                               M);
+
+  setCommandLineOpts(CGOpts);
+  AsmHelper.setTarget(Action);
+  AsmHelper.DoPreLinkPasses();
+}
+
 static const char* getSectionNameForBitcode(const Triple &T) {
   switch (T.getObjectFormat()) {
   case Triple::MachO:
diff --git a/lib/CodeGen/CGAMPRuntime.cpp b/lib/CodeGen/CGAMPRuntime.cpp
new file mode 100644
index 0000000000..da905d830c
--- /dev/null
+++ b/lib/CodeGen/CGAMPRuntime.cpp
@@ -0,0 +1,406 @@
+//===----- CGAMPRuntime.cpp - Interface to C++ AMP Runtime ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides an abstract class for C++ AMP code generation.  Concrete
+// subclasses of this implement code generation for specific C++ AMP
+// runtime libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenFunction.h"
+#include "CGAMPRuntime.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/ExprCXX.h"
+#include "CGCall.h"
+#include "TargetInfo.h"
+
+namespace clang {
+namespace CodeGen {
+
+CGAMPRuntime::~CGAMPRuntime() {}
+
+/// Creates an instance of a C++ AMP runtime class.
+CGAMPRuntime *CreateAMPRuntime(CodeGenModule &CGM) {
+  return new CGAMPRuntime(CGM);
+}
+
+static CXXMethodDecl *findValidIndexType(QualType IndexTy) {
+  CXXRecordDecl *IndexClass = IndexTy->getAsCXXRecordDecl();
+  CXXMethodDecl *IndexConstructor = NULL;
+  if (IndexClass) {
+    for (CXXRecordDecl::method_iterator CtorIt = IndexClass->method_begin(),
+        CtorE = IndexClass->method_end();
+        CtorIt != CtorE; ++CtorIt) {
+      if (CtorIt->hasAttr<AnnotateAttr>() &&
+          CtorIt->getAttr<AnnotateAttr>()->getAnnotation() ==
+            "__cxxamp_opencl_index") {
+        IndexConstructor = *CtorIt;
+      }
+    }
+  }
+  return IndexConstructor;
+}
+
+void CGAMPRuntime::EmitCXXAMPDeserializer(CodeGenFunction &CGF,
+  const FunctionDecl *Trampoline, FunctionArgList& Args,
+  Address& ai) {
+
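+  // Rebuild the functor on the device side: invoke its deserializing
+  // constructor on the freshly allocated "this", feeding the flattened
+  // trampoline arguments back in one by one.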
+  const CXXRecordDecl *ClassDecl = dyn_cast<CXXMethodDecl>(Trampoline)->getParent();
+
+  CXXConstructorDecl *DeserializeConstructor =
+    dyn_cast<CXXConstructorDecl>(ClassDecl->getCXXAMPDeserializationConstructor());
+  assert(DeserializeConstructor);
+
+  CallArgList DeserializerArgs;
+
+  // this
+  DeserializerArgs.add(RValue::get(ai.getPointer()),
+                       DeserializeConstructor->getThisType());
+
+  // the rest of constructor args. Create temporary objects for references
+  // on stack
+  CXXConstructorDecl::param_iterator CPI = DeserializeConstructor->param_begin(),
+  CPE = DeserializeConstructor->param_end();
+
+  for (FunctionArgList::iterator I = Args.begin();
+       I != Args.end() && CPI != CPE; ++CPI) {
+    // Reference members may only be one level deep: a member may be a
+    // reference to a class, but that class may not itself contain
+    // reference members.
+    QualType MemberType = (*CPI)->getType().getNonReferenceType();
+    if (MemberType != (*CPI)->getType()) {
+      if (!CGM.getLangOpts().HSAExtension) {
+
+        assert(MemberType.getTypePtr()->isClassType() &&
+               "Only references to classes are supported");
+
+        CXXRecordDecl *MemberClass = MemberType.getTypePtr()->getAsCXXRecordDecl();
+
+        CXXConstructorDecl *MemberDeserializer = dyn_cast<CXXConstructorDecl>(
+                              MemberClass->getCXXAMPDeserializationConstructor());
+        assert(MemberDeserializer);
+
+        std::vector<Expr*>MemberArgDeclRefs;
+        for (CXXMethodDecl::param_iterator MCPI = MemberDeserializer->param_begin(),
+              MCPE = MemberDeserializer->param_end(); MCPI!=MCPE; ++MCPI, ++I) {
+
+          Expr *ArgDeclRef = DeclRefExpr::Create(CGM.getContext(),
+                                                 NestedNameSpecifierLoc(),
+                                                 SourceLocation(),
+                                                 const_cast<VarDecl *>(*I),
+                                                 false,
+                                                 SourceLocation(),
+                                                 (*MCPI)->getType(), VK_RValue);
+          MemberArgDeclRefs.push_back(ArgDeclRef);
+        }
+
+        // Allocate "this" for member referenced objects
+        Address mai = CGF.CreateMemTemp(MemberType);
+
+        // Emit code to call the deserializing constructor of temp objects
+        CXXConstructExpr *CXXCE = CXXConstructExpr::Create(CGM.getContext(), 
+                                                           MemberType,
+                                                           SourceLocation(),
+                                                           MemberDeserializer,
+                                                           false,
+                                                           MemberArgDeclRefs,
+                                                           false, false, false, false,
+                                                           CXXConstructExpr::CK_Complete,
+                                                           SourceLocation());
+
+        auto currAVS = AggValueSlot::forAddr(mai, MemberType.getQualifiers(),
+                                             AggValueSlot::IsNotDestructed,
+                                             AggValueSlot::DoesNotNeedGCBarriers,
+                                             AggValueSlot::IsNotAliased,
+                                             AggValueSlot::DoesNotOverlap);
+        CGF.EmitCXXConstructorCall(MemberDeserializer, Ctor_Complete,
+                                   false, false, currAVS, CXXCE);
+        DeserializerArgs.add(RValue::get(mai.getPointer()), (*CPI)->getType());
+
+      } else { // HSA extension check
+
+        if (MemberType.getTypePtr()->isClassType()) {
+
+          // hc::array should still be serialized as traditional C++AMP objects
+          if (MemberType.getTypePtr()->isGPUArrayType()) {
+
+            CXXRecordDecl *MemberClass = MemberType.getTypePtr()->getAsCXXRecordDecl();
+
+            CXXConstructorDecl *MemberDeserializer =
+              dyn_cast<CXXConstructorDecl>(MemberClass->getCXXAMPDeserializationConstructor());
+            assert(MemberDeserializer);
+
+            std::vector<Expr*>MemberArgDeclRefs;
+            for (CXXMethodDecl::param_iterator MCPI = MemberDeserializer->param_begin(),
+              MCPE = MemberDeserializer->param_end(); MCPI!=MCPE; ++MCPI, ++I) {
+
+              Expr *ArgDeclRef = DeclRefExpr::Create(CGM.getContext(),
+                                                     NestedNameSpecifierLoc(),
+                                                     SourceLocation(),
+                                                     const_cast<VarDecl *>(*I),
+                                                     false,
+                                                     SourceLocation(),
+                                                     (*MCPI)->getType(), VK_RValue);
+
+               MemberArgDeclRefs.push_back(ArgDeclRef);
+            }
+
+            // Allocate "this" for member referenced objects
+            Address mai = CGF.CreateMemTemp(MemberType);
+
+            // Emit code to call the deserializing constructor of temp objects
+            CXXConstructExpr *CXXCE = CXXConstructExpr::Create(CGM.getContext(), 
+                                                               MemberType,
+                                                               SourceLocation(),
+                                                               MemberDeserializer,
+                                                               false,
+                                                               MemberArgDeclRefs,
+                                                               false, false, false, false,
+                                                               CXXConstructExpr::CK_Complete,
+                                                               SourceLocation());
+
+            auto currAVS = AggValueSlot::forAddr(mai, MemberType.getQualifiers(),
+                                                 AggValueSlot::IsNotDestructed,
+                                                 AggValueSlot::DoesNotNeedGCBarriers,
+                                                 AggValueSlot::IsNotAliased,
+                                                 AggValueSlot::DoesNotOverlap);
+            CGF.EmitCXXConstructorCall(MemberDeserializer, Ctor_Complete,
+                                       false, false, currAVS, CXXCE);
+            DeserializerArgs.add(RValue::get(mai.getPointer()), (*CPI)->getType());
+
+          } else {
+
+            // capture by reference for HSA
+            Expr *ArgDeclRef = DeclRefExpr::Create(CGM.getContext(),
+                                                   NestedNameSpecifierLoc(),
+                                                   SourceLocation(),
+                                                   const_cast<VarDecl *>(*I), false,
+                                                   SourceLocation(),
+                                                   (*I)->getType(), VK_RValue);
+
+            RValue ArgRV = CGF.EmitAnyExpr(ArgDeclRef);
+            DeserializerArgs.add(ArgRV, CGM.getContext().getPointerType(MemberType));
+            ++I;
+          }
+
+        } else {
+
+          // capture by reference for HSA
+          Expr *ArgDeclRef = DeclRefExpr::Create(CGM.getContext(),
+                                                 NestedNameSpecifierLoc(),
+                                                 SourceLocation(),
+                                                 const_cast<VarDecl *>(*I), false,
+                                                 SourceLocation(),
+                                                 (*I)->getType(), VK_RValue);
+
+          RValue ArgRV = CGF.EmitAnyExpr(ArgDeclRef);
+          DeserializerArgs.add(ArgRV, CGM.getContext().getPointerType(MemberType));
+          ++I;
+        }
+      } // HSA extension check
+
+    } else {
+
+      Expr *ArgDeclRef = DeclRefExpr::Create(CGM.getContext(),
+                                             NestedNameSpecifierLoc(),
+                                             SourceLocation(),
+                                             const_cast<VarDecl *>(*I), false,
+                                             SourceLocation(),
+                                             (*I)->getType(), VK_RValue);
+
+      RValue ArgRV = CGF.EmitAnyExpr(ArgDeclRef);
+      DeserializerArgs.add(ArgRV, (*CPI)->getType());
+      ++I;
+    }
+  }
+
+  // Emit code to call the deserializing constructor
+  llvm::Constant *Callee = CGM.getAddrOfCXXStructor(
+      GlobalDecl(DeserializeConstructor, Ctor_Complete));
+
+  const FunctionProtoType *FPT =
+      DeserializeConstructor->getType()->castAs<FunctionProtoType>();
+
+  const CGFunctionInfo &DesFnInfo =
+      CGM.getTypes().arrangeCXXStructorDeclaration(
+          GlobalDecl(DeserializeConstructor, Ctor_Complete));
+
+  for (unsigned I = 1, E = DeserializerArgs.size(); I != E; ++I) {
+    auto T = FPT->getParamType(I-1);
+    // EmitFromMemory is necessary in case function has bool parameter.
+    if (T->isBooleanType()) {
+      DeserializerArgs[I] =
+          CallArg(RValue::get(CGF.EmitFromMemory(
+                      DeserializerArgs[I].getKnownRValue().getScalarVal(), T)),
+                  T);
+    }
+  }
+  CGF.EmitCall(DesFnInfo, CGCallee::forDirect(Callee), ReturnValueSlot(), DeserializerArgs);
+}
+
+/// Operations:
+/// For each reference-typed member, construct a temporary object
+/// Invoke the constructor of the index
+/// Invoke the constructor of the class
+/// Invoke operator()(index)
+void CGAMPRuntime::EmitTrampolineBody(CodeGenFunction &CGF,
+  const FunctionDecl *Trampoline, FunctionArgList& Args) {
+  const CXXRecordDecl *ClassDecl = dyn_cast<CXXMethodDecl>(Trampoline)->getParent();
+  assert(ClassDecl);
+  // Allocate "this"
+  Address ai = CGF.CreateMemTemp(QualType(ClassDecl->getTypeForDecl(), 0));
+  // Locate the constructor to call
+  if (ClassDecl->getCXXAMPDeserializationConstructor() != NULL) {
+    EmitCXXAMPDeserializer(CGF, Trampoline, Args, ai);
+  }
+
+  // Locate the type of Concurrency::index<1>
+  // Locate the operator to call
+  CXXMethodDecl *KernelDecl = NULL;
+  CXXMethodDecl *KernelDeclNoArg = NULL;
+  const FunctionType *MT = NULL;
+  QualType IndexTy;
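+  // Search the functor for operator() restrict(amp): either one taking a
+  // single Concurrency/hc index argument, or one taking no argument at all.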
+  for (CXXRecordDecl::method_iterator Method = ClassDecl->method_begin(),
+                                   MethodEnd = ClassDecl->method_end();
+                                     Method != MethodEnd; ++Method) {
+
+    CXXMethodDecl *MethodDecl = *Method;
+    if (MethodDecl->isOverloadedOperator() &&
+        MethodDecl->getOverloadedOperator() == OO_Call &&
+        MethodDecl->hasAttr<CXXAMPRestrictAMPAttr>()) {
+
+      // Check types.
+      if (MethodDecl->getNumParams() > 1) {
+        continue;
+      } else if (MethodDecl->getNumParams() == 0) {
+        MT = dyn_cast<FunctionType>(MethodDecl->getType().getTypePtr());
+        assert(MT);
+        KernelDeclNoArg = MethodDecl;
+        continue;
+      } else {
+        ParmVarDecl *P = MethodDecl->getParamDecl(0);
+        IndexTy = P->getType().getNonReferenceType();
+        if (!findValidIndexType(IndexTy))
+          continue;
+        MT = dyn_cast<FunctionType>(MethodDecl->getType().getTypePtr());
+        assert(MT);
+        KernelDecl = MethodDecl;
+        break;
+      }
+    }
+  }
+
+  // If we couldn't find any kernel declaration, raise an error.
+  if (!KernelDecl && !KernelDeclNoArg) {
+    CGF.CGM.getDiags().Report(ClassDecl->getLocation(), diag::err_amp_ill_formed_functor);
+    return;
+  }
+
+  CXXMethodDecl *Kernel = (KernelDecl != NULL) ? KernelDecl : KernelDeclNoArg;
+
+  // Invoke this->operator()(index)
+  // Prepare the operator() to call
+  llvm::FunctionType *fnType =
+    CGM.getTypes().GetFunctionType(CGM.getTypes().arrangeCXXMethodDeclaration(Kernel));
+  llvm::Constant *fnAddr = CGM.GetAddrOfFunction(Kernel, fnType);
+
+  // Prepare the arguments
+  CallArgList KArgs;
+
+  // this
+  KArgs.add(RValue::get(ai.getPointer()), Kernel->getThisType());
+
+  if (KernelDecl) {
+
+    // Allocate Index
+    Address index = CGF.CreateMemTemp(IndexTy);
+
+    // Locate the constructor to call
+    CXXMethodDecl *IndexConstructor = findValidIndexType(IndexTy);
+    assert(IndexConstructor);
+
+    // Emit code to call the Concurrency::index<1>::__cxxamp_opencl_index()
+    if (!CGF.getLangOpts().AMPCPU) {
+      if (CXXConstructorDecl *Constructor =
+            dyn_cast <CXXConstructorDecl>(IndexConstructor)) {
+
+        CXXConstructExpr *CXXCE = CXXConstructExpr::Create(CGM.getContext(), 
+                                                           IndexTy,
+                                                           SourceLocation(),
+                                                           Constructor,
+                                                           false,
+                                                           ArrayRef<Expr*>(),
+                                                           false, false, false, false,
+                                                           CXXConstructExpr::CK_Complete,
+                                                           SourceLocation());
+
+        auto currAVS = AggValueSlot::forAddr(index, IndexTy.getQualifiers(),
+                                             AggValueSlot::IsNotDestructed,
+                                             AggValueSlot::DoesNotNeedGCBarriers,
+                                             AggValueSlot::IsNotAliased,
+                                             AggValueSlot::DoesNotOverlap);
+        CGF.EmitCXXConstructorCall(Constructor, Ctor_Complete, false, false,
+                                   currAVS, CXXCE);
+
+      } else {
+        llvm::FunctionType *indexInitType = CGM.getTypes().GetFunctionType(
+                                               CGM.getTypes().arrangeCXXMethodDeclaration(IndexConstructor));
+
+        llvm::Constant *indexInitAddr = CGM.GetAddrOfFunction(IndexConstructor, indexInitType);
+
+        CGF.EmitCXXMemberOrOperatorCall(IndexConstructor, CGCallee::forDirect(indexInitAddr),
+                                        ReturnValueSlot(), index.getPointer(), /*ImplicitParam=*/0, 
+                                        QualType(), /*CallExpr=*/nullptr, /*RtlArgs=*/nullptr);
+      }
+    }
+
+    // *index
+    // index is passed as a reference to IndexTy.
+    KArgs.add(RValue::get(index.getPointer()),
+        CGF.getContext().getLValueReferenceType(IndexTy));
+  }
+
+  const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeFreeFunctionCall(KArgs, MT, false);
+  CGF.EmitCall(FnInfo, CGCallee::forDirect(fnAddr), ReturnValueSlot(), KArgs);
+  CGM.getTargetCodeGenInfo().setTargetAttributes(Kernel, CGF.CurFn, CGM);
+}
+
+void CGAMPRuntime::EmitTrampolineNameBody(CodeGenFunction &CGF,
+  const FunctionDecl *Trampoline, FunctionArgList& Args) {
+  const CXXRecordDecl *ClassDecl = dyn_cast<CXXMethodDecl>(Trampoline)->getParent();
+  assert(ClassDecl);
+  // Locate the trampoline
+  // Locate the operator to call
+  CXXMethodDecl *TrampolineDecl = NULL;
+  for (CXXRecordDecl::method_iterator Method = ClassDecl->method_begin(),
+      MethodEnd = ClassDecl->method_end();
+      Method != MethodEnd; ++Method) {
+    CXXMethodDecl *MethodDecl = *Method;
+    if (Method->hasAttr<AnnotateAttr>() &&
+        Method->getAttr<AnnotateAttr>()->getAnnotation() == "__cxxamp_trampoline") {
+      TrampolineDecl = MethodDecl;
+      break;
+    }
+  }
+  assert(TrampolineDecl && "Trampoline not declared!");
+  GlobalDecl GD(TrampolineDecl);
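+  // Emit the mangled kernel name as a private constant string; the CPU-side
+  // runtime uses it to look the kernel up by name.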
+  llvm::Constant *S = llvm::ConstantDataArray::getString(CGM.getLLVMContext(),
+    CGM.getMangledName(GD));
+  llvm::GlobalVariable *GV = new llvm::GlobalVariable(CGM.getModule(), S->getType(),
+    true, llvm::GlobalValue::PrivateLinkage, S, "__cxxamp_trampoline.kernelname");
+  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+
+  // Create GetElementPtr(0, 0)
+  std::vector<llvm::Constant*> indices;
+  llvm::ConstantInt *zero = llvm::ConstantInt::get(CGM.getLLVMContext(), llvm::APInt(32, 0));
+  indices.push_back(zero);
+  indices.push_back(zero);
+  llvm::Constant *const_ptr = llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, indices);
+  CGF.Builder.CreateStore(const_ptr, CGF.ReturnValue);
+}
+} // namespace CodeGen
+} // namespace clang
diff --git a/lib/CodeGen/CGAMPRuntime.h b/lib/CodeGen/CGAMPRuntime.h
new file mode 100755
index 0000000000..cc0490820f
--- /dev/null
+++ b/lib/CodeGen/CGAMPRuntime.h
@@ -0,0 +1,54 @@
+//===----- CGAMPRuntime.h - Interface to C++ AMP Runtime --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides an abstract class for C++ AMP code generation.  Concrete
+// subclasses of this implement code generation for specific C++ AMP
+// runtime libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_CODEGEN_AMPRUNTIME_H
+#define CLANG_CODEGEN_AMPRUNTIME_H
+
+namespace clang {
+
+namespace CodeGen {
+
+class CodeGenFunction;
+class CodeGenModule;
+class FunctionArgList;
+class ReturnValueSlot;
+class RValue;
+
+class CGAMPRuntime {
+protected:
+  CodeGenModule &CGM;
+
+public:
+  CGAMPRuntime(CodeGenModule &CGM) : CGM(CGM) {}
+  virtual ~CGAMPRuntime();
+  virtual void EmitTrampolineBody(CodeGenFunction &CGF, const FunctionDecl *FD,
+                                  FunctionArgList &Args);
+  void EmitTrampolineNameBody(CodeGenFunction &CGF, const FunctionDecl *FD,
+                              FunctionArgList &Args);
+
+private:
+
+  void EmitCXXAMPDeserializer(CodeGenFunction &CGF,
+                              const FunctionDecl *Trampoline,
+                              FunctionArgList& Args, Address& ai);
+};
+
+/// Creates an instance of a C++ AMP runtime class.
+CGAMPRuntime *CreateAMPRuntime(CodeGenModule &CGM);
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif
diff --git a/lib/CodeGen/CGAtomic.cpp b/lib/CodeGen/CGAtomic.cpp
index a95cd12c2d..584e880a82 100644
--- a/lib/CodeGen/CGAtomic.cpp
+++ b/lib/CodeGen/CGAtomic.cpp
@@ -729,7 +729,8 @@ AddDirectArgument(CodeGenFunction &CGF, CallArgList &Args,
         CGF.getContext().getIntTypeForBitwidth(SizeInBits, /*Signed=*/false);
     llvm::Type *IPtrTy = llvm::IntegerType::get(CGF.getLLVMContext(),
                                                 SizeInBits)->getPointerTo();
-    Address Ptr = Address(CGF.Builder.CreateBitCast(Val, IPtrTy), Align);
+    Address Ptr = Address(
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Val, IPtrTy), Align);
     Val = CGF.EmitLoadOfScalar(Ptr, false,
                                CGF.getContext().getPointerType(ValTy),
                                Loc);
@@ -1190,14 +1191,16 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
 
       Builder.CreateStore(
           ResVal,
-          Builder.CreateBitCast(Dest, ResVal->getType()->getPointerTo()));
+          Builder.CreatePointerBitCastOrAddrSpaceCast(
+            Dest, ResVal->getType()->getPointerTo()));
     }
 
     if (RValTy->isVoidType())
       return RValue::get(nullptr);
 
     return convertTempToRValue(
-        Builder.CreateBitCast(Dest, ConvertTypeForMem(RValTy)->getPointerTo()),
+        Builder.CreatePointerBitCastOrAddrSpaceCast(
+          Dest, ConvertTypeForMem(RValTy)->getPointerTo()),
         RValTy, E->getExprLoc());
   }
 
@@ -1331,7 +1334,8 @@ Address AtomicInfo::emitCastToAtomicIntPointer(Address addr) const {
     cast<llvm::PointerType>(addr.getPointer()->getType())->getAddressSpace();
   llvm::IntegerType *ty =
     llvm::IntegerType::get(CGF.getLLVMContext(), AtomicSizeInBits);
-  return CGF.Builder.CreateBitCast(addr, ty->getPointerTo(addrspace));
+  return CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+    addr, ty->getPointerTo(addrspace));
 }
 
 Address AtomicInfo::convertToAtomicIntPointer(Address Addr) const {
@@ -1400,7 +1404,8 @@ RValue AtomicInfo::ConvertIntToValueOrAtomic(llvm::Value *IntVal,
     } else if (ValTy->isPointerTy())
       return RValue::get(CGF.Builder.CreateIntToPtr(IntVal, ValTy));
     else if (llvm::CastInst::isBitCastable(IntVal->getType(), ValTy))
-      return RValue::get(CGF.Builder.CreateBitCast(IntVal, ValTy));
+      return RValue::get(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+        IntVal, ValTy));
   }
 
   // Create a temporary.  This needs to be big enough to hold the
@@ -1586,7 +1591,8 @@ llvm::Value *AtomicInfo::convertRValueToInt(RValue RVal) const {
       if (isa<llvm::PointerType>(Value->getType()))
         return CGF.Builder.CreatePtrToInt(Value, InputIntTy);
       else if (llvm::BitCastInst::isBitCastable(Value->getType(), InputIntTy))
-        return CGF.Builder.CreateBitCast(Value, InputIntTy);
+        return CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+          Value, InputIntTy);
     }
   }
   // Otherwise, we need to go through memory.
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index cadce50741..3813c73a05 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -3497,7 +3497,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__GetExceptionInfo: {
     if (llvm::GlobalVariable *GV =
             CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
-      return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
+      return RValue::get(llvm::ConstantExpr::getPointerCast(GV, CGM.Int8PtrTy));
     break;
   }
 
@@ -3972,7 +3972,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
   }
   case Builtin::BIprintf:
-    if (getTarget().getTriple().isNVPTX())
+    if (getTarget().getTriple().isNVPTX() ||
+        (getTarget().getTriple().getArch() == Triple::amdgcn &&
+         getLangOpts().CUDA))
       return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
     break;
   case Builtin::BI__builtin_canonicalize:
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index 4d4038dae9..5d4d467cca 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -344,13 +344,21 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
   CharUnits Offset = CharUnits::Zero();
   for (const VarDecl *A : Args) {
+    auto *Arg = CGF.GetAddrOfLocalVar(A).getPointer();
     CharUnits TyWidth, TyAlign;
-    std::tie(TyWidth, TyAlign) =
-        CGM.getContext().getTypeInfoInChars(A->getType());
+    auto *Aux = CGM.getContext().getAuxTargetInfo();
+    if (Aux && Aux->getTriple().getArch() == llvm::Triple::amdgcn) {
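+      // For amdgcn offload, size and alignment come from the device data
+      // layout of the IR type rather than from the host AST type info.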
+      auto *ArgTy = Arg->getType()->getPointerElementType();
+      auto &DL = CGM.getDataLayout();
+      TyWidth = CharUnits::fromQuantity(DL.getTypeStoreSize(ArgTy));
+      TyAlign = CharUnits::fromQuantity(DL.getABITypeAlignment(ArgTy));
+    } else {
+      std::tie(TyWidth, TyAlign) =
+               CGM.getContext().getTypeInfoInChars(A->getType());
+    }
     Offset = Offset.alignTo(TyAlign);
     llvm::Value *Args[] = {
-        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
-                                      VoidPtrTy),
+        CGF.Builder.CreatePointerCast(Arg, VoidPtrTy),
         llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
     };
@@ -795,7 +803,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
 std::string CGNVCUDARuntime::getDeviceStubName(llvm::StringRef Name) const {
   if (!CGM.getLangOpts().HIP)
     return Name;
-  return (Name + ".stub").str();
+  return ("__device_stub_" + Name).str();
 }
 
 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index cf8024550e..ec408400b9 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -174,26 +174,42 @@ static void appendParameterTypes(const CodeGenTypes &CGT,
 static const CGFunctionInfo &
 arrangeLLVMFunctionInfo(CodeGenTypes &CGT, bool instanceMethod,
                         SmallVectorImpl<CanQualType> &prefix,
-                        CanQual<FunctionProtoType> FTP) {
+                        CanQual<FunctionProtoType> FTP, const FunctionDecl *FD) {
   SmallVector<FunctionProtoType::ExtParameterInfo, 16> paramInfos;
   RequiredArgs Required = RequiredArgs::forPrototypePlus(FTP, prefix.size());
   // FIXME: Kill copy.
   appendParameterTypes(CGT, prefix, paramInfos, FTP);
   CanQualType resultType = FTP->getReturnType().getUnqualifiedType();
 
+  // HCC (HIP) specific: for HIP we want __global__ functions to represent
+  // actual kernel entry-points, i.e. we want them to have AMDGPU_KERNEL as
+  // their calling convention. The only way to get that early enough appears to
+  // be the approach below, wherein we mark them as OpenCL kernels before the
+  // layout is generated. This is temporary and somewhat unpleasant; the
+  // correct solution would be to bubble up AMDGPU_KERNEL as a full-blown
+  // calling convention that can be used orthogonally to OpenCL, such as e.g.
+  // __stdcall.
+  FunctionType::ExtInfo Tmp = FTP->getExtInfo();
+
+  if (FD &&
+      FD->hasAttr<AnnotateAttr>() &&
+      FD->getAttr<AnnotateAttr>()->getAnnotation() ==
+        "__HIP_global_function__") {
+    Tmp = Tmp.withCallingConv(CallingConv::CC_OpenCLKernel);
+  }
+
   return CGT.arrangeLLVMFunctionInfo(resultType, instanceMethod,
                                      /*chainCall=*/false, prefix,
-                                     FTP->getExtInfo(), paramInfos,
+                                     Tmp, paramInfos,
                                      Required);
 }
 
 /// Arrange the argument and result information for a value of the
 /// given freestanding function type.
 const CGFunctionInfo &
-CodeGenTypes::arrangeFreeFunctionType(CanQual<FunctionProtoType> FTP) {
+CodeGenTypes::arrangeFreeFunctionType(CanQual<FunctionProtoType> FTP, const FunctionDecl *FD) {
   SmallVector<CanQualType, 16> argTypes;
   return ::arrangeLLVMFunctionInfo(*this, /*instanceMethod=*/false, argTypes,
-                                   FTP);
+                                   FTP, FD);
 }
 
 static CallingConv getCallingConventionForDecl(const Decl *D, bool IsWindows) {
@@ -257,7 +273,7 @@ CodeGenTypes::arrangeCXXMethodType(const CXXRecordDecl *RD,
 
   return ::arrangeLLVMFunctionInfo(
       *this, true, argTypes,
-      FTP->getCanonicalTypeUnqualified().getAs<FunctionProtoType>());
+      FTP->getCanonicalTypeUnqualified().getAs<FunctionProtoType>(), MD);
 }
 
 /// Set calling convention for CUDA/HIP kernel.
@@ -289,7 +305,7 @@ CodeGenTypes::arrangeCXXMethodDeclaration(const CXXMethodDecl *MD) {
     return arrangeCXXMethodType(ThisType, prototype.getTypePtr(), MD);
   }
 
-  return arrangeFreeFunctionType(prototype);
+  return arrangeFreeFunctionType(prototype, MD);
 }
 
 bool CodeGenTypes::inheritingCtorHasParams(
@@ -448,7 +464,7 @@ CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) {
         /*chainCall=*/false, None, noProto->getExtInfo(), {},RequiredArgs::All);
   }
 
-  return arrangeFreeFunctionType(FTy.castAs<FunctionProtoType>());
+  return arrangeFreeFunctionType(FTy.castAs<FunctionProtoType>(), FD);
 }
 
 /// Arrange the argument and result information for the declaration or
@@ -1059,14 +1075,13 @@ void CodeGenFunction::ExpandTypeToArgs(
   if (auto CAExp = dyn_cast<ConstantArrayExpansion>(Exp.get())) {
     Address Addr = Arg.hasLValue() ? Arg.getKnownLValue().getAddress()
                                    : Arg.getKnownRValue().getAggregateAddress();
-    forConstantArrayExpansion(
-        *this, CAExp, Addr, [&](Address EltAddr) {
-          CallArg EltArg = CallArg(
-              convertTempToRValue(EltAddr, CAExp->EltTy, SourceLocation()),
-              CAExp->EltTy);
-          ExpandTypeToArgs(CAExp->EltTy, EltArg, IRFuncTy, IRCallArgs,
-                           IRCallArgPos);
-        });
+    forConstantArrayExpansion(*this, CAExp, Addr, [&](Address EltAddr) {
+      CallArg EltArg =
+          CallArg(convertTempToRValue(EltAddr, CAExp->EltTy, SourceLocation()),
+                  CAExp->EltTy);
+      ExpandTypeToArgs(CAExp->EltTy, EltArg, IRFuncTy, IRCallArgs,
+                       IRCallArgPos);
+    });
   } else if (auto RExp = dyn_cast<RecordExpansion>(Exp.get())) {
     Address This = Arg.hasLValue() ? Arg.getKnownLValue().getAddress()
                                    : Arg.getKnownRValue().getAggregateAddress();
@@ -1254,8 +1269,10 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
 
   // Otherwise do coercion through memory. This is stupid, but simple.
   Address Tmp = CreateTempAllocaForCoercion(CGF, Ty, Src.getAlignment());
-  Address Casted = CGF.Builder.CreateElementBitCast(Tmp,CGF.Int8Ty);
-  Address SrcCasted = CGF.Builder.CreateElementBitCast(Src,CGF.Int8Ty);
+  Address Casted =
+    CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Tmp, CGF.AllocaInt8PtrTy);
+  Address SrcCasted =
+    CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Src, CGF.AllocaInt8PtrTy);
   CGF.Builder.CreateMemCpy(Casted, SrcCasted,
       llvm::ConstantInt::get(CGF.IntPtrTy, SrcSize),
       false);
@@ -1332,8 +1349,10 @@ static void CreateCoercedStore(llvm::Value *Src,
     // to that information.
     Address Tmp = CreateTempAllocaForCoercion(CGF, SrcTy, Dst.getAlignment());
     CGF.Builder.CreateStore(Src, Tmp);
-    Address Casted = CGF.Builder.CreateElementBitCast(Tmp,CGF.Int8Ty);
-    Address DstCasted = CGF.Builder.CreateElementBitCast(Dst,CGF.Int8Ty);
+    Address Casted =
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Tmp, CGF.AllocaInt8PtrTy);
+    Address DstCasted =
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Dst, CGF.AllocaInt8PtrTy);
     CGF.Builder.CreateMemCpy(DstCasted, Casted,
         llvm::ConstantInt::get(CGF.IntPtrTy, DstSize),
         false);
@@ -1621,8 +1640,8 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
       assert(NumIRArgs == 1);
       // indirect arguments are always on the stack, which is alloca addr space.
       llvm::Type *LTy = ConvertTypeForMem(it->type);
-      ArgTypes[FirstIRArg] = LTy->getPointerTo(
-          CGM.getDataLayout().getAllocaAddrSpace());
+      ArgTypes[FirstIRArg] =
+          LTy->getPointerTo(CGM.getDataLayout().getAllocaAddrSpace());
       break;
     }
 
@@ -1663,7 +1682,8 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
   bool Erased = FunctionsBeingProcessed.erase(&FI); (void)Erased;
   assert(Erased && "Not in set?");
 
-  return llvm::FunctionType::get(resultType, ArgTypes, FI.isVariadic());
+  auto *FT = llvm::FunctionType::get(resultType, ArgTypes, FI.isVariadic());
+  return FT;
 }
 
 llvm::Type *CodeGenTypes::GetFunctionTypeForVTable(GlobalDecl GD) {
@@ -1915,7 +1935,8 @@ void CodeGenModule::ConstructAttributeList(
   }
 
   if (TargetDecl && TargetDecl->hasAttr<OpenCLKernelAttr>()) {
-    if (getLangOpts().OpenCLVersion <= 120) {
+    if (getLangOpts().OpenCL &&
+        getLangOpts().OpenCLVersion <= 120) {
       // OpenCL v1.2 Work groups are always uniform
       FuncAttrs.addAttribute("uniform-work-group-size", "true");
     } else {
@@ -2324,8 +2345,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
           // copy.
           CharUnits Size = getContext().getTypeSizeInChars(Ty);
           auto SizeVal = llvm::ConstantInt::get(IntPtrTy, Size.getQuantity());
-          Address Dst = Builder.CreateBitCast(AlignedTemp, Int8PtrTy);
-          Address Src = Builder.CreateBitCast(ParamAddr, Int8PtrTy);
+          Address Dst = Builder.CreateBitCast(AlignedTemp, AllocaInt8PtrTy);
+          Address Src = Builder.CreateBitCast(ParamAddr, AllocaInt8PtrTy);
           Builder.CreateMemCpy(Dst, Src, SizeVal, false);
           V = AlignedTemp;
         }
@@ -2443,8 +2464,12 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
         // type in the function type. Since we are codegening the callee
         // in here, add a cast to the argument type.
         llvm::Type *LTy = ConvertType(Arg->getType());
-        if (V->getType() != LTy)
-          V = Builder.CreateBitCast(V, LTy);
+        if (V->getType() != LTy) {
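+          // A bool parameter arrives as i1; widen it to the in-memory type
+          // instead of bitcasting.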
+          if (V->getType()->isIntegerTy(1))
+            V = Builder.CreateZExt(V, LTy);
+          else
+            V = Builder.CreateBitCast(V, LTy);
+        }
 
         ArgVals.push_back(ParamValue::forDirect(V));
         break;
@@ -3829,19 +3854,19 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   if (llvm::StructType *ArgStruct = CallInfo.getArgStruct()) {
     const llvm::DataLayout &DL = CGM.getDataLayout();
     llvm::Instruction *IP = CallArgs.getStackBase();
-    llvm::AllocaInst *AI;
+    llvm::Instruction *CastedAI;
     if (IP) {
       IP = IP->getNextNode();
-      AI = new llvm::AllocaInst(ArgStruct, DL.getAllocaAddrSpace(),
-                                "argmem", IP);
+      CastedAI = CreateAlloca(ArgStruct, "argmem", IP);
     } else {
-      AI = CreateTempAlloca(ArgStruct, "argmem");
+      CastedAI = CreateTempAlloca(ArgStruct, "argmem");
     }
     auto Align = CallInfo.getArgStructAlignment();
+    auto *AI = getAddrSpaceCastedAlloca(CastedAI);
     AI->setAlignment(Align.getQuantity());
     AI->setUsedWithInAlloca(true);
     assert(AI->isUsedWithInAlloca() && !AI->isStaticAlloca());
-    ArgMemory = Address(AI, Align);
+    ArgMemory = Address(CastedAI, Align);
   }
 
   ClangToLLVMArgMapping IRFunctionArgs(CGM.getContext(), CallInfo);
@@ -3926,6 +3951,14 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     }
 
     case ABIArgInfo::Indirect: {
+      auto CastToAllocaAddrSpace = [&](llvm::Value *V) {
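+        // Byval arguments must live in the alloca address space; cast the
+        // pointer when the default address space differs from it.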
+        if (!ArgInfo.getIndirectByVal())
+          return V;
+        auto *T = V->getType()->getPointerElementType()->getPointerTo(
+            CGM.getDataLayout().getAllocaAddrSpace());
+        return getTargetHooks().performAddrSpaceCast(
+            *this, V, LangAS::Default, CGM.getASTAllocaAddressSpace(), T, true);
+      };
       assert(NumIRArgs == 1);
       if (!I->isAggregate()) {
         // Make a temporary alloca to pass the argument.
@@ -3950,6 +3983,14 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         CharUnits Align = ArgInfo.getIndirectAlign();
         const llvm::DataLayout *TD = &CGM.getDataLayout();
 
+        if (FirstIRArg < IRFuncTy->getNumParams() &&
+            IRFuncTy->getParamType(FirstIRArg)->getPointerAddressSpace() !=
+                TD->getAllocaAddrSpace()) {
+          llvm::errs() << *IRFuncTy << '\n' << *V
+                       << *(cast<llvm::Instruction>(V)->getParent()->getParent())
+                       << '\n';
+        }
+
         assert((FirstIRArg >= IRFuncTy->getNumParams() ||
                 IRFuncTy->getParamType(FirstIRArg)->getPointerAddressSpace() ==
                     TD->getAllocaAddrSpace()) &&
@@ -4046,11 +4087,12 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
             V->getType()->isIntegerTy())
           V = Builder.CreateZExt(V, ArgInfo.getCoerceToType());
 
-        // If the argument doesn't match, perform a bitcast to coerce it.  This
-        // can happen due to trivial type mismatches.
-        if (FirstIRArg < IRFuncTy->getNumParams() &&
+        // If the argument doesn't match, perform a bitcast or an addrspacecast
+        // to coerce it.  This can happen due to trivial type mismatches.
+        if (V->getType()->isPointerTy() &&
+            FirstIRArg < IRFuncTy->getNumParams() &&
             V->getType() != IRFuncTy->getParamType(FirstIRArg))
-          V = Builder.CreateBitCast(V, IRFuncTy->getParamType(FirstIRArg));
+          V = Builder.CreatePointerBitCastOrAddrSpaceCast(V, IRFuncTy->getParamType(FirstIRArg));
 
         IRCallArgs[FirstIRArg] = V;
         break;
@@ -4265,8 +4307,14 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     if (IRFunctionArgs.hasInallocaArg() &&
         i == IRFunctionArgs.getInallocaArgNo())
       continue;
-    if (i < IRFuncTy->getNumParams())
-      assert(IRCallArgs[i]->getType() == IRFuncTy->getParamType(i));
+    if (i < IRFuncTy->getNumParams()) {
+      if (IRCallArgs[i]->getType() != IRFuncTy->getParamType(i)) {
+        llvm::errs() << *CalleePtr << " arg" << i << ": "
+                     << *IRCallArgs[i] << " => " << *IRFuncTy->getParamType(i)
+                     << '\n';
+      }
+      assert(IRCallArgs[i]->getType() == IRFuncTy->getParamType(i));
+    }
   }
 #endif
 
@@ -4314,6 +4362,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     // exception is thrown during a cleanup outside of a try/catch.
     // We don't need to model anything in IR to get this behavior.
     CannotThrow = true;
+  } else if (CGM.getLangOpts().DevicePath) {
+    // Exceptions are not supported on the HCC device path.
+    CannotThrow = true;
   } else {
     // Otherwise, nounwind call sites will never throw.
     CannotThrow = Attrs.hasAttribute(llvm::AttributeList::FunctionIndex,
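
Many hunks in this patch replace plain `bitcast`s with `CreatePointerBitCastOrAddrSpaceCast` (or, for constants, `ConstantExpr::getPointerCast`), which degrade to an `addrspacecast` when the address spaces differ. A standalone sketch of the distinction (address-space numbers are illustrative):

```cpp
// Sketch only: CreatePointerBitCastOrAddrSpaceCast emits a bitcast when just
// the pointee type changes and an addrspacecast when the address spaces
// differ. ConstantExpr::getPointerCast is the constant-expression analogue.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("casts", Ctx);
  auto *FnTy = llvm::FunctionType::get(
      llvm::Type::getVoidTy(Ctx),
      {llvm::Type::getInt32Ty(Ctx)->getPointerTo(/*AddrSpace=*/5)}, false);
  auto *Fn = llvm::Function::Create(FnTy, llvm::Function::ExternalLinkage,
                                    "demo", M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", Fn));
  llvm::Value *P = &*Fn->arg_begin(); // i32 addrspace(5)*

  // Same address space, different pointee type: a plain bitcast.
  B.CreatePointerBitCastOrAddrSpaceCast(
      P, llvm::Type::getInt8Ty(Ctx)->getPointerTo(5), "as.bitcast");
  // Different address space: an addrspacecast.
  B.CreatePointerBitCastOrAddrSpaceCast(
      P, llvm::Type::getInt8Ty(Ctx)->getPointerTo(0), "as.ascast");
  B.CreateRetVoid();
  M.print(llvm::outs(), nullptr);
}
```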
diff --git a/lib/CodeGen/CGClass.cpp b/lib/CodeGen/CGClass.cpp
index c8bb63c5c4..94812768d9 100644
--- a/lib/CodeGen/CGClass.cpp
+++ b/lib/CodeGen/CGClass.cpp
@@ -381,8 +381,8 @@ CodeGenFunction::GetAddressOfDerivedClass(Address BaseAddr,
 
   QualType DerivedTy =
     getContext().getCanonicalType(getContext().getTagDeclType(Derived));
-  llvm::Type *DerivedPtrTy = ConvertType(DerivedTy)->getPointerTo();
 
+  llvm::Type *DerivedPtrTy = ConvertType(DerivedTy)->getPointerTo();
   llvm::Value *NonVirtualOffset =
     CGM.GetNonVirtualBaseClassOffset(Derived, PathBegin, PathEnd);
 
@@ -2488,8 +2488,10 @@ void CodeGenFunction::InitializeVTablePointer(const VPtr &Vptr) {
       llvm::FunctionType::get(CGM.Int32Ty, /*isVarArg=*/true)
           ->getPointerTo()
           ->getPointerTo();
-  VTableField = Builder.CreateBitCast(VTableField, VTablePtrTy->getPointerTo());
-  VTableAddressPoint = Builder.CreateBitCast(VTableAddressPoint, VTablePtrTy);
+  VTableField = Builder.CreatePointerBitCastOrAddrSpaceCast(VTableField,
+      VTablePtrTy->getPointerTo());
+  VTableAddressPoint = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      VTableAddressPoint, VTablePtrTy);
 
   llvm::StoreInst *Store = Builder.CreateStore(VTableAddressPoint, VTableField);
   TBAAAccessInfo TBAAInfo = CGM.getTBAAVTablePtrAccessInfo(VTablePtrTy);
diff --git a/lib/CodeGen/CGDebugInfo.cpp b/lib/CodeGen/CGDebugInfo.cpp
index f6ee7ee26d..e006922404 100644
--- a/lib/CodeGen/CGDebugInfo.cpp
+++ b/lib/CodeGen/CGDebugInfo.cpp
@@ -1742,10 +1742,13 @@ CGDebugInfo::CollectTemplateParams(const TemplateParameterList *TPList,
       QualType T = TA.getParamTypeForDecl().getDesugaredType(CGM.getContext());
       llvm::DIType *TTy = getOrCreateType(T, Unit);
       llvm::Constant *V = nullptr;
-      // Skip retrieve the value if that template parameter has cuda device
+      // Skip retrieving the value if that template parameter has a cuda/hcc device
       // attribute, i.e. that value is not available at the host side.
-      if (!CGM.getLangOpts().CUDA || CGM.getLangOpts().CUDAIsDevice ||
-          !D->hasAttr<CUDADeviceAttr>()) {
+      if ((!CGM.getLangOpts().CUDA || CGM.getLangOpts().CUDAIsDevice ||
+           !D->hasAttr<CUDADeviceAttr>()) &&
+          (!CGM.getLangOpts().CPlusPlusAMP || CGM.getLangOpts().DevicePath ||
+           (!D->hasAttr<CXXAMPRestrictAMPAttr>() &&
+            !D->hasAttr<HC_HCAttr>()))) {
         const CXXMethodDecl *MD;
         // Variable pointer template parameters have a value that is the address
         // of the variable.
diff --git a/lib/CodeGen/CGDecl.cpp b/lib/CodeGen/CGDecl.cpp
index 6ad43cefc4..e7a3b515ae 100644
--- a/lib/CodeGen/CGDecl.cpp
+++ b/lib/CodeGen/CGDecl.cpp
@@ -240,9 +240,10 @@ llvm::Constant *CodeGenModule::getOrCreateStaticVarDecl(
 
   // OpenCL variables in local address space and CUDA shared
   // variables cannot have an initializer.
+  // HCC tile_static variables cannot have an initializer.
   llvm::Constant *Init = nullptr;
   if (Ty.getAddressSpace() == LangAS::opencl_local ||
-      D.hasAttr<CUDASharedAttr>())
+      D.hasAttr<CUDASharedAttr>() || D.hasAttr<HCCTileStaticAttr>())
     Init = llvm::UndefValue::get(LTy);
   else
     Init = EmitNullConstant(Ty);
@@ -262,6 +263,12 @@ llvm::Constant *CodeGenModule::getOrCreateStaticVarDecl(
 
   // Make sure the result is of the correct type.
   LangAS ExpectedAS = Ty.getAddressSpace();
+
+  // Pointers to HCC tile_static variables are in the generic address space.
+  if (D.hasAttr<HCCTileStaticAttr>()) {
+    ExpectedAS = LangAS::hcc_generic;
+  }
+
   llvm::Constant *Addr = GV;
   if (AS != ExpectedAS) {
     Addr = getTargetCodeGenInfo().performAddrSpaceCast(
@@ -412,8 +419,10 @@ void CodeGenFunction::EmitStaticVarDecl(const VarDecl &D,
   // a no-op and should not be emitted.
   bool isCudaSharedVar = getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
                          D.hasAttr<CUDASharedAttr>();
+  bool isHCCAcceleratorPath =
+      getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath;
   // If this value has an initializer, emit it.
-  if (D.getInit() && !isCudaSharedVar)
+  if (D.getInit() && !isCudaSharedVar && !isHCCAcceleratorPath)
     var = AddInitializerToStaticVarDecl(D, var);
 
   var->setAlignment(alignment.getQuantity());
@@ -1556,6 +1565,18 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
     EmitAndRegisterVariableArrayDimensions(DI, D, EmitDebugInfo);
   }
 
+  auto T = D.getType();
+  assert(T.getAddressSpace() == LangAS::Default ||
+         T.getAddressSpace() == LangAS::opencl_private);
+  if (getASTAllocaAddressSpace() != LangAS::Default) {
+    auto *Addr = getTargetHooks().performAddrSpaceCast(
+        *this, address.getPointer(), getASTAllocaAddressSpace(),
+        T.getAddressSpace(),
+        address.getElementType()->getPointerTo(
+            getContext().getTargetAddressSpace(T.getAddressSpace())),
+        /*non-null*/ true);
+    address = Address(Addr, address.getAlignment());
+  }
   setAddrOfLocalVar(&D, address);
   emission.Addr = address;
   emission.AllocaAddr = AllocaAddr;
@@ -2382,6 +2403,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
     llvm::Type *IRTy = ConvertTypeForMem(Ty)->getPointerTo(AS);
     if (DeclPtr.getType() != IRTy)
       DeclPtr = Builder.CreateBitCast(DeclPtr, IRTy, D.getName());
+
     // Indirect argument is in alloca address space, which may be different
     // from the default address space.
     auto AllocaAS = CGM.getASTAllocaAddressSpace();
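
For reference, a hypothetical HCC fragment that reaches the tile_static paths above (API names follow the hc.hpp runtime; treat this as a sketch, it is not part of the patch). On the device path `lds` receives an undef initializer, is placed in `LangAS::hcc_tilestatic`, and pointers to it are in the generic address space:

```cpp
// Hypothetical usage sketch (assumes the hc.hpp runtime API).
#include <hc.hpp>

void tiled_copy(hc::array_view<float, 1> data) {
  hc::parallel_for_each(
      data.get_extent().tile(256),
      [=](hc::tiled_index<1> idx) [[hc]] {
        tile_static float lds[256];   // no initializer is allowed
        lds[idx.local[0]] = data[idx.global[0]];
        idx.barrier.wait();           // make the stores visible to the tile
        data[idx.global[0]] = lds[255 - idx.local[0]];
      });
}
```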
diff --git a/lib/CodeGen/CGDeclCXX.cpp b/lib/CodeGen/CGDeclCXX.cpp
index 7a0605b845..5681ec4800 100644
--- a/lib/CodeGen/CGDeclCXX.cpp
+++ b/lib/CodeGen/CGDeclCXX.cpp
@@ -445,8 +445,9 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D,
        D->hasAttr<CUDASharedAttr>()))
     return;
 
-  if (getLangOpts().OpenMP &&
-      getOpenMPRuntime().emitDeclareTargetVarDefinition(D, Addr, PerformInit))
+  if ((getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath &&
+       D->hasAttr<HCCTileStaticAttr>()) ||
+      (getLangOpts().OpenMP &&
+       getOpenMPRuntime().emitDeclareTargetVarDefinition(D, Addr, PerformInit)))
     return;
 
   // Check if we've already initialized this decl.
diff --git a/lib/CodeGen/CGException.cpp b/lib/CodeGen/CGException.cpp
index 3b7a88a0b7..16a593dd1c 100644
--- a/lib/CodeGen/CGException.cpp
+++ b/lib/CodeGen/CGException.cpp
@@ -1726,7 +1726,7 @@ Address CodeGenFunction::recoverAddrOfEscapedLocal(CodeGenFunction &ParentCGF,
     llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration(
         &CGM.getModule(), llvm::Intrinsic::localrecover);
     llvm::Constant *ParentI8Fn =
-        llvm::ConstantExpr::getBitCast(ParentCGF.CurFn, Int8PtrTy);
+        llvm::ConstantExpr::getPointerCast(ParentCGF.CurFn, Int8PtrTy);
     RecoverCall = Builder.CreateCall(
         FrameRecoverFn, {ParentI8Fn, ParentFP,
                          llvm::ConstantInt::get(Int32Ty, FrameEscapeIdx)});
@@ -1791,7 +1791,7 @@ void CodeGenFunction::EmitCapturedLocals(CodeGenFunction &ParentCGF,
     llvm::Function *RecoverFPIntrin =
         CGM.getIntrinsic(llvm::Intrinsic::eh_recoverfp);
     llvm::Constant *ParentI8Fn =
-        llvm::ConstantExpr::getBitCast(ParentCGF.CurFn, Int8PtrTy);
+        llvm::ConstantExpr::getPointerCast(ParentCGF.CurFn, Int8PtrTy);
     ParentFP = Builder.CreateCall(RecoverFPIntrin, {ParentI8Fn, EntryFP});
   }
 
@@ -2025,7 +2025,7 @@ void CodeGenFunction::EnterSEHTryStmt(const SEHTryStmt &S) {
   llvm::Function *FilterFunc =
       HelperCGF.GenerateSEHFilterFunction(*this, *Except);
   llvm::Constant *OpaqueFunc =
-      llvm::ConstantExpr::getBitCast(FilterFunc, Int8PtrTy);
+      llvm::ConstantExpr::getPointerCast(FilterFunc, Int8PtrTy);
   CatchScope->setHandler(0, OpaqueFunc, createBasicBlock("__except.ret"));
 }
 
diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp
index 5a4b1188b7..fc5d52887e 100644
--- a/lib/CodeGen/CGExpr.cpp
+++ b/lib/CodeGen/CGExpr.cpp
@@ -112,6 +112,20 @@ llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty,
                               ArraySize, Name, AllocaInsertPt);
 }
 
+llvm::Instruction *CodeGenFunction::CreateAlloca(llvm::Type *Ty,
+                                                 const Twine &Name,
+                                                 llvm::Instruction *InsertPos) {
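+  // The alloca is always created in the target's alloca address space;
+  // callers needing the raw AllocaInst (e.g. to set its alignment) unwrap
+  // any addrspacecast via getAddrSpaceCastedAlloca.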
+  return new llvm::AllocaInst(Ty,
+      CGM.getDataLayout().getAllocaAddrSpace(), nullptr, Name, InsertPos);
+}
+
+llvm::AllocaInst *
+CodeGenFunction::getAddrSpaceCastedAlloca(llvm::Instruction *V) const {
+  if (auto *Cast = dyn_cast<llvm::AddrSpaceCastInst>(V))
+    return cast<llvm::AllocaInst>(Cast->getOperand(0));
+  return cast<llvm::AllocaInst>(V);
+}
+
 /// CreateDefaultAlignTempAlloca - This creates an alloca with the
 /// default alignment of the corresponding LLVM type, which is *not*
 /// guaranteed to be related in any way to the expected alignment of
@@ -430,9 +444,8 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) {
       ownership != Qualifiers::OCL_ExplicitNone) {
     Address Object = createReferenceTemporary(*this, M, E);
     if (auto *Var = dyn_cast<llvm::GlobalVariable>(Object.getPointer())) {
-      Object = Address(llvm::ConstantExpr::getBitCast(Var,
-                           ConvertTypeForMem(E->getType())
-                             ->getPointerTo(Object.getAddressSpace())),
+      Object = Address(llvm::ConstantExpr::getPointerCast(
+          Var, getTypes().getPointerTypeTo(E->getType())),
                        Object.getAlignment());
 
       // createReferenceTemporary will promote the temporary to a global with a
@@ -3336,8 +3349,9 @@ Address CodeGenFunction::EmitArrayToPointerDecay(const Expr *E,
   QualType EltType = E->getType()->castAsArrayTypeUnsafe()->getElementType();
   if (BaseInfo) *BaseInfo = LV.getBaseInfo();
   if (TBAAInfo) *TBAAInfo = CGM.getTBAAAccessInfo(EltType);
-
-  return Builder.CreateElementBitCast(Addr, ConvertTypeForMem(EltType));
+  return Builder.CreatePointerBitCastOrAddrSpaceCast(
+      Addr, ConvertTypeForMem(EltType)->getPointerTo(
+                getContext().getTargetAddressSpace(E->getType())));
 }
 
 /// isSimpleArrayDecayOperand - If the specified expr is a simple decay from an
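
`CreateAlloca` and `getAddrSpaceCastedAlloca` exist because on AMDGPU an `alloca` yields an addrspace(5) pointer while most of CodeGen expects generic pointers. A standalone sketch of the pattern (the `A5` data-layout fragment mirrors the AMDGPU layout):

```cpp
// Sketch only: allocate in the target's alloca address space, then hand the
// rest of CodeGen a generic pointer via addrspacecast.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("alloca_as", Ctx);
  M.setDataLayout("A5"); // allocas live in address space 5
  auto *FnTy = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), false);
  auto *Fn = llvm::Function::Create(FnTy, llvm::Function::ExternalLinkage,
                                    "f", M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", Fn));

  unsigned AllocaAS = M.getDataLayout().getAllocaAddrSpace(); // 5
  auto *AI = B.CreateAlloca(llvm::Type::getInt32Ty(Ctx), AllocaAS,
                            /*ArraySize=*/nullptr, "tmp");
  // Consumers get the generic pointer; helpers in the style of
  // getAddrSpaceCastedAlloca strip this cast when the raw AllocaInst is
  // needed again (setAlignment, setUsedWithInAlloca, ...).
  B.CreateAddrSpaceCast(AI, llvm::Type::getInt32Ty(Ctx)->getPointerTo(0),
                        "tmp.ascast");
  B.CreateRetVoid();
  M.print(llvm::outs(), nullptr);
}
```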
diff --git a/lib/CodeGen/CGExprComplex.cpp b/lib/CodeGen/CGExprComplex.cpp
index 6a5fb45ba2..ab798a9046 100644
--- a/lib/CodeGen/CGExprComplex.cpp
+++ b/lib/CodeGen/CGExprComplex.cpp
@@ -727,8 +727,20 @@ ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) {
       // Now emit the libcall on this slowest of the slow paths.
       CGF.EmitBlock(LibCallBB);
       Value *LibCallR, *LibCallI;
-      std::tie(LibCallR, LibCallI) = EmitComplexBinOpLibCall(
-          getComplexMultiplyLibCallName(Op.LHS.first->getType()), Op);
+      if (CGF.CGM.getLangOpts().CPlusPlusAMP &&
+          CGF.CGM.getLangOpts().DevicePath) {
+        // TODO: HCC 23/09/2017 - we cannot call the builtin functions which
+        //       handle NaNs robustly and are provided in Compiler-RT on the
+        //       accelerator path. Ideally we will want to add our own
+        //       customised ones, but until such a solution is implemented we
+        //       merely propagate NaNs. Division is in a similar situation.
+        LibCallR = ResR;
+        LibCallI = ResI;
+      } else {
+        std::tie(LibCallR, LibCallI) = EmitComplexBinOpLibCall(
+                getComplexMultiplyLibCallName(Op.LHS.first->getType()), Op);
+      }
       Builder.CreateBr(ContBB);
 
       // Finally continue execution by phi-ing together the different
@@ -769,6 +781,36 @@ ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) {
   return ComplexPairTy(ResR, ResI);
 }
 
+namespace
+{
+  inline
+  void EmitHCCComplexFloatDivision(
+    CGBuilderTy& Builder,
+    llvm::Value *&LHSr,
+    llvm::Value *&LHSi,
+    llvm::Value *&RHSr,
+    llvm::Value *&RHSi)
+  { // TODO: this is simplistic and should be removed as
+    //       soon as possible.
+    // (a+ib) / (c+id) = ((ac+bd)/(cc+dd)) + i((bc-ad)/(cc+dd))
+    llvm::Value *Tmp1 = Builder.CreateFMul(LHSr, RHSr); // a*c
+    llvm::Value *Tmp2 = Builder.CreateFMul(LHSi, RHSi); // b*d
+    llvm::Value *Tmp3 = Builder.CreateFAdd(Tmp1, Tmp2); // ac+bd
+
+    llvm::Value *Tmp4 = Builder.CreateFMul(RHSr, RHSr); // c*c
+    llvm::Value *Tmp5 = Builder.CreateFMul(RHSi, RHSi); // d*d
+    llvm::Value *Tmp6 = Builder.CreateFAdd(Tmp4, Tmp5); // cc+dd
+
+    llvm::Value *Tmp7 = Builder.CreateFMul(LHSi, RHSr); // b*c
+    llvm::Value *Tmp8 = Builder.CreateFMul(LHSr, RHSi); // a*d
+    llvm::Value *Tmp9 = Builder.CreateFSub(Tmp7, Tmp8); // bc-ad
+
+    LHSr = Tmp3;
+    LHSi = Tmp9;
+    RHSr = Tmp6;
+  }
+}
+
 // See C11 Annex G.5.1 for the semantics of multiplicative operators on complex
 // typed values.
 ComplexPairTy ComplexExprEmitter::EmitBinDiv(const BinOpInfo &Op) {
@@ -790,21 +832,36 @@ ComplexPairTy ComplexExprEmitter::EmitBinDiv(const BinOpInfo &Op) {
       if (!LHSi)
         LibCallOp.LHS.second = llvm::Constant::getNullValue(LHSr->getType());
 
-      switch (LHSr->getType()->getTypeID()) {
-      default:
-        llvm_unreachable("Unsupported floating point type!");
-      case llvm::Type::HalfTyID:
-        return EmitComplexBinOpLibCall("__divhc3", LibCallOp);
-      case llvm::Type::FloatTyID:
-        return EmitComplexBinOpLibCall("__divsc3", LibCallOp);
-      case llvm::Type::DoubleTyID:
-        return EmitComplexBinOpLibCall("__divdc3", LibCallOp);
-      case llvm::Type::PPC_FP128TyID:
-        return EmitComplexBinOpLibCall("__divtc3", LibCallOp);
-      case llvm::Type::X86_FP80TyID:
-        return EmitComplexBinOpLibCall("__divxc3", LibCallOp);
-      case llvm::Type::FP128TyID:
-        return EmitComplexBinOpLibCall("__divtc3", LibCallOp);
+      StringRef LibCallName;
+      if (CGF.CGM.getLangOpts().CPlusPlusAMP &&
+          CGF.CGM.getLangOpts().DevicePath) {
+        // TODO: HCC 23/09/2017 - we cannot call the builtin functions, which
+        //       handle NaNs and INFs robustly and are provided in Compiler-RT,
+        //       on the accelerator path. Ideally we will want to add our own
+        //       customised ones, but until such a solution is implemented we
+        //       do not handle special values. Multiplication is in a similar
+        //       situation.
+        EmitHCCComplexFloatDivision(Builder, LHSr, LHSi, RHSr, RHSi);
+        DSTr = Builder.CreateFDiv(LHSr, RHSr);
+        DSTi = Builder.CreateFDiv(LHSi, RHSr);
+      } else {
+        switch (LHSr->getType()->getTypeID()) {
+          default:
+            llvm_unreachable("Unsupported floating point type!");
+          case llvm::Type::HalfTyID:
+            return EmitComplexBinOpLibCall("__divhc3", LibCallOp);
+          case llvm::Type::FloatTyID:
+            return EmitComplexBinOpLibCall("__divsc3", LibCallOp);
+          case llvm::Type::DoubleTyID:
+            return EmitComplexBinOpLibCall("__divdc3", LibCallOp);
+          case llvm::Type::PPC_FP128TyID:
+            return EmitComplexBinOpLibCall("__divtc3", LibCallOp);
+          case llvm::Type::X86_FP80TyID:
+            return EmitComplexBinOpLibCall("__divxc3", LibCallOp);
+          case llvm::Type::FP128TyID:
+            return EmitComplexBinOpLibCall("__divtc3", LibCallOp);
+        }
       }
     } else if (RHSi) {
       if (!LHSi)
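
`EmitHCCComplexFloatDivision` expands the textbook identity (a+ib)/(c+id) = ((ac+bd) + i(bc-ad))/(c^2+d^2) with no Smith-style scaling and no NaN/Inf handling, which is exactly the limitation the TODO describes. A host-side check of the identity (plain C++, not part of the patch):

```cpp
// Host-side sketch: the same expansion EmitHCCComplexFloatDivision emits,
// checked against std::complex for ordinary (finite, non-overflowing) values.
#include <complex>
#include <cstdio>

static std::complex<float> naive_div(std::complex<float> lhs,
                                     std::complex<float> rhs) {
  float a = lhs.real(), b = lhs.imag(), c = rhs.real(), d = rhs.imag();
  float denom = c * c + d * d;             // cc+dd
  return {(a * c + b * d) / denom,         // (ac+bd)/(cc+dd)
          (b * c - a * d) / denom};        // (bc-ad)/(cc+dd)
}

int main() {
  std::complex<float> lhs(1.0f, 2.0f), rhs(3.0f, 4.0f);
  std::complex<float> n = naive_div(lhs, rhs); // (0.44, 0.08)
  std::complex<float> s = lhs / rhs;
  std::printf("naive: (%g, %g)  std: (%g, %g)\n",
              n.real(), n.imag(), s.real(), s.imag());
}
```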
diff --git a/lib/CodeGen/CGExprScalar.cpp b/lib/CodeGen/CGExprScalar.cpp
index 3d082de2a1..d482f7d818 100644
--- a/lib/CodeGen/CGExprScalar.cpp
+++ b/lib/CodeGen/CGExprScalar.cpp
@@ -420,8 +420,15 @@ public:
   //===--------------------------------------------------------------------===//
 
   Value *Visit(Expr *E) {
+    if (getenv("DBG_CG_SCALAR_EXPR")) {
+      llvm::errs() << "Expr: "; E->dump();
+    }
     ApplyDebugLocation DL(CGF, E);
-    return StmtVisitor<ScalarExprEmitter, Value*>::Visit(E);
+    auto Res = StmtVisitor<ScalarExprEmitter, Value*>::Visit(E);
+    if (getenv("DBG_CG_SCALAR_EXPR")) {
+      // Some visitors legitimately return null (e.g. for void expressions),
+      // so guard before dereferencing.
+      if (Res)
+        llvm::errs() << " => " << *Res << '\n';
+      else
+        llvm::errs() << " => (null)\n";
+    }
+    return Res;
   }
 
   Value *VisitStmt(Stmt *S) {
diff --git a/lib/CodeGen/CGGPUBuiltin.cpp b/lib/CodeGen/CGGPUBuiltin.cpp
index d7e2676307..40d46973a5 100644
--- a/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/lib/CodeGen/CGGPUBuiltin.cpp
@@ -20,9 +20,9 @@
 using namespace clang;
 using namespace CodeGen;
 
-static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
-  llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
-                            llvm::Type::getInt8PtrTy(M.getContext())};
+static llvm::Function *GetVprintfDeclaration(CodeGenModule &CGM) {
+  auto &M = CGM.getModule();
+  llvm::Type *ArgTypes[] = {CGM.Int8PtrTy, CGM.Int8PtrTy};
   llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
       llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
 
@@ -68,12 +68,13 @@ static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
 RValue
 CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
                                                ReturnValueSlot ReturnValue) {
-  assert(getTarget().getTriple().isNVPTX());
+  assert(getTarget().getTriple().isNVPTX() ||
+        (getTarget().getTriple().getArch() == llvm::Triple::amdgcn &&
+         getLangOpts().CUDA));
   assert(E->getBuiltinCallee() == Builtin::BIprintf);
   assert(E->getNumArgs() >= 1); // printf always has at least one arg.
 
   const llvm::DataLayout &DL = CGM.getDataLayout();
-  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
 
   CallArgList Args;
   EmitCallArgs(Args,
@@ -93,7 +94,7 @@ CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
   llvm::Value *BufferPtr;
   if (Args.size() <= 1) {
     // If there are no args, pass a null pointer to vprintf.
-    BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
+    BufferPtr = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
   } else {
     llvm::SmallVector<llvm::Type *, 8> ArgTypes;
     for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I)
@@ -112,11 +113,11 @@ CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
       llvm::Value *Arg = Args[I].getRValue(*this).getScalarVal();
       Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlignment(Arg->getType()));
     }
-    BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
+    BufferPtr = Builder.CreatePointerCast(Alloca, CGM.Int8PtrTy);
   }
 
   // Invoke vprintf and return.
-  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM);
   return RValue::get(Builder.CreateCall(
       VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
 }
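
For context, the function above packs the variadic arguments into a single stack buffer whose implicit struct layout vprintf decodes: each argument lands at the next offset aligned to its preferred alignment. A plain-C++ sketch of that layout computation (sizes and alignments are illustrative host values, not DataLayout queries):

```cpp
// Sketch only: compute the offsets at which vprintf expects each argument.
#include <cstddef>
#include <cstdio>
#include <vector>

struct ArgInfo { const char *name; size_t size; size_t align; };

int main() {
  // e.g. printf("%d %f %p\n", int, double, void*)
  std::vector<ArgInfo> args = {
      {"int", 4, 4}, {"double", 8, 8}, {"void*", 8, 8}};
  size_t offset = 0;
  for (const ArgInfo &a : args) {
    offset = (offset + a.align - 1) & ~(a.align - 1); // round up to alignment
    std::printf("%-7s at offset %zu\n", a.name, offset);
    offset += a.size;
  }
  std::printf("buffer size: %zu bytes\n", offset);
}
```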
diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 27e7175da8..2eaa029fe7 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1519,7 +1519,7 @@ Address CGOpenMPRuntime::getOrCreateDefaultLocation(unsigned Flags) {
       DefaultOpenMPPSource =
           CGM.GetAddrOfConstantCString(";unknown;unknown;0;0;;").getPointer();
       DefaultOpenMPPSource =
-          llvm::ConstantExpr::getBitCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
+          llvm::ConstantExpr::getPointerCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
     }
 
     llvm::Constant *Data[] = {
diff --git a/lib/CodeGen/CGRecordLayoutBuilder.cpp b/lib/CodeGen/CGRecordLayoutBuilder.cpp
index 4de64a32f2..73e4396edc 100644
--- a/lib/CodeGen/CGRecordLayoutBuilder.cpp
+++ b/lib/CodeGen/CGRecordLayoutBuilder.cpp
@@ -611,6 +611,19 @@ void CGRecordLowering::clipTailPadding() {
   }
 }
 
+static bool isPassedToHIPGlobalFn(const CXXRecordDecl *MaybeKernarg)
+{
+  if (!MaybeKernarg) return false;
+  if (!MaybeKernarg->hasAttr<AnnotateAttr>()) return false;
+
+  // N.B.: this is set in Sema::GatherArgumentsForCall, via
+  //       MarkByValueRecordsPassedToHIPGlobalFN.
+  static constexpr const char HIPKernargRecord[]{"__HIP_KERNARG_RECORD__"};
+
+  return MaybeKernarg->getAttr<AnnotateAttr>()->getAnnotation()
+    .find(HIPKernargRecord) != StringRef::npos;
+}
+
 void CGRecordLowering::determinePacked(bool NVBaseType) {
   if (Packed)
     return;
diff --git a/lib/CodeGen/CGVTT.cpp b/lib/CodeGen/CGVTT.cpp
index e79f3f3dd8..8c316b9141 100644
--- a/lib/CodeGen/CGVTT.cpp
+++ b/lib/CodeGen/CGVTT.cpp
@@ -83,7 +83,7 @@ CodeGenVTables::EmitVTTDefinition(llvm::GlobalVariable *VTT,
          VTable->getValueType(), VTable, Idxs, /*InBounds=*/true,
          /*InRangeIndex=*/1);
 
-     Init = llvm::ConstantExpr::getBitCast(Init, Int8PtrTy);
+     Init = llvm::ConstantExpr::getPointerCast(Init, Int8PtrTy);
 
      VTTComponents.push_back(Init);
   }
diff --git a/lib/CodeGen/CGVTables.cpp b/lib/CodeGen/CGVTables.cpp
index 3cb3d35448..70042ba661 100644
--- a/lib/CodeGen/CGVTables.cpp
+++ b/lib/CodeGen/CGVTables.cpp
@@ -608,7 +608,7 @@ void CodeGenVTables::addVTableComponent(
     return addOffsetConstant(component.getOffsetToTop());
 
   case VTableComponent::CK_RTTI:
-    return builder.add(llvm::ConstantExpr::getBitCast(rtti, CGM.Int8PtrTy));
+    return builder.add(llvm::ConstantExpr::getPointerCast(rtti, CGM.Int8PtrTy));
 
   case VTableComponent::CK_FunctionPointer:
   case VTableComponent::CK_CompleteDtorPointer:
@@ -653,7 +653,7 @@ void CodeGenVTables::addVTableComponent(
           CGM.CreateRuntimeFunction(fnTy, name).getCallee());
       if (auto f = dyn_cast<llvm::Function>(fn))
         f->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-      return llvm::ConstantExpr::getBitCast(fn, CGM.Int8PtrTy);
+      return llvm::ConstantExpr::getPointerCast(fn, CGM.Int8PtrTy);
     };
 
     llvm::Constant *fnPtr;
@@ -686,7 +686,7 @@ void CodeGenVTables::addVTableComponent(
       fnPtr = CGM.GetAddrOfFunction(GD, fnTy, /*ForVTable=*/true);
     }
 
-    fnPtr = llvm::ConstantExpr::getBitCast(fnPtr, CGM.Int8PtrTy);
+    fnPtr = llvm::ConstantExpr::getPointerCast(fnPtr, CGM.Int8PtrTy);
     builder.add(fnPtr);
     return;
   }
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 6d1f33b892..febd76ab1f 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -39,6 +39,7 @@ endif()
 
 add_clang_library(clangCodeGen
   BackendUtil.cpp
+  CGAMPRuntime.cpp
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
diff --git a/lib/CodeGen/CodeGenABITypes.cpp b/lib/CodeGen/CodeGenABITypes.cpp
index 6b6a116cf2..4854dec16d 100644
--- a/lib/CodeGen/CodeGenABITypes.cpp
+++ b/lib/CodeGen/CodeGenABITypes.cpp
@@ -34,8 +34,9 @@ CodeGen::arrangeObjCMessageSendSignature(CodeGenModule &CGM,
 
 const CGFunctionInfo &
 CodeGen::arrangeFreeFunctionType(CodeGenModule &CGM,
-                                 CanQual<FunctionProtoType> Ty) {
-  return CGM.getTypes().arrangeFreeFunctionType(Ty);
+                                 CanQual<FunctionProtoType> Ty,
+                                 const FunctionDecl *FD) {
+  return CGM.getTypes().arrangeFreeFunctionType(Ty, FD);
 }
 
 const CGFunctionInfo &
diff --git a/lib/CodeGen/CodeGenAction.cpp b/lib/CodeGen/CodeGenAction.cpp
index 0ae9ea427d..1ca4414149 100644
--- a/lib/CodeGen/CodeGenAction.cpp
+++ b/lib/CodeGen/CodeGenAction.cpp
@@ -263,6 +263,10 @@ namespace clang {
       Ctx.setDiagnosticHandler(llvm::make_unique<ClangDiagnosticHandler>(
         CodeGenOpts, this));
 
+      PerformPrelinkPasses(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts,
+                           LangOpts, C.getTargetInfo().getDataLayout(),
+                           getModule(), Action);
+
       Expected<std::unique_ptr<llvm::ToolOutputFile>> OptRecordFileOrErr =
           setupOptimizationRemarks(Ctx, CodeGenOpts.OptRecordFile,
                                    CodeGenOpts.OptRecordPasses,
@@ -302,7 +306,8 @@ namespace clang {
 
       EmitBackendOutput(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts,
                         LangOpts, C.getTargetInfo().getDataLayout(),
-                        getModule(), Action, std::move(AsmOutStream));
+                        getModule(), Action, std::move(AsmOutStream),
+                        false /* SetLLVMOpts */);
 
       Ctx.setInlineAsmDiagnosticHandler(OldHandler, OldContext);
 
diff --git a/lib/CodeGen/CodeGenFunction.cpp b/lib/CodeGen/CodeGenFunction.cpp
index eafe266744..0f39081daf 100644
--- a/lib/CodeGen/CodeGenFunction.cpp
+++ b/lib/CodeGen/CodeGenFunction.cpp
@@ -13,6 +13,7 @@
 #include "CodeGenFunction.h"
 #include "CGBlocks.h"
 #include "CGCleanup.h"
+#include "CGAMPRuntime.h"
 #include "CGCUDARuntime.h"
 #include "CGCXXABI.h"
 #include "CGDebugInfo.h"
@@ -52,6 +53,10 @@ static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts,
   if (LangOpts.Sanitize.has(SanitizerKind::Memory))
     return false;
 
+  // Disable lifetime markers in HCC kernel build
+  if (LangOpts.CPlusPlusAMP && CGOpts.AMPIsDevice)
+    return false;
+
   // Asan uses markers for use-after-scope checks.
   if (CGOpts.SanitizeAddressUseAfterScope)
     return true;
@@ -659,7 +664,25 @@ void CodeGenFunction::StartFunction(GlobalDecl GD,
   FnRetTy = RetTy;
   CurFn = Fn;
   CurFnInfo = &FnInfo;
-  assert(CurFn->isDeclaration() && "Function already has body?");
+
+  // Relax the duplicated-function-definition check for C++AMP.
+  //
+  // In the modified GPU build path, both CPU and GPU code are emitted so
+  // that C++ name mangling for GPU kernels works correctly; the CPU code is
+  // removed in a later optimization pass.
+  //
+  // In the following case StartFunction() may therefore be called twice for
+  // foo(), so the assert has to be relaxed for C++AMP:
+  //
+  // int foo() restrict(amp) { return 1; }
+  // int foo() restrict(cpu) { return 2; }
+
+  if (!(getContext().getLangOpts().CPlusPlusAMP &&
+        (CGM.getCodeGenOpts().AMPIsDevice || CGM.getCodeGenOpts().AMPCPU)))
+    assert(CurFn->isDeclaration() && "Function already has body?");
 
   // If this function has been blacklisted for any of the enabled sanitizers,
   // disable the sanitizer for the function.
@@ -689,7 +712,11 @@ void CodeGenFunction::StartFunction(GlobalDecl GD,
       if (mask & SanitizerKind::KernelHWAddress)
         SanOpts.set(SanitizerKind::HWAddress, false);
     }
+
   }
+  // Device code has all sanitizers disabled for now.
+  if (CGM.getCodeGenOpts().AMPIsDevice)
+    SanOpts.clear();
 
   // Apply sanitizer attributes to the function.
   if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress))
@@ -768,6 +795,28 @@ void CodeGenFunction::StartFunction(GlobalDecl GD,
       EmitOpenCLKernelMetadata(FD, Fn);
   }
 
+  if (getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath) {
+    if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) {
+      if (FD->hasAttr<AnnotateAttr>() &&
+        FD->getAttr<AnnotateAttr>()->getAnnotation() ==
+          "__HIP_global_function__") {
+            Fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+            Fn->setDoesNotRecurse();
+            Fn->setDoesNotThrow();
+            Fn->setLinkage(llvm::Function::LinkageTypes::WeakODRLinkage);
+      }
+    }
+  }
+
+  if (getLangOpts().CPlusPlusAMP) {
+    if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) {
+      if (FD->hasAttr<AnnotateAttr>() &&
+          FD->getAttr<AnnotateAttr>()->getAnnotation() == "serialize") {
+        Fn->setLinkage(llvm::Function::LinkageTypes::WeakODRLinkage);
+      }
+    }
+  }
+
   // If we are checking function types, emit a function type signature as
   // prologue data.
   if (getLangOpts().CPlusPlus && SanOpts.has(SanitizerKind::Function)) {
@@ -1179,6 +1228,25 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
     EmitDestructorBody(Args);
   else if (isa<CXXConstructorDecl>(FD))
     EmitConstructorBody(Args);
+  else if (getContext().getLangOpts().CPlusPlusAMP &&
+           CGM.getCodeGenOpts().AMPIsDevice &&
+           FD->hasAttr<AnnotateAttr>() &&
+           FD->getAttr<AnnotateAttr>()->getAnnotation() == "__cxxamp_trampoline")
+    CGM.getAMPRuntime().EmitTrampolineBody(*this, FD, Args);
+  else if (getContext().getLangOpts().CPlusPlusAMP &&
+           (!CGM.getCodeGenOpts().AMPIsDevice || CGM.getCodeGenOpts().AMPCPU)&&
+           FD->hasAttr<AnnotateAttr>() &&
+           FD->getAttr<AnnotateAttr>()->getAnnotation() == "__cxxamp_trampoline_name")
+    CGM.getAMPRuntime().EmitTrampolineNameBody(*this, FD, Args);
+  else if (getContext().getLangOpts().CPlusPlusAMP &&
+           !getContext().getLangOpts().DevicePath &&
+           FD->hasAttr<AnnotateAttr>() &&
+           FD->getAttr<AnnotateAttr>()->getAnnotation() ==
+             "__HIP_global_function__") {
+    // We do not emit __global__ functions on the host path; we only want
+    // them to have a correct address, which we can use to obtain the mangled
+    // name from the ELF.
+  }
   else if (getLangOpts().CUDA &&
            !getLangOpts().CUDAIsDevice &&
            FD->hasAttr<CUDAGlobalAttr>())
@@ -1205,7 +1273,8 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
   // C11 6.9.1p12:
   //   If the '}' that terminates a function is reached, and the value of the
   //   function call is used by the caller, the behavior is undefined.
-  if (getLangOpts().CPlusPlus && !FD->hasImplicitReturnZero() && !SawAsmBlock &&
+  // Relax the rule for C++AMP
+  if (!getLangOpts().CPlusPlusAMP && getLangOpts().CPlusPlus &&
+      !FD->hasImplicitReturnZero() && !SawAsmBlock &&
       !FD->getReturnType()->isVoidType() && Builder.GetInsertBlock()) {
     bool ShouldEmitUnreachable =
         CGM.getCodeGenOpts().StrictReturn ||
@@ -1663,7 +1732,7 @@ CodeGenFunction::EmitNullInitialization(Address DestPtr, QualType Ty) {
                                NullConstant, Twine());
     CharUnits NullAlign = DestPtr.getAlignment();
     NullVariable->setAlignment(NullAlign.getQuantity());
-    Address SrcPtr(Builder.CreateBitCast(NullVariable, Builder.getInt8PtrTy()),
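+    // Address space 4 is presumably the AMDGPU constant address space here,
+    // where this read-only null-pattern global lives on the device path.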
+    Address SrcPtr(Builder.CreateBitCast(NullVariable, Builder.getInt8PtrTy(4)),
                    NullAlign);
 
     if (vla) return emitNonZeroVLAInit(*this, Ty, DestPtr, SrcPtr, SizeVal);
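
The StartFunction hunk above stamps HIP `__global__` functions with kernel-specific IR attributes. A detached sketch of the same `llvm::Function` surgery (the kernel name is illustrative):

```cpp
// Sketch only: the attribute/linkage combination applied to __global__
// functions on the HCC/HIP device path.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("kernels", Ctx);
  auto *FnTy = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), false);
  auto *Fn = llvm::Function::Create(FnTy, llvm::Function::ExternalLinkage,
                                    "my_kernel", M);
  Fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
  Fn->setDoesNotRecurse();
  Fn->setDoesNotThrow();
  // weak_odr lets several TUs carry the same kernel definition, matching the
  // relaxed duplicate-definition rules this patch introduces for C++AMP.
  Fn->setLinkage(llvm::Function::WeakODRLinkage);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", Fn));
  B.CreateRetVoid();
  M.print(llvm::outs(), nullptr); // define weak_odr amdgpu_kernel void ...
}
```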
diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h
index c3060d1fb3..676e7943df 100644
--- a/lib/CodeGen/CodeGenFunction.h
+++ b/lib/CodeGen/CodeGenFunction.h
@@ -583,7 +583,7 @@ public:
 
     /// An i8* variable into which the exception pointer to rethrow
     /// has been saved.
-    llvm::AllocaInst *SavedExnVar;
+    llvm::Instruction *SavedExnVar;
 
   public:
     void enter(CodeGenFunction &CGF, const Stmt *Finally,
@@ -2170,6 +2170,13 @@ public:
                             TBAAAccessInfo *TBAAInfo = nullptr);
   LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy);
 
+  /// Create an alloca instruction. If the target address space of automatic
+  /// variables for the current language does not match the alloca address
+  /// space, insert an addrspacecast instruction casting the alloca to the
+  /// expected address space.
+  llvm::Instruction *CreateAlloca(llvm::Type *Ty, const Twine &Name = "tmp",
+                                  llvm::Instruction *InsertPos = nullptr);
+
   /// CreateTempAlloca - This creates an alloca and inserts it into the entry
   /// block if \p ArraySize is nullptr, otherwise inserts it at the current
   /// insertion point of the builder. The caller is responsible for setting an
@@ -2202,6 +2209,11 @@ public:
                            const Twine &Name = "tmp",
                            llvm::Value *ArraySize = nullptr,
                            Address *Alloca = nullptr);
+
+  /// Get the alloca instruction operand of an addrspacecast instruction.
+  /// If \p Inst is itself an alloca instruction, returns \p Inst.
+  llvm::AllocaInst *getAddrSpaceCastedAlloca(llvm::Instruction *Inst) const;
+
   Address CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits align,
                                       const Twine &Name = "tmp",
                                       llvm::Value *ArraySize = nullptr);
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 1fd4e4cf8b..08cac60ec4 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeGenModule.h"
+#include "CGAMPRuntime.h"
 #include "CGBlocks.h"
 #include "CGCUDARuntime.h"
 #include "CGCXXABI.h"
@@ -48,6 +49,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
@@ -60,6 +62,8 @@
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/TimeProfiler.h"
 
+#include <unordered_map>
+
 using namespace clang;
 using namespace CodeGen;
 
@@ -135,6 +139,8 @@ CodeGenModule::CodeGenModule(ASTContext &C, const HeaderSearchOptions &HSO,
     createOpenMPRuntime();
   if (LangOpts.CUDA)
     createCUDARuntime();
+  if (LangOpts.CPlusPlusAMP)
+    createAMPRuntime();
 
   // Enable TBAA unless it's suppressed. ThreadSanitizer needs TBAA even at O0.
   if (LangOpts.Sanitize.has(SanitizerKind::Thread) ||
@@ -222,6 +228,10 @@ void CodeGenModule::createCUDARuntime() {
   CUDARuntime.reset(CreateNVCUDARuntime(*this));
 }
 
+void CodeGenModule::createAMPRuntime() {
+  AMPRuntime.reset(CreateAMPRuntime(*this));
+}
+
 void CodeGenModule::addReplacement(StringRef Name, llvm::Constant *C) {
   Replacements[Name] = C;
 }
@@ -431,7 +441,11 @@ void CodeGenModule::Release() {
   }
   EmitCtorList(GlobalCtors, "llvm.global_ctors");
   EmitCtorList(GlobalDtors, "llvm.global_dtors");
-  EmitGlobalAnnotations();
+  // Skip global annotations for the HCC kernel path.
+  if (!(Context.getLangOpts().CPlusPlusAMP && getCodeGenOpts().AMPIsDevice))
+    EmitGlobalAnnotations();
   EmitStaticExternCAliases();
   EmitDeferredUnusedCoverageMappings();
   if (CoverageMapping)
@@ -445,7 +459,10 @@ void CodeGenModule::Release() {
   if (SanStats)
     SanStats->finish();
 
+  // Disable linker.options for HIP device compilation. This is a workaround
+  // to get things going until https://reviews.llvm.org/D57829 is committed.
   if (CodeGenOpts.Autolink &&
+      !(Context.getLangOpts().CUDAIsDevice && Context.getLangOpts().HIP) &&
       (Context.getLangOpts().Modules || !LinkerOptionsMetadata.empty())) {
     EmitModuleLinkOptions();
   }
@@ -1163,7 +1180,7 @@ void CodeGenModule::EmitCtorList(CtorList &Fns, const char *GlobalName) {
     ctor.addInt(Int32Ty, I.Priority);
     ctor.add(llvm::ConstantExpr::getBitCast(I.Initializer, CtorPFTy));
     if (I.AssociatedData)
-      ctor.add(llvm::ConstantExpr::getBitCast(I.AssociatedData, VoidPtrTy));
+      ctor.add(llvm::ConstantExpr::getPointerCast(I.AssociatedData, VoidPtrTy));
     else
       ctor.addNullPointer(VoidPtrTy);
     ctor.finishAndAddTo(ctors);
@@ -1826,6 +1843,14 @@ void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F,
   else if (const auto *SA = FD->getAttr<SectionAttr>())
      F->setSection(SA->getName());
 
+  // Prevent barrier functions from being duplicated, and make C++AMP kernels
+  // carry the AMDGPU_KERNEL calling convention.
+  if (getLangOpts().OpenCL ||
+      (getLangOpts().CPlusPlusAMP && CodeGenOpts.AMPIsDevice)) {
+    if (FD->hasAttr<OpenCLKernelAttr>())
+      F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+  }
+
   if (FD->isReplaceableGlobalAllocationFunction()) {
     // A replaceable global allocation function does not act like a builtin by
     // default, only if it is invoked by a new-expression or delete-expression.
@@ -2199,10 +2224,13 @@ llvm::Constant *CodeGenModule::EmitAnnotateAttr(llvm::GlobalValue *GV,
                  *LineNoCst = EmitAnnotationLineNo(L);
 
   // Create the ConstantStruct for the global annotation.
+  unsigned AS = GV->getType()->getAddressSpace();
+  llvm::PointerType *I8PTy = (AS == Int8PtrTy->getAddressSpace()) ?
+    Int8PtrTy : Int8Ty->getPointerTo(AS);
   llvm::Constant *Fields[4] = {
-    llvm::ConstantExpr::getBitCast(GV, Int8PtrTy),
-    llvm::ConstantExpr::getBitCast(AnnoGV, Int8PtrTy),
-    llvm::ConstantExpr::getBitCast(UnitGV, Int8PtrTy),
+    llvm::ConstantExpr::getPointerCast(GV, I8PTy),
+    llvm::ConstantExpr::getPointerCast(AnnoGV, I8PTy),
+    llvm::ConstantExpr::getPointerCast(UnitGV, I8PTy),
     LineNoCst
   };
   return llvm::ConstantStruct::getAnon(Fields);
@@ -2370,7 +2398,7 @@ ConstantAddress CodeGenModule::GetWeakRefReference(const ValueDecl *VD) {
   llvm::GlobalValue *Entry = GetGlobalValue(AA->getAliasee());
   if (Entry) {
     unsigned AS = getContext().getTargetAddressSpace(VD->getType());
-    auto Ptr = llvm::ConstantExpr::getBitCast(Entry, DeclTy->getPointerTo(AS));
+    auto Ptr = llvm::ConstantExpr::getPointerCast(Entry, DeclTy->getPointerTo(AS));
     return ConstantAddress(Ptr, Alignment);
   }
 
@@ -2452,6 +2480,28 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
     }
   }
 
+  // If this is C++AMP, be selective about which declarations we emit.
+  if (LangOpts.CPlusPlusAMP && !CodeGenOpts.AMPCPU) {
+    if (CodeGenOpts.AMPIsDevice) {
+      // If the -famp-is-device switch is on, we are in the GPU build path.
+      // Since we emit both CPU and GPU code to keep the C++ name-mangling
+      // algorithm happy, we reject only declarations restricted to
+      // restrict(cpu); a later optimization pass removes all CPU code.
+      if (!Global->hasAttr<CXXAMPRestrictAMPAttr>() &&
+          Global->hasAttr<CXXAMPRestrictCPUAttr>())
+        return;
+    } else {
+      // In the host path:
+      // - file-scope global variables are emitted;
+      // - functions qualified with restrict(amp) or [[hc]], but not with
+      //   restrict(cpu) or [[cpu]], are not emitted.
+      if (!isa<VarDecl>(Global) &&
+          Global->hasAttr<CXXAMPRestrictAMPAttr>() &&
+          !Global->hasAttr<CXXAMPRestrictCPUAttr>())
+        return;
+    }
+  }
+
   // Ignore declarations, they will be emitted on their first use.
   if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
     // Forward declarations are emitted lazily on first use.
@@ -2704,6 +2754,95 @@ bool CodeGenModule::shouldOpportunisticallyEmitVTables() {
   return CodeGenOpts.OptimizationLevel > 0;
 }
 
+namespace
+{
+  class HCCompatible {
+    // TODO: this does not yet include the actual checking of function bodies.
+    std::unordered_map<const Decl*, bool> d_;
+
+    bool allowed_(const VarDecl* x)
+    {
+      if (!x) return true;
+      if (d_.count(x)) return d_[x];
+
+      bool r = true;
+
+      if (!x->hasAttr<HCCTileStaticAttr>() &&
+          (x->isStaticLocal() ||
+           x->hasExternalStorage() ||
+           x->hasGlobalStorage() ||
+           x->isExceptionVariable())) {
+        r = false;
+      }
+
+      d_[x] = r;
+
+      return r;
+    }
+
+    bool allowed_(const FunctionDecl* x)
+    {
+      if (!x) return true;
+      if (d_.count(x)) return d_[x];
+
+      bool r = true;
+
+      if (x->isVariadic()) r = false;
+      if (x->isPure() || x->isVirtualAsWritten()) r = false;
+
+      d_[x] = r;
+
+      return r;
+    }
+
+    bool allowed_(const CXXRecordDecl* x)
+    {
+      if (!x) return true;
+      if (d_.count(x)) return d_[x];
+
+      bool r = true;
+
+      if (x->isPolymorphic()) r = false;
+
+      d_[x] = r;
+
+      return r;
+    }
+  public:
+    bool operator()(const Decl* x)
+    {
+      if (!x || x->hasAttr<CXXAMPRestrictAMPAttr>()) return true;
+
+      if (d_.count(x)) return d_[x];
+      if (d_.count(x->getNonClosureContext()) &&
+          !d_[x->getNonClosureContext()]) {
+        d_[x] = false;
+        return false;
+      }
+
+      bool r = true;
+
+      if (isa<VarDecl>(x)) r = allowed_(cast<VarDecl>(x));
+      else if (isa<FunctionDecl>(x)) {
+        r = allowed_(cast<FunctionDecl>(x));
+      }
+      else if (isa<CXXRecordDecl>(x)) {
+        r = allowed_(cast<CXXRecordDecl>(x));
+      }
+
+      d_[x] = r;
+
+      return r;
+    }
+  };
+}
+
+static bool isWhiteListForHCC(CodeGenModule &CGM, GlobalDecl GD) {
+  static HCCompatible r;
+
+  return r(GD.getDecl());
+}
+
 void CodeGenModule::EmitMultiVersionFunctionDefinition(GlobalDecl GD,
                                                        llvm::GlobalValue *GV) {
   const auto *FD = cast<FunctionDecl>(GD.getDecl());
@@ -2720,7 +2859,20 @@ void CodeGenModule::EmitMultiVersionFunctionDefinition(GlobalDecl GD,
 void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
   const auto *D = cast<ValueDecl>(GD.getDecl());
 
-  PrettyStackTraceDecl CrashInfo(const_cast<ValueDecl *>(D), D->getLocation(),
+  // If this is C++AMP, be selective about which declarations we emit.
+  if (LangOpts.CPlusPlusAMP && !CodeGenOpts.AMPCPU) {
+    if (CodeGenOpts.AMPIsDevice) {
+      // If -famp-is-device switch is on, we are in GPU build path.
+      if (!isWhiteListForHCC(*this, GD)) return;
+    }
+    else if (!isa<VarDecl>(D) && D->hasAttr<CXXAMPRestrictAMPAttr>() &&
+             !D->hasAttr<CXXAMPRestrictCPUAttr>()) {
+      return;
+    }
+  }
+
+  PrettyStackTraceDecl CrashInfo(const_cast<ValueDecl *>(D), D->getLocation(), 
                                  Context.getSourceManager(),
                                  "Generating code for declaration");
 
@@ -2982,7 +3134,7 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(
 /// GetOrCreateLLVMFunction - If the specified mangled name is not in the
 /// module, create and return an llvm Function with the specified type. If there
 /// is something in the module with the specified name, return it potentially
-/// bitcasted to the right type.
+/// casted to the right type.
 ///
 /// If D is non-null, it specifies a decl that correspond to this.  This is used
 /// to set the attributes on the function when it is first created.
@@ -3035,9 +3187,12 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
       setDSOLocal(Entry);
     }
 
-    // If there are two attempts to define the same mangled name, issue an
-    // error.
-    if (IsForDefinition && !Entry->isDeclaration()) {
+    // Relax the rule for C++AMP
+    if (!LangOpts.CPlusPlusAMP) {
+
+     // If there are two attempts to define the same mangled name, issue an
+     // error.
+     if (IsForDefinition && !Entry->isDeclaration()) {
       GlobalDecl OtherGD;
       // Check that GD is not yet in DiagnosedConflictingDefinitions is required
       // to make sure that we issue an error only once.
@@ -3050,6 +3205,7 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
         getDiags().Report(OtherGD.getDecl()->getLocation(),
                           diag::note_previous_definition);
       }
+     }
     }
 
     if ((isa<llvm::Function>(Entry) || isa<llvm::GlobalAlias>(Entry)) &&
@@ -3061,7 +3217,7 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
     // (If function is requested for a definition, we always need to create a new
     // function, not just return a bitcast.)
     if (!IsForDefinition)
-      return llvm::ConstantExpr::getBitCast(Entry, Ty->getPointerTo());
+      return llvm::ConstantExpr::getPointerCast(Entry, Ty->getPointerTo());
   }
 
   // This function doesn't have a complete type (for example, the return
@@ -3166,7 +3322,7 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
   }
 
   llvm::Type *PTy = llvm::PointerType::getUnqual(Ty);
-  return llvm::ConstantExpr::getBitCast(F, PTy);
+  return llvm::ConstantExpr::getPointerCast(F, PTy);
 }
 
 /// GetAddrOfFunction - Return the address of the given function.  If Ty is
@@ -3295,7 +3451,7 @@ bool CodeGenModule::isTypeConstant(QualType Ty, bool ExcludeCtor) {
 /// GetOrCreateLLVMGlobal - If the specified mangled name is not in the module,
 /// create and return an llvm GlobalVariable with the specified type.  If there
 /// is something in the module with the specified name, return it potentially
-/// bitcasted to the right type.
+/// casted to the right type.
 ///
 /// If D is non-null, it specifies a decl that correspond to this.  This is used
 /// to set the attributes on the global when it is first created.
@@ -3346,14 +3502,10 @@ CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName,
       }
     }
 
-    // Make sure the result is of the correct type.
-    if (Entry->getType()->getAddressSpace() != Ty->getAddressSpace())
-      return llvm::ConstantExpr::getAddrSpaceCast(Entry, Ty);
-
     // (If global is requested for a definition, we always need to create a new
     // global, not just return a bitcast.)
     if (!IsForDefinition)
-      return llvm::ConstantExpr::getBitCast(Entry, Ty);
+      return llvm::ConstantExpr::getPointerCast(Entry, Ty);
   }
 
   auto AddrSpace = GetGlobalVarAddressSpace(D);
@@ -3371,7 +3523,7 @@ CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName,
 
     if (!Entry->use_empty()) {
       llvm::Constant *NewPtrForOldDecl =
-          llvm::ConstantExpr::getBitCast(GV, Entry->getType());
+          llvm::ConstantExpr::getPointerCast(GV, Entry->getType());
       Entry->replaceAllUsesWith(NewPtrForOldDecl);
     }
 
@@ -3537,7 +3689,7 @@ llvm::GlobalVariable *CodeGenModule::CreateOrReplaceCXXRuntimeVariable(
 
     if (!OldGV->use_empty()) {
       llvm::Constant *NewPtrForOldDecl =
-      llvm::ConstantExpr::getBitCast(GV, OldGV->getType());
+      llvm::ConstantExpr::getPointerCast(GV, OldGV->getType());
       OldGV->replaceAllUsesWith(NewPtrForOldDecl);
     }
 
@@ -3567,8 +3719,7 @@ llvm::Constant *CodeGenModule::GetAddrOfGlobalVar(const VarDecl *D,
   if (!Ty)
     Ty = getTypes().ConvertTypeForMem(ASTTy);
 
-  llvm::PointerType *PTy =
-    llvm::PointerType::get(Ty, getContext().getTargetAddressSpace(ASTTy));
+  llvm::PointerType *PTy = llvm::PointerType::get(Ty, getContext().getTargetAddressSpace(ASTTy));
 
   StringRef MangledName = getMangledName(D);
   return GetOrCreateLLVMGlobal(MangledName, PTy, D, IsForDefinition);
@@ -3641,6 +3792,10 @@ LangAS CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D) {
       return LangAS::cuda_device;
   }
 
+  if (LangOpts.CPlusPlusAMP && LangOpts.DevicePath &&
+      D && D->hasAttr<HCCTileStaticAttr>())
+    return LangAS::hcc_tilestatic;
+
   if (LangOpts.OpenMP) {
     LangAS AS;
     if (OpenMPRuntime->hasAllocateAttributeForGlobalVar(D, AS))
@@ -3795,6 +3950,9 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
   if (getLangOpts().CUDA &&
       (IsCUDASharedVar || IsCUDAShadowVar || IsHIPPinnedShadowVar))
     Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
+  else if (getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath &&
+           D->hasAttr<HCCTileStaticAttr>())
+    Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
   else if (!InitExpr) {
     // This is a tentative definition; tentative definitions are
     // implicitly initialized with { 0 }.
@@ -3871,7 +4029,7 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
 
     // Replace all uses of the old global with the new global
     llvm::Constant *NewPtrForOldDecl =
-        llvm::ConstantExpr::getBitCast(GV, Entry->getType());
+        llvm::ConstantExpr::getPointerCast(GV, Entry->getType());
     Entry->replaceAllUsesWith(NewPtrForOldDecl);
 
     // Erase the old global, since it is no longer used.
@@ -4298,9 +4456,17 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
                                                    /*DontDefer=*/true,
                                                    ForDefinition));
 
-  // Already emitted.
-  if (!GV->isDeclaration())
-    return;
+  if (D->hasAttr<CXXAMPRestrictAMPAttr>()) {
+    cast<llvm::Function>(GV)->addFnAttr("HC");
+  }
+
+  // Relax the rule for C++AMP
+  if (!LangOpts.CPlusPlusAMP) {
+    // Already emitted.
+    if (!GV->isDeclaration()) {
+      return;
+    }
+  }
 
   // We need to set linkage and visibility on the function before
   // generating code for it because various parts of IR generation
@@ -4389,7 +4555,7 @@ void CodeGenModule::EmitAliasDefinition(GlobalDecl GD) {
     // Remove it and replace uses of it with the alias.
     GA->takeName(Entry);
 
-    Entry->replaceAllUsesWith(llvm::ConstantExpr::getBitCast(GA,
+    Entry->replaceAllUsesWith(llvm::ConstantExpr::getPointerCast(GA,
                                                           Entry->getType()));
     Entry->eraseFromParent();
   } else {
@@ -4658,7 +4824,7 @@ CodeGenModule::GetAddrOfConstantCFString(const StringLiteral *Literal) {
 
   if (isUTF16)
     // Cast the UTF16 string to the correct type.
-    Str = llvm::ConstantExpr::getBitCast(Str, Int8PtrTy);
+    Str = llvm::ConstantExpr::getPointerCast(Str, Int8PtrTy);
   Fields.add(Str);
 
   // String length.
@@ -5114,6 +5280,9 @@ void CodeGenModule::EmitDeclContext(const DeclContext *DC) {
 
 /// EmitTopLevelDecl - Emit code for a single top level declaration.
 void CodeGenModule::EmitTopLevelDecl(Decl *D) {
+  if (getenv("DBG_CG_DECL")) {
+    llvm::errs() << "decl: "; D->dump();
+  }
   // Ignore dependent declarations.
   if (D->isTemplated())
     return;
diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h
index 95964afed4..f5adcf9d81 100644
--- a/lib/CodeGen/CodeGenModule.h
+++ b/lib/CodeGen/CodeGenModule.h
@@ -89,6 +89,7 @@ class CGObjCRuntime;
 class CGOpenCLRuntime;
 class CGOpenMPRuntime;
 class CGCUDARuntime;
+class CGAMPRuntime;
 class BlockFieldFlags;
 class FunctionArgList;
 class CoverageMappingModuleGen;
@@ -320,6 +321,7 @@ private:
   std::unique_ptr<CGOpenCLRuntime> OpenCLRuntime;
   std::unique_ptr<CGOpenMPRuntime> OpenMPRuntime;
   std::unique_ptr<CGCUDARuntime> CUDARuntime;
+  std::unique_ptr<CGAMPRuntime> AMPRuntime;
   std::unique_ptr<CGDebugInfo> DebugInfo;
   std::unique_ptr<ObjCEntrypoints> ObjCData;
   llvm::MDNode *NoObjCARCExceptionsMetadata = nullptr;
@@ -495,6 +497,7 @@ private:
   void createOpenCLRuntime();
   void createOpenMPRuntime();
   void createCUDARuntime();
+  void createAMPRuntime();
 
   bool isTriviallyRecursive(const FunctionDecl *F);
   bool shouldEmitFunction(GlobalDecl GD);
@@ -591,6 +594,12 @@ public:
     return *CUDARuntime;
   }
 
+  /// Return a reference to the configured C++AMP runtime.
+  CGAMPRuntime &getAMPRuntime() {
+    assert(AMPRuntime != nullptr);
+    return *AMPRuntime;
+  }
+
   ObjCEntrypoints &getObjCEntrypoints() const {
     assert(ObjCData != nullptr);
     return *ObjCData;
diff --git a/lib/CodeGen/CodeGenTypes.cpp b/lib/CodeGen/CodeGenTypes.cpp
index 79b29b3d91..a2f09f90ec 100644
--- a/lib/CodeGen/CodeGenTypes.cpp
+++ b/lib/CodeGen/CodeGenTypes.cpp
@@ -95,6 +95,11 @@ llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) {
                                 (unsigned)Context.getTypeSize(T));
 }
 
+llvm::PointerType *CodeGenTypes::getVariableType(const VarDecl &D) {
+  auto Ty = D.getType();
+  return ConvertTypeForMem(Ty)->getPointerTo(
+      getContext().getTargetAddressSpace(Ty));
+}
 
 /// isRecordLayoutComplete - Return true if the specified type is already
 /// completely laid out.
@@ -308,7 +313,7 @@ static llvm::Type *getTypeForFormat(llvm::LLVMContext &VMContext,
   llvm_unreachable("Unknown float format!");
 }
 
-llvm::Type *CodeGenTypes::ConvertFunctionTypeInternal(QualType QFT) {
+llvm::Type *CodeGenTypes::ConvertFunctionTypeInternal(QualType QFT, const FunctionDecl *FD) {
   assert(QFT.isCanonical());
   const Type *Ty = QFT.getTypePtr();
   const FunctionType *FT = cast<FunctionType>(QFT.getTypePtr());
@@ -346,7 +351,7 @@ llvm::Type *CodeGenTypes::ConvertFunctionTypeInternal(QualType QFT) {
   const CGFunctionInfo *FI;
   if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(FT)) {
     FI = &arrangeFreeFunctionType(
-        CanQual<FunctionProtoType>::CreateUnsafe(QualType(FPT, 0)));
+        CanQual<FunctionProtoType>::CreateUnsafe(QualType(FPT, 0)), FD);
   } else {
     const FunctionNoProtoType *FNPT = cast<FunctionNoProtoType>(FT);
     FI = &arrangeFreeFunctionType(
@@ -377,6 +382,14 @@ llvm::Type *CodeGenTypes::ConvertFunctionTypeInternal(QualType QFT) {
   return ResultType;
 }
 
+llvm::PointerType *CodeGenTypes::getPointerTypeTo(QualType T) {
+  return ConvertType(T)->getPointerTo(Context.getTargetAddressSpace(T));
+}
+
+llvm::PointerType *CodeGenTypes::getDefaultPointerTo(llvm::Type *T) {
+  return T->getPointerTo();
+}
+
 /// ConvertType - Convert the specified type to its LLVM form.
 llvm::Type *CodeGenTypes::ConvertType(QualType T) {
   T = Context.getCanonicalType(T);
diff --git a/lib/CodeGen/CodeGenTypes.h b/lib/CodeGen/CodeGenTypes.h
index 0310232950..7ef45d4fb6 100644
--- a/lib/CodeGen/CodeGenTypes.h
+++ b/lib/CodeGen/CodeGenTypes.h
@@ -104,7 +104,7 @@ class CodeGenTypes {
   llvm::SmallSet<const Type *, 8> RecordsWithOpaqueMemberPointers;
 
   /// Helper for ConvertType.
-  llvm::Type *ConvertFunctionTypeInternal(QualType FT);
+  llvm::Type *ConvertFunctionTypeInternal(QualType FT,
+                                          const FunctionDecl *FD = nullptr);
 
 public:
   CodeGenTypes(CodeGenModule &cgm);
@@ -130,12 +130,30 @@ public:
   /// ConvertType - Convert type T into a llvm::Type.
   llvm::Type *ConvertType(QualType T);
 
+  /// Get a pointer type pointing to the given QualType \p T.
+  llvm::PointerType *getPointerTypeTo(QualType T = QualType());
+
+  /// Get a pointer type pointing to the given llvm::Type \p T in the default
+  /// target address space.
+  llvm::PointerType *getDefaultPointerTo(llvm::Type *T);
+
+  /// Converts the GlobalDecl into an llvm::Type. This should be used
+  /// when we know the target of the function we want to convert.  This is
+  /// because some functions (explicitly, those with pass_object_size
+  /// parameters) may not have the same signature as their type portrays, and
+  /// can only be called directly.
+  llvm::Type *ConvertFunctionType(QualType FT,
+                                  const FunctionDecl *FD = nullptr);
+
   /// ConvertTypeForMem - Convert type T into a llvm::Type.  This differs from
   /// ConvertType in that it is used to convert to the memory representation for
   /// a type.  For example, the scalar representation for _Bool is i1, but the
   /// memory representation is usually i8 or i32, depending on the target.
   llvm::Type *ConvertTypeForMem(QualType T);
 
+  /// Get the LLVM pointer type of a variable.
+  llvm::PointerType *getVariableType(const VarDecl &D);
+
   /// GetFunctionType - Get the LLVM function type for \arg Info.
   llvm::FunctionType *GetFunctionType(const CGFunctionInfo &Info);
 
@@ -202,7 +220,7 @@ public:
   const CGFunctionInfo &arrangeFreeFunctionCall(const CallArgList &Args,
                                                 const FunctionType *Ty,
                                                 bool ChainCall);
-  const CGFunctionInfo &arrangeFreeFunctionType(CanQual<FunctionProtoType> Ty);
+  const CGFunctionInfo &arrangeFreeFunctionType(CanQual<FunctionProtoType> Ty, const FunctionDecl *FD);
   const CGFunctionInfo &arrangeFreeFunctionType(CanQual<FunctionNoProtoType> Ty);
 
   /// A nullary function is a freestanding function of type 'void ()'.
diff --git a/lib/CodeGen/ItaniumCXXABI.cpp b/lib/CodeGen/ItaniumCXXABI.cpp
index 51a2561a45..7a108212b6 100644
--- a/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/lib/CodeGen/ItaniumCXXABI.cpp
@@ -1309,7 +1309,8 @@ llvm::Value *ItaniumCXXABI::EmitTypeid(CodeGenFunction &CGF,
   auto *ClassDecl =
       cast<CXXRecordDecl>(SrcRecordTy->getAs<RecordType>()->getDecl());
   llvm::Value *Value =
-      CGF.GetVTablePtr(ThisPtr, StdTypeInfoPtrTy->getPointerTo(), ClassDecl);
+      CGF.GetVTablePtr(ThisPtr, CGF.getTypes().getDefaultPointerTo(
+          StdTypeInfoPtrTy), ClassDecl);
 
   // Load the type info.
   Value = CGF.Builder.CreateConstInBoundsGEP1_64(Value, -1ULL);
@@ -2799,7 +2800,7 @@ ItaniumRTTIBuilder::GetAddrOfExternalRTTIDescriptor(QualType Ty) {
     CGM.setGVProperties(GV, RD);
   }
 
-  return llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy);
+  return llvm::ConstantExpr::getPointerCast(GV, CGM.Int8PtrTy);
 }
 
 /// TypeInfoIsInStandardLibrary - Given a builtin type, returns whether the type
@@ -3165,7 +3166,7 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) {
   llvm::Constant *Two = llvm::ConstantInt::get(PtrDiffTy, 2);
   VTable =
       llvm::ConstantExpr::getInBoundsGetElementPtr(CGM.Int8PtrTy, VTable, Two);
-  VTable = llvm::ConstantExpr::getBitCast(VTable, CGM.Int8PtrTy);
+  VTable = llvm::ConstantExpr::getPointerCast(VTable, CGM.Int8PtrTy);
 
   Fields.push_back(VTable);
 }
@@ -3238,7 +3239,7 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(QualType Ty) {
     assert(!OldGV->hasAvailableExternallyLinkage() &&
            "available_externally typeinfos not yet implemented");
 
-    return llvm::ConstantExpr::getBitCast(OldGV, CGM.Int8PtrTy);
+    return llvm::ConstantExpr::getPointerCast(OldGV, CGM.Int8PtrTy);
   }
 
   // Check if there is already an external RTTI descriptor for this type.
@@ -3298,7 +3299,7 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
     TypeNameField =
         llvm::ConstantExpr::getIntToPtr(TypeNameField, CGM.Int8PtrTy);
   } else {
-    TypeNameField = llvm::ConstantExpr::getBitCast(TypeName, CGM.Int8PtrTy);
+    TypeNameField = llvm::ConstantExpr::getPointerCast(TypeName, CGM.Int8PtrTy);
   }
   Fields.push_back(TypeNameField);
 
@@ -3808,8 +3809,7 @@ static StructorCodegen getCodegenToUse(CodeGenModule &CGM,
 
   if (llvm::GlobalValue::isWeakForLinker(Linkage)) {
     // Only ELF and wasm support COMDATs with arbitrary names (C5/D5).
-    if (CGM.getTarget().getTriple().isOSBinFormatELF() ||
-        CGM.getTarget().getTriple().isOSBinFormatWasm())
+    if (CGM.supportsCOMDAT())
       return StructorCodegen::COMDAT;
     return StructorCodegen::Emit;
   }
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index 1e1038dbfe..a8ad224c7c 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -7653,6 +7653,8 @@ public:
   ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;
 
   void computeInfo(CGFunctionInfo &FI) const override;
+  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                    QualType Ty) const override;
 };
 
 bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
@@ -7716,6 +7718,11 @@ void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
   }
 }
 
+Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                                 QualType Ty) const {
+  llvm_unreachable("AMDGPU does not support varargs");
+}
+
 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
   if (isAggregateTypeForABI(RetTy)) {
     // Records with non-trivial destructors/copy-constructors should not be
@@ -7780,7 +7787,8 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
 
   Ty = useFirstFieldIfTransparentUnion(Ty);
 
-  if (isAggregateTypeForABI(Ty)) {
+  if (isAggregateTypeForABI(Ty)
+      && !getContext().getLangOpts().CPlusPlusAMP) {
     // Records with non-trivial destructors/copy-constructors should not be
     // passed by value.
     if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
@@ -7888,6 +7896,18 @@ static bool requiresAMDGPUDefaultVisibility(const Decl *D,
   return isa<VarDecl>(D) && D->hasAttr<HIPPinnedShadowAttr>();
 }
 
+namespace {
+inline llvm::APSInt getConstexprInt(const Expr *E, const ASTContext &Ctx) {
+  clang::Expr::EvalResult r;
+  APValue Val(llvm::APSInt(32));
+  r.Val = Val;
+  if (E)
+    E->EvaluateAsInt(r, Ctx);
+
+  return r.Val.getInt();
+}
+} // namespace
+
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
   if (requiresAMDGPUDefaultVisibility(D, GV)) {
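The getConstexprInt helper lets the SGPR/VGPR attributes carry any integral constant expression rather than a bare literal: the attribute arguments are now stored as expressions and folded at CodeGen time. A sketch of the source-level effect (the kernel itself and its parameters are illustrative):

```cpp
// Illustrative HCC source: both attribute operands below are integral
// constant expressions, which getConstexprInt folds when emitting the
// amdgpu-num-vgpr / amdgpu-num-sgpr function attributes.
constexpr unsigned WavesPerSIMD = 4;

__attribute__((amdgpu_num_vgpr(256 / WavesPerSIMD)))  // folds to 64
__attribute__((amdgpu_num_sgpr(2 * 50)))              // folds to 100
void saxpy(float a, const float *x, float *y, int n) {
  // ... kernel body elided ...
}
```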
@@ -7904,6 +7924,7 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
   if (!FD)
     return;
 
+  auto GPU = M.getTarget().getTargetOpts().CPU;
   llvm::Function *F = cast<llvm::Function>(GV);
 
   const auto *ReqdWGS = M.getLangOpts().OpenCL ?
@@ -7916,6 +7937,7 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
 
   const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
   if (ReqdWGS || FlatWGS) {
+
     unsigned Min = 0;
     unsigned Max = 0;
     if (FlatWGS) {
@@ -7926,13 +7948,16 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
                 ->EvaluateKnownConstInt(M.getContext())
                 .getExtValue();
     }
+
     if (ReqdWGS && Min == 0 && Max == 0)
       Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
 
     if (Min != 0) {
-      assert(Min <= Max && "Min must be less than or equal Max");
-
-      std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
+      std::string AttrVal = llvm::utostr(Min);
+      if (Max != 0) {
+        assert(Min <= Max && "Min must be less than or equal to Max");
+        AttrVal = AttrVal + "," + llvm::utostr(Max);
+      }
       F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
     } else
       assert(Max == 0 && "Max must be zero");
@@ -7952,24 +7977,50 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
       std::string AttrVal = llvm::utostr(Min);
       if (Max != 0)
         AttrVal = AttrVal + "," + llvm::utostr(Max);
+
       F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
     } else
       assert(Max == 0 && "Max must be zero");
   }
 
   if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
-    unsigned NumSGPR = Attr->getNumSGPR();
+    llvm::APSInt sgprs =
+      getConstexprInt(Attr->getNumSGPR(), FD->getASTContext());
+    unsigned NumSGPR = sgprs.getZExtValue();
 
     if (NumSGPR != 0)
       F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
   }
 
   if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
-    uint32_t NumVGPR = Attr->getNumVGPR();
+    llvm::APSInt vgprs =
+      getConstexprInt(Attr->getNumVGPR(), FD->getASTContext());
+    unsigned NumVGPR = vgprs.getZExtValue();
 
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxWorkGroupDimAttr>()) {
+    llvm::APSInt x = getConstexprInt(Attr->getX(), FD->getASTContext());
+    llvm::APSInt y = getConstexprInt(Attr->getY(), FD->getASTContext());
+    llvm::APSInt z = getConstexprInt(Attr->getZ(), FD->getASTContext());
+
+    unsigned X = x.getZExtValue();
+    unsigned Y = y.getZExtValue();
+    unsigned Z = z.getZExtValue();
+    std::string AttrVal = llvm::utostr(X) + "," + llvm::utostr(Y) + "," +
+        llvm::utostr(Z);
+    F->addFnAttr("amdgpu-max-work-group-dim", AttrVal);
+    if (FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>() == nullptr) {
+      uint64_t MaxFlat = (uint64_t)X * Y * Z;
+      if (MaxFlat > UINT_MAX)
+        MaxFlat = UINT_MAX;
+      AttrVal = std::string("1,") + llvm::utostr((unsigned)MaxFlat);
+      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
+    }
+  }
+
 }
 
 unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
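The new AMDGPUMaxWorkGroupDimAttr handling emits the per-axis limits verbatim and, when no explicit flat-work-group-size attribute is present, derives a flat bound from the product of the three axes, clamped to UINT_MAX. A worked example, assuming a GNU spelling of `amdgpu_max_work_group_dim` for the attribute:

```cpp
// Assumed spelling for AMDGPUMaxWorkGroupDimAttr; X=16, Y=8, Z=4.
__attribute__((amdgpu_max_work_group_dim(16, 8, 4)))
void tiled_kernel() {
  // Emitted LLVM function attributes:
  //   "amdgpu-max-work-group-dim"="16,8,4"
  //   "amdgpu-flat-work-group-size"="1,512"   // 16 * 8 * 4 = 512
}
```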
@@ -7990,7 +8041,7 @@ llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
   auto &Ctx = CGM.getContext();
   auto NPT = llvm::PointerType::get(PT->getElementType(),
       Ctx.getTargetAddressSpace(LangAS::opencl_generic));
-  return llvm::ConstantExpr::getAddrSpaceCast(
+  return llvm::ConstantExpr::getPointerCast(
       llvm::ConstantPointerNull::get(NPT), PT);
 }
 
@@ -8037,7 +8088,7 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
     Name = "wavefront";
   }
 
-  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
+  if(Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
     if (!Name.empty())
       Name = Twine(Twine(Name) + Twine("-")).str();
 
@@ -9640,7 +9691,7 @@ public:
 //===----------------------------------------------------------------------===//
 
 bool CodeGenModule::supportsCOMDAT() const {
-  return getTriple().supportsCOMDAT();
+  return (!getLangOpts().CPlusPlusAMP && getTriple().supportsCOMDAT());
 }
 
 const TargetCodeGenInfo &CodeGenModule::getTargetCodeGenInfo() {
diff --git a/lib/Driver/Action.cpp b/lib/Driver/Action.cpp
index 47b03f6643..5f1bb45837 100644
--- a/lib/Driver/Action.cpp
+++ b/lib/Driver/Action.cpp
@@ -96,6 +96,8 @@ std::string Action::getOffloadingKindPrefix() const {
     return "device-cuda";
   case OFK_OpenMP:
     return "device-openmp";
+  case OFK_HCC:
+    return "device-hcc";
   case OFK_HIP:
     return "device-hip";
 
@@ -116,6 +118,9 @@ std::string Action::getOffloadingKindPrefix() const {
   if (ActiveOffloadKindMask & OFK_OpenMP)
     Res += "-openmp";
 
+  if (ActiveOffloadKindMask & OFK_HCC)
+    Res += "-hcc";
+
   // TODO: Add other programming models here.
 
   return Res;
@@ -149,6 +154,8 @@ StringRef Action::GetOffloadKindName(OffloadKind Kind) {
     return "cuda";
   case OFK_OpenMP:
     return "openmp";
+  case OFK_HCC:
+    return "hcc";
   case OFK_HIP:
     return "hip";
 
@@ -158,6 +165,23 @@ StringRef Action::GetOffloadKindName(OffloadKind Kind) {
   llvm_unreachable("invalid offload kind");
 }
 
+bool Action::ContainsActions(ActionClass kind,
+                             types::ID typesID,
+                             bool singleInputActionsOnly,
+                             bool startsWithActionKind) const {
+  if (startsWithActionKind && getKind() != kind)
+    return false;
+  if (singleInputActionsOnly && size() != 1)
+    return false;
+  if (getType() == typesID)
+    return true;
+  for (const Action *A : inputs()) {
+    if (A->ContainsActions(kind, typesID, singleInputActionsOnly, false))
+      return true;
+  }
+  return false;
+}
+
 void InputAction::anchor() {}
 
 InputAction::InputAction(const Arg &_Input, types::ID _Type)
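ContainsActions is the driver's new way to ask whether an action graph contains a node producing a given type. A usage sketch matching the call sites added later in Driver.cpp (the defaults for the two trailing flags are declared in Action.h, which this excerpt does not show):

```cpp
// Does this job wrap an assemble step whose graph produces an HC kernel?
// Mirrors the two-argument call sites in Driver.cpp.
bool HasHCKernel =
    JA.ContainsActions(Action::AssembleJobClass, types::TY_HC_KERNEL);

// With the flags spelled out: require the root action to match the kind,
// and restrict the walk to single-input chains.
bool Strict =
    JA.ContainsActions(Action::AssembleJobClass, types::TY_HC_KERNEL,
                       /*singleInputActionsOnly=*/true,
                       /*startsWithActionKind=*/true);
```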
diff --git a/lib/Driver/CMakeLists.txt b/lib/Driver/CMakeLists.txt
index d90c0ff436..183a76c4a0 100644
--- a/lib/Driver/CMakeLists.txt
+++ b/lib/Driver/CMakeLists.txt
@@ -46,6 +46,7 @@ add_clang_library(clangDriver
   ToolChains/Fuchsia.cpp
   ToolChains/Gnu.cpp
   ToolChains/Haiku.cpp
+  ToolChains/Hcc.cpp
   ToolChains/HIP.cpp
   ToolChains/Hexagon.cpp
   ToolChains/Hurd.cpp
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp
index a9a273529b..3a7d7130e5 100644
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@@ -24,6 +24,7 @@
 #include "ToolChains/Gnu.h"
 #include "ToolChains/HIP.h"
 #include "ToolChains/Haiku.h"
+#include "ToolChains/Hcc.h"
 #include "ToolChains/Hexagon.h"
 #include "ToolChains/Hurd.h"
 #include "ToolChains/Lanai.h"
@@ -408,6 +409,24 @@ DerivedArgList *Driver::TranslateInputArgs(const InputArgList &Args) const {
   }
 #endif
 
+  // Add extra flags -hc should imply.
+  if (Args.hasArg(options::OPT_hc_mode)) {
+    DAL->AddFlagArg(0, Opts->getOption(options::OPT_famp));
+    DAL->AddPositionalArg(0, Opts->getOption(options::OPT_Xclang), "-famp");
+    DAL->AddPositionalArg(0, Opts->getOption(options::OPT_Xclang), "-fhsa-ext");
+
+    // We need at least C++11 or C++AMP. If we're not given an explicit C++
+    // standard, add one because the default is too old.
+    if (!Args.hasArg(options::OPT_std_EQ)) {
+      DAL->AddPositionalArg(0, Opts->getOption(options::OPT_std_EQ), "c++amp");
+    }
+    if (Args.hasArg(options::OPT_hc_function_calls)) {
+      DAL->AddFlagArg(nullptr, Opts->getOption(options::OPT_hc_function_calls));
+    }
+  } else if (Args.hasArg(options::OPT_famp)) {
+    DAL->AddPositionalArg(0, Opts->getOption(options::OPT_Xclang), "-famp");
+  }
+
   return DAL;
 }
 
@@ -714,6 +733,25 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
           << OpenMPTargets->getAsString(C.getInputArgs());
   }
 
+  //
+  // HCC
+  //
+  // Initialize HCC device TC if we have HCC inputs.
+  if (llvm::any_of(Inputs, [](const std::pair<types::ID, const Arg *> &I) {
+        return types::isHCC(I.first);
+      })) {
+
+    const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+    llvm::Triple HccTriple("amdgcn--amdhsa-hcc");
+    auto &HccTC = ToolChains[HccTriple.str()];
+    if (!HccTC)
+      HccTC = llvm::make_unique<toolchains::HCCToolChain>(*this, HccTriple, *HostTC, C.getInputArgs());
+      
+    const ToolChain *TC = HccTC.get();
+
+    C.addOffloadDeviceToolChain(TC, Action::OFK_HCC);
+  }
+
   //
   // TODO: Add support for other offloading programming models here.
   //
@@ -2157,9 +2195,50 @@ void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args,
         }
       }
 
-      if (DiagnoseInputExistence(Args, Value, Ty, /*TypoCorrect=*/true))
-        Inputs.push_back(std::make_pair(Ty, A));
-
+      if (DiagnoseInputExistence(Args, Value, Ty, /*TypoCorrect=*/true)) {
+        // C++ AMP-specific
+        // For C++ source files, duplicate the input so the compiler is
+        // launched twice: once for GPU (TY_CXX_AMP), once for CPU (TY_CXX).
+        if (Ty == types::TY_CXX && (Args.hasArg(options::OPT_famp) ||
+          Args.getLastArgValue(options::OPT_std_EQ).equals("c++amp"))) {
+          Arg *FinalPhaseArg;
+          phases::ID FinalPhase = getFinalPhase(Args, &FinalPhaseArg);
+          switch (FinalPhase) {
+            // -E
+            case phases::Preprocess:
+            // -c
+            case phases::Assemble:
+            // build executable
+            case phases::Link:
+              if (Args.hasArg(options::OPT_cxxamp_cpu_mode))
+                  Inputs.push_back(std::make_pair(types::TY_CXX_AMP_CPU, A));
+              if (Args.hasArg(options::OPT_hc_mode)) {
+                Inputs.push_back(std::make_pair(types::TY_HC_HOST, A));
+                Inputs.push_back(std::make_pair(types::TY_HC_KERNEL, A));
+              } else {
+                Inputs.push_back(std::make_pair(Ty, A));
+                Inputs.push_back(std::make_pair(types::TY_CXX_AMP, A));
+              }
+            break;
+            // -S
+            case phases::Backend:
+              if (Args.hasArg(options::OPT_cxxamp_kernel_mode)) {
+                Inputs.push_back(std::make_pair(types::TY_CXX_AMP, A));
+              } else if (Args.hasArg(options::OPT_cxxamp_cpu_mode)) {
+                  Inputs.push_back(std::make_pair(types::TY_CXX_AMP_CPU, A));
+              } else {
+                Inputs.push_back(std::make_pair(Ty, A));
+              }
+            break;
+            default:
+              Inputs.push_back(std::make_pair(Ty, A));
+            break;
+          }
+        } else {
+          // Standard compilation flow
+          Inputs.push_back(std::make_pair(Ty, A));
+        }
+      }
     } else if (A->getOption().matches(options::OPT__SLASH_Tc)) {
       StringRef Value = A->getValue();
       if (DiagnoseInputExistence(Args, Value, types::TY_C,
@@ -3526,8 +3605,12 @@ void Driver::BuildJobs(Compilation &C) const {
         ++NumOutputs;
 
     if (NumOutputs > 1) {
-      Diag(clang::diag::err_drv_output_argument_with_multiple_files);
-      FinalOutput = nullptr;
+      // Relax the rule for C++AMP: a single input may produce multiple outputs.
+      if (!C.getArgs().hasArg(options::OPT_famp) &&
+        !C.getArgs().getLastArgValue(options::OPT_std_EQ).equals("c++amp")) {
+        Diag(clang::diag::err_drv_output_argument_with_multiple_files);
+        FinalOutput = nullptr;
+      }
     }
   }
 
@@ -3556,11 +3639,11 @@ void Driver::BuildJobs(Compilation &C) const {
     }
 
     BuildJobsForAction(C, A, &C.getDefaultToolChain(),
-                       /*BoundArch*/ StringRef(),
-                       /*AtTopLevel*/ true,
-                       /*MultipleArchs*/ ArchNames.size() > 1,
-                       /*LinkingOutput*/ LinkingOutput, CachedResults,
-                       /*TargetDeviceOffloadKind*/ Action::OFK_None);
+                      /*BoundArch*/ StringRef(),
+                      /*AtTopLevel*/ true,
+                      /*MultipleArchs*/ ArchNames.size() > 1,
+                      /*LinkingOutput*/ LinkingOutput, CachedResults,
+                      /*TargetDeviceOffloadKind*/ Action::OFK_None);
   }
 
   // If the user passed -Qunused-arguments or there were errors, don't warn
@@ -3601,6 +3684,14 @@ void Driver::BuildJobs(Compilation &C) const {
           continue;
       }
 
+      // Suppress the warning if this is -Xclang -fhsa-ext
+      if (Opt.getKind() == Option::SeparateClass) {
+        if (Opt.getName() == "Xclang" &&
+            A->containsValue("-fhsa-ext")) {
+          continue;
+        }
+      }
+
       // In clang-cl, don't mention unknown arguments here since they have
       // already been warned about.
       if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN))
@@ -3870,6 +3961,17 @@ public:
   /// dropping them. If no suitable tool is found, null will be returned.
   const Tool *getTool(ActionList &Inputs,
                       ActionList &CollapsedOffloadAction) {
+
+    if (BaseAction->ContainsActions(Action::AssembleJobClass, types::TY_HC_HOST) ||
+        BaseAction->ContainsActions(Action::AssembleJobClass, types::TY_HC_KERNEL) ||
+        BaseAction->ContainsActions(Action::AssembleJobClass, types::TY_PP_CXX_AMP) ||
+        BaseAction->ContainsActions(Action::AssembleJobClass, types::TY_PP_CXX_AMP_CPU)) {
+      const ToolChain *DeviceTC = C.getSingleOffloadToolChain<Action::OFK_HCC>();
+      assert(DeviceTC && "HCC Device ToolChain is not set.");
+      Inputs = BaseAction->getInputs();
+      return DeviceTC->SelectTool(*BaseAction);
+    }
+
     //
     // Get the largest chain of actions that we could combine.
     //
@@ -4081,9 +4183,18 @@ InputInfo Driver::BuildJobsForActionNoCache(
     // FIXME: Clean this up.
     bool SubJobAtTopLevel =
         AtTopLevel && (isa<DsymutilJobAction>(A) || isa<VerifyJobAction>(A));
+    // UPGRADE_TBD: Find a better way to check HCC-specific Action objects
+    // Find correct Tool for HCC-specific Actions in HCC ToolChain
+    bool IsHccTC =
+      JA->ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP) ||
+      JA->ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP_CPU) ||
+      JA->ContainsActions(Action::AssembleJobClass, types::TY_HC_KERNEL) ||
+      JA->ContainsActions(Action::AssembleJobClass, types::TY_PP_CXX_AMP) ||
+      JA->ContainsActions(Action::AssembleJobClass, types::TY_PP_CXX_AMP_CPU);
     InputInfos.push_back(BuildJobsForAction(
-        C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput,
-        CachedResults, A->getOffloadingDeviceKind()));
+      C, Input, IsHccTC ? C.getSingleOffloadToolChain<Action::OFK_HCC>() : TC,
+      BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput, CachedResults,
+      A->getOffloadingDeviceKind()));
   }
 
   // Always use the first input as the base input.
@@ -4321,6 +4432,9 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
     } else {
       TmpName = GetTemporaryPath(Split.first, Suffix);
     }
+    if (JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP_CPU) ||
+        JA.ContainsActions(Action::AssembleJobClass, types::TY_PP_CXX_AMP_CPU))
+      TmpName += ".cpu";
     return C.addTempFile(C.getArgs().MakeArgString(TmpName));
   }
 
@@ -4541,7 +4655,7 @@ std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const {
 
 std::string Driver::GetTemporaryPath(StringRef Prefix, StringRef Suffix) const {
   SmallString<128> Path;
-  std::error_code EC = llvm::sys::fs::createTemporaryFile(Prefix, Suffix, Path);
+  std::error_code EC = llvm::sys::fs::getPotentiallyUniqueTempFileName(Prefix, Suffix, Path); 
   if (EC) {
     Diag(clang::diag::err_unable_to_make_temp) << EC.message();
     return "";
diff --git a/lib/Driver/ToolChain.cpp b/lib/Driver/ToolChain.cpp
index b1fddb0af5..9c5643c19d 100644
--- a/lib/Driver/ToolChain.cpp
+++ b/lib/Driver/ToolChain.cpp
@@ -146,6 +146,7 @@ static const DriverSuffix *FindDriverSuffix(StringRef ProgName, size_t &Pos) {
       {"clang-g++", "--driver-mode=g++"},
       {"clang-gcc", nullptr},
       {"clang-cl", "--driver-mode=cl"},
+      {"hcc", "--driver-mode=g++"},
       {"cc", nullptr},
       {"cpp", "--driver-mode=cpp"},
       {"cl", "--driver-mode=cl"},
@@ -474,8 +475,9 @@ bool ToolChain::needsGCovInstrumentation(const llvm::opt::ArgList &Args) {
 }
 
 Tool *ToolChain::SelectTool(const JobAction &JA) const {
-  if (getDriver().ShouldUseClangCompiler(JA)) return getClang();
   Action::ActionClass AC = JA.getKind();
+
+  if (getDriver().ShouldUseClangCompiler(JA)) return getClang();
   if (AC == Action::AssembleJobClass && useIntegratedAs())
     return getClangAs();
   return getTool(AC);
@@ -919,6 +921,9 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
 void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                    ArgStringList &CC1Args) const {}
 
+void ToolChain::AddHCCIncludeArgs(const ArgList &DriverArgs,
+                                  ArgStringList &CC1Args) const {}
+
 void ToolChain::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                     ArgStringList &CC1Args) const {}
 
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp
index dd461a1976..be85ce070f 100644
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -144,6 +144,11 @@ forAllAssociatedToolChains(Compilation &C, const JobAction &JA,
   } else if (JA.isDeviceOffloading(Action::OFK_OpenMP))
     Work(*C.getSingleOffloadToolChain<Action::OFK_Host>());
 
+  if (JA.isHostOffloading(Action::OFK_HCC))
+    Work(*C.getSingleOffloadToolChain<Action::OFK_HCC>());
+  else if (JA.isDeviceOffloading(Action::OFK_HCC))
+    Work(*C.getSingleOffloadToolChain<Action::OFK_Host>());
+
   //
   // TODO: Add support for other offloading programming models here.
   //
@@ -1154,6 +1159,10 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   if (JA.isOffloading(Action::OFK_Cuda))
     getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
 
+  if (Args.hasArg(options::OPT_famp) ||
+    Args.getLastArgValue(options::OPT_std_EQ).equals("c++amp"))
+    getToolChain().AddHCCIncludeArgs(Args, CmdArgs);
+
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.
   if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
@@ -3166,7 +3175,7 @@ static DwarfFissionKind getDebugFissionKind(const Driver &D,
 static void RenderDebugOptions(const ToolChain &TC, const Driver &D,
                                const llvm::Triple &T, const ArgList &Args,
                                bool EmitCodeView, bool IsWindowsMSVC,
-                               ArgStringList &CmdArgs,
+                               bool IsHCCKernelPath, ArgStringList &CmdArgs,
                                codegenoptions::DebugInfoKind &DebugInfoKind,
                                DwarfFissionKind &DwarfFission) {
   if (Args.hasFlag(options::OPT_fdebug_info_for_profiling,
@@ -3223,6 +3232,8 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D,
     }
   }
 
+  if (!IsHCCKernelPath ||
+       DebugInfoKind == codegenoptions::DebugLineTablesOnly) {
   // If a debugger tuning argument appeared, remember it.
   if (const Arg *A =
           Args.getLastArg(options::OPT_gTune_Group, options::OPT_ggdbN_Group)) {
@@ -3404,6 +3415,8 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D,
   if (DebuggerTuning == llvm::DebuggerKind::SCE)
     CmdArgs.push_back("-dwarf-explicit-import");
 
+  } // if (!IsHCCKernelPath)
+
   RenderDebugInfoCompressionArgs(Args, CmdArgs, D, TC);
 }
 
@@ -3486,6 +3499,33 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // FIXME: Implement custom jobs for internal actions.
   CmdArgs.push_back("-cc1");
 
+  // add HCC macros, based on compiler modes
+  if (Args.hasArg(options::OPT_hc_mode)) {
+    CmdArgs.push_back("-D__KALMAR_HC__=1");
+    CmdArgs.push_back("-D__HCC_HC__=1");
+  } else if (Args.hasArg(options::OPT_famp) ||
+    Args.getLastArgValue(options::OPT_std_EQ).equals("c++amp")) {
+    CmdArgs.push_back("-D__KALMAR_AMP__=1");
+    CmdArgs.push_back("-D__HCC_AMP__=1");
+  }
+
+  // C++ AMP-specific
+  if (JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP) ||
+      JA.ContainsActions(Action::PreprocessJobClass, types::TY_HC_KERNEL) ||
+      JA.ContainsActions(Action::PreprocessJobClass, types::TY_CXX_AMP)) {
+    // path to compile kernel codes on GPU
+    CmdArgs.push_back("-famp-is-device");
+    CmdArgs.push_back("-fno-builtin");
+    CmdArgs.push_back("-fno-common");
+    //CmdArgs.push_back("-m32"); // added below using -triple
+    CmdArgs.push_back("-O2");
+  } else if (JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP_CPU) ||
+             JA.ContainsActions(Action::PreprocessJobClass, types::TY_CXX_AMP_CPU)) {
+    // path to compile kernel codes on CPU
+    CmdArgs.push_back("-famp-is-device");
+    CmdArgs.push_back("-famp-cpu");
+  }
+
   // Add the "effective" target triple.
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
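The mode macros pushed above give HCC headers and user code a way to branch on how the front end was invoked. A sketch of the consumer side (the guarded bodies are illustrative):

```cpp
// -hc defines __KALMAR_HC__/__HCC_HC__; -famp or -std=c++amp defines
// __KALMAR_AMP__/__HCC_AMP__. Consumer code can branch accordingly.
#if defined(__HCC_HC__)
  // HC path: hc::parallel_for_each, [[hc]]-restricted lambdas, ...
#elif defined(__HCC_AMP__)
  // C++AMP path: concurrency::parallel_for_each, restrict(amp), ...
#endif
```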
@@ -3527,6 +3567,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
   }
 
+  // Make sure host triple is specified for HCC kernel compilation path
+  bool IsHCCKernelPath = JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP) ||
+                         JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP_CPU);
+  if (IsHCCKernelPath) {
+    // We have to pass the triple of the host if compiling for a HCC device
+    std::string NormalizedTriple;
+    NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()
+                         ->getTriple()
+                         .normalize();
+
+    CmdArgs.push_back("-aux-triple");
+    CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
+  }
+
   if (IsOpenMPDevice) {
     // We have to pass the triple of the host if compiling for an OpenMP device.
     std::string NormalizedTriple =
@@ -3799,7 +3853,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // Discard value names in assert builds unless otherwise specified.
   if (Args.hasFlag(options::OPT_fdiscard_value_names,
                    options::OPT_fno_discard_value_names, !IsAssertBuild))
-    CmdArgs.push_back("-discard-value-names");
+    if (!Args.hasArg(options::OPT_hc_mode))
+      CmdArgs.push_back("-discard-value-names");
 
   // Set the main file name, so that debug info works even with
   // -save-temps.
@@ -4118,7 +4174,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     AddClangCLArgs(Args, InputType, CmdArgs, &DebugInfoKind, &EmitCodeView);
 
   DwarfFissionKind DwarfFission;
-  RenderDebugOptions(TC, D, RawTriple, Args, EmitCodeView, IsWindowsMSVC,
+  RenderDebugOptions(TC, D, RawTriple, Args, EmitCodeView, IsWindowsMSVC, IsHCCKernelPath,
                      CmdArgs, DebugInfoKind, DwarfFission);
 
   // Add the split debug info name to the command lines here so we
@@ -4256,7 +4312,22 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       CmdArgs.push_back("-O3");
       D.Diag(diag::warn_O4_is_O3);
     } else {
-      A->render(Args, CmdArgs);
+      // C++ AMP-specific
+      if (JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP)) {
+        // Ignore -O0 and -O1 for GPU compilation paths: the inliner would
+        // not be enabled, and compilation would fail.
+        if (A->getOption().matches(options::OPT_O0)) {
+          D.Diag(diag::warn_drv_O0_ignored_for_GPU);
+        } else if (A->containsValue("1")) {
+          D.Diag(diag::warn_drv_O1_ignored_for_GPU);
+        } else {
+          // let all other optimization levels pass
+          A->render(Args, CmdArgs);
+        }
+      } else {
+        // normal cases
+        A->render(Args, CmdArgs);
+      }
     }
   }
 
@@ -5034,6 +5105,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                     options::OPT_fno_gnu_inline_asm, true))
     CmdArgs.push_back("-fno-gnu-inline-asm");
 
+  // Turn off vectorization support for GPU kernels for now
+  if (!IsHCCKernelPath) {
+
   // Enable vectorization per default according to the optimization level
   // selected. For optimization levels that want vectorization we use the alias
   // option to simplify the hasFlag logic.
@@ -5044,8 +5118,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                    options::OPT_fno_vectorize, EnableVec))
     CmdArgs.push_back("-vectorize-loops");
 
+  } // if (!IsHCCKernelPath)
+
   // -fslp-vectorize is enabled based on the optimization level selected.
-  bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true);
+  bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true) && !IsHCCKernelPath;
   OptSpecifier SLPVectAliasOption =
       EnableSLPVec ? options::OPT_O_Group : options::OPT_fslp_vectorize;
   if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption,
@@ -5300,6 +5376,23 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       CmdArgs.push_back("-fcuda-short-ptr");
   }
 
+  if (Args.hasArg(options::OPT_hc_mode) ||
+    Args.hasArg(options::OPT_famp) ||
+    Args.getLastArgValue(options::OPT_std_EQ).equals("c++amp")) {
+
+    // Generate *relocatable* code by default for HCC.
+    // In reality, HCC doesn't support relocatable code at the moment.
+    // What it really cares about is -fno-gpu-rdc, which instructs
+    // HCC to generate non-relocatable code.  This is a hint for HCC
+    // to enable early finalization, because kernels don't contain calls
+    // to functions defined in another module.
+    if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, true))
+      CmdArgs.push_back("-fgpu-rdc");
+  }
+
+  if (IsHIP)
+    CmdArgs.push_back("-fcuda-allow-variadic-functions");
+
   // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
   // to specify the result of the compile phase on the host, so the meaningful
   // device declarations can be identified. Also, -fopenmp-is-device is passed
@@ -5343,6 +5436,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fwhole-program-vtables");
   }
 
+  // C++ AMP-specific
+  if (JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP) ||
+      JA.ContainsActions(Action::BackendJobClass, types::TY_PP_CXX_AMP_CPU) ||
+      JA.ContainsActions(Action::BackendJobClass, types::TY_PP_HC_HOST)) {
+    CmdArgs.push_back("-emit-llvm-bc");
+  }
+
   bool RequiresSplitLTOUnit = WholeProgramVTables || Sanitize.needsLTO();
   bool SplitLTOUnit =
       Args.hasFlag(options::OPT_fsplit_lto_unit,
@@ -5451,6 +5551,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // the -cc1 command easier to edit when reproducing compiler crashes.
   if (Output.getType() == types::TY_Dependencies) {
     // Handled with other dependency code.
+  } else if (Output.isFilename() &&
+             (JA.ContainsActions(Action::PreprocessJobClass, types::TY_HC_KERNEL) ||
+              JA.ContainsActions(Action::PreprocessJobClass, types::TY_CXX_AMP) ||
+              JA.ContainsActions(Action::PreprocessJobClass, types::TY_CXX_AMP_CPU))) {
+    CmdArgs.push_back("-o");
+    SmallString<128> KernelPreprocessFile(Output.getFilename());
+    if (JA.ContainsActions(Action::PreprocessJobClass, types::TY_CXX_AMP_CPU)) {
+      llvm::sys::path::replace_extension(KernelPreprocessFile, ".amp_cpu.i");
+    } else {
+      llvm::sys::path::replace_extension(KernelPreprocessFile, ".gpu.i");
+    }
+    // Pass the rewritten name as the value of the -o pushed above.
+    CmdArgs.push_back(Args.MakeArgString(KernelPreprocessFile));
   } else if (Output.isFilename()) {
     CmdArgs.push_back("-o");
     CmdArgs.push_back(Output.getFilename());
diff --git a/lib/Driver/ToolChains/Gnu.cpp b/lib/Driver/ToolChains/Gnu.cpp
index 33cdd3585c..1e79f97cb2 100644
--- a/lib/Driver/ToolChains/Gnu.cpp
+++ b/lib/Driver/ToolChains/Gnu.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "Gnu.h"
+#include "Linux.h"
+#include "Hcc.h"
 #include "Arch/ARM.h"
 #include "Arch/Mips.h"
 #include "Arch/PPC.h"
@@ -341,11 +343,14 @@ static bool getStatic(const ArgList &Args) {
       !Args.hasArg(options::OPT_static_pie);
 }
 
-void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
-                                           const InputInfo &Output,
-                                           const InputInfoList &Inputs,
-                                           const ArgList &Args,
-                                           const char *LinkingOutput) const {
+
+void tools::gnutools::Linker::ConstructLinkerJob(Compilation &C,
+                                    const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput,
+                                    ArgStringList &CmdArgs) const {
   const toolchains::Linux &ToolChain =
       static_cast<const toolchains::Linux &>(getToolChain());
   const Driver &D = ToolChain.getDriver();
@@ -362,8 +367,6 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
       ToolChain.getTriple().hasEnvironment() ||
       (ToolChain.getTriple().getVendor() != llvm::Triple::MipsTechnologies);
 
-  ArgStringList CmdArgs;
-
   // Silence warning for "clang -g foo.o -o foo"
   Args.ClaimAllArgs(options::OPT_g_Group);
   // and "clang -emit-llvm foo.o -o foo"
@@ -619,15 +622,53 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
+  // HCC: Add compiler-rt library to get the half fp builtins
+  if (C.getArgs().hasArg(options::OPT_famp) ||
+    C.getArgs().getLastArgValue(options::OPT_std_EQ).equals("c++amp")) {
+    CmdArgs.push_back(Args.MakeArgString(
+        "-lclang_rt.builtins-" +
+        getToolChain().getTriple().getArchName()));
+  }
+
   // Add OpenMP offloading linker script args if required.
   AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
-
   // Add HIP offloading linker script args if required.
   AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
                      *this);
+}
 
-  const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
-  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+void tools::gnutools::Linker::ConstructJob(Compilation &C,
+                                    const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  // ToDo: Find a better way to persist CXXAMPLink and construct the link
+  // job using it.
+  if (C.getArgs().hasArg(options::OPT_famp) ||
+    C.getArgs().getLastArgValue(options::OPT_std_EQ).equals("c++amp")) {
+    ArgStringList CmdArgs;
+
+    if (!HCLinker)
+      HCLinker = std::unique_ptr<HCC::CXXAMPLink>(new HCC::CXXAMPLink(getToolChain()));
+
+    if (C.getArgs().hasFlag(options::OPT_hc_function_calls, {}, false)) {
+      CmdArgs.emplace_back("--amdgpu-func-calls");
+    }
+  
+    HCLinker->ConstructLinkerJob(C, JA, Output, Inputs, Args, LinkingOutput, CmdArgs);
+    this->ConstructLinkerJob(C, JA, Output, Inputs, Args, LinkingOutput, CmdArgs);
+
+    const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("clamp-link"));
+    C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+  } else {
+    ArgStringList CmdArgs;
+
+    ConstructLinkerJob(C, JA, Output, Inputs, Args, LinkingOutput, CmdArgs);
+
+    const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath());
+    C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+  }
 }
 
 void tools::gnutools::Assembler::ConstructJob(Compilation &C,
@@ -2499,7 +2540,8 @@ bool Generic_GCC::GCCInstallationDetector::ScanGentooGccConfig(
 Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple &Triple,
                          const ArgList &Args)
     : ToolChain(D, Triple, Args), GCCInstallation(D),
-      CudaInstallation(D, Triple, Args) {
+      CudaInstallation(D, Triple, Args),
+      HCCInstallation(D, Args) {
   getProgramPaths().push_back(getDriver().getInstalledDir());
   if (getDriver().getInstalledDir() != getDriver().Dir)
     getProgramPaths().push_back(getDriver().Dir);
@@ -2532,6 +2574,7 @@ void Generic_GCC::printVerboseInfo(raw_ostream &OS) const {
   // Print the information about how we detected the GCC installation.
   GCCInstallation.print(OS);
   CudaInstallation.print(OS);
+  HCCInstallation.print(OS);
 }
 
 bool Generic_GCC::IsUnwindTablesDefault(const ArgList &Args) const {
diff --git a/lib/Driver/ToolChains/Gnu.h b/lib/Driver/ToolChains/Gnu.h
index 3bb38c498b..6a2c45ff5f 100644
--- a/lib/Driver/ToolChains/Gnu.h
+++ b/lib/Driver/ToolChains/Gnu.h
@@ -9,9 +9,11 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_GNU_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_GNU_H
 
+#include "Hcc.h"
 #include "Cuda.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
+#include <memory>
 #include <set>
 
 namespace clang {
@@ -60,8 +62,11 @@ public:
 };
 
 class LLVM_LIBRARY_VISIBILITY Linker : public GnuTool {
+  mutable std::unique_ptr<HCC::CXXAMPLink> HCLinker;
+
 public:
   Linker(const ToolChain &TC) : GnuTool("GNU::Linker", "linker", TC) {}
+  Linker(const ToolChain &TC, const char* Name) : GnuTool(Name, "linker", TC) {}
 
   bool hasIntegratedCPP() const override { return false; }
   bool isLinkJob() const override { return true; }
@@ -70,6 +75,13 @@ public:
                     const InputInfo &Output, const InputInfoList &Inputs,
                     const llvm::opt::ArgList &TCArgs,
                     const char *LinkingOutput) const override;
+protected:
+  virtual void ConstructLinkerJob(Compilation &C, const JobAction &JA,
+                                  const InputInfo &Output,
+                                  const InputInfoList &Inputs,
+                                  const llvm::opt::ArgList &Args,
+                                  const char *LinkingOutput,
+                                  llvm::opt::ArgStringList &CmdArgs) const; 
 };
 } // end namespace gnutools
 
@@ -278,6 +290,9 @@ public:
 protected:
   GCCInstallationDetector GCCInstallation;
   CudaInstallationDetector CudaInstallation;
+  HCCInstallationDetector HCCInstallation;
+
+  friend class tools::HCC::CXXAMPLink;
 
 public:
   Generic_GCC(const Driver &D, const llvm::Triple &Triple,
diff --git a/lib/Driver/ToolChains/HIP.cpp b/lib/Driver/ToolChains/HIP.cpp
index 2ec97e798f..e800f0a23b 100644
--- a/lib/Driver/ToolChains/HIP.cpp
+++ b/lib/Driver/ToolChains/HIP.cpp
@@ -126,8 +126,12 @@ const char *AMDGCN::Linker::constructLlcCommand(
     const llvm::opt::ArgList &Args, llvm::StringRef SubArchName,
     llvm::StringRef OutputFilePrefix, const char *InputFileName) const {
   // Construct llc command.
+  // FIXME: -disable-promote-alloca-to-lds is a workaround for issues in
+  // AMDGPUPromoteAlloca pass which cause invalid memory access in PyTorch.
+  // Remove this once the issue is fixed.
   ArgStringList LlcArgs{InputFileName, "-mtriple=amdgcn-amd-amdhsa",
                         "-filetype=obj",
+                        "-disable-promote-alloca-to-lds",
                         Args.MakeArgString("-mcpu=" + SubArchName)};
 
   // Extract all the -m options
@@ -279,6 +283,8 @@ void HIPToolChain::addClangTargetOptions(
                          false))
     CC1Args.push_back("-fgpu-rdc");
 
+  CC1Args.push_back("-fcuda-allow-variadic-functions");
+
   // Default to "hidden" visibility, as object level linking will not be
   // supported for the foreseeable future.
   if (!DriverArgs.hasArg(options::OPT_fvisibility_EQ,
diff --git a/lib/Driver/ToolChains/Hcc.cpp b/lib/Driver/ToolChains/Hcc.cpp
new file mode 100755
index 0000000000..1cb745d4e8
--- /dev/null
+++ b/lib/Driver/ToolChains/Hcc.cpp
@@ -0,0 +1,440 @@
+//===--- Hcc.cpp - HCC Tool and ToolChain Implementations -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hcc.h"
+#include "Gnu.h"
+#include "InputInfo.h"
+#include "clang/Driver/Compilation.h"
+#include "clang/Driver/Driver.h"
+#include "clang/Driver/DriverDiagnostic.h"
+#include "clang/Driver/Options.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <sys/wait.h>
+
+using namespace clang;
+using namespace clang::driver;
+using namespace clang::driver::toolchains;
+using namespace clang::driver::tools;
+using namespace llvm::opt;
+
+bool FunctionCallDefault = false;
+
+#ifndef HCC_TOOLCHAIN_RHEL
+  #define HCC_TOOLCHAIN_RHEL false
+#endif
+
+HCCInstallationDetector::HCCInstallationDetector(const Driver &D, const llvm::opt::ArgList &Args) : D(D) {
+  std::string BinPath = D.Dir;
+  std::string InstallPath = D.InstalledDir;
+  auto &FS = D.getVFS();
+  SmallVector<std::string, 4> HCCPathCandidates;
+
+  if (Args.hasArg(options::OPT_hcc_path_EQ))
+    HCCPathCandidates.push_back(
+      Args.getLastArgValue(options::OPT_hcc_path_EQ));
+    
+  HCCPathCandidates.push_back(InstallPath + "/..");
+  HCCPathCandidates.push_back(BinPath + "/..");
+  HCCPathCandidates.push_back(BinPath + "/../..");
+
+  for (const auto &HCCPath: HCCPathCandidates) {
+    if (HCCPath.empty() ||
+        !(FS.exists(HCCPath + "/include/hc.hpp") || FS.exists(HCCPath + "/include/hcc/hc.hpp")) || 
+        !FS.exists(HCCPath + "/lib/libmcwamp.so"))
+      continue;
+
+    IncPath = HCCPath;
+    LibPath = HCCPath + "/lib";
+
+    IsValid = true;
+    break;
+  }
+}
+
+void HCCInstallationDetector::AddHCCIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const {
+  if (IsValid) {
+    CC1Args.push_back(DriverArgs.MakeArgString("-I" + IncPath + "/include"));
+    CC1Args.push_back(DriverArgs.MakeArgString("-I" + IncPath + "/hcc/include"));
+  }
+}
+
+void HCCInstallationDetector::AddHCCLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {
+  if (IsValid) {
+    // add verbose flag to linker script if clang++ is invoked with --verbose flag
+    if (Args.hasArg(options::OPT_v)) CmdArgs.push_back("--verbose");
+        
+    // Reverse translate the -lstdc++ option
+    // Or add -lstdc++ when running on RHEL 7 or CentOS 7
+    if (Args.hasArg(options::OPT_Z_reserved_lib_stdcxx) ||
+      HCC_TOOLCHAIN_RHEL) {
+      CmdArgs.push_back("-lstdc++");
+    }
+
+    CmdArgs.push_back(Args.MakeArgString("-L" + LibPath));
+    CmdArgs.push_back(Args.MakeArgString("--rpath=" + LibPath));
+
+    for (auto &lib: SystemLibs)
+      CmdArgs.push_back(lib);
+    
+    for (auto &lib: RuntimeLibs)
+      CmdArgs.push_back(lib);
+
+    if (Args.hasArg(options::OPT_hcc_extra_libs_EQ)) {
+      auto HccExtraLibs = Args.getAllArgValues(options::OPT_hcc_extra_libs_EQ);
+      std::string prefix{"--hcc-extra-libs="};
+
+      for(auto&& Lib:HccExtraLibs)
+        CmdArgs.push_back(Args.MakeArgString(prefix + Lib));
+    }
+  }
+}
+
+void HCCInstallationDetector::print(raw_ostream &OS) const {
+  if (IsValid)
+    OS << "Found HCC installation: " << IncPath << "\n";
+}
+    
+namespace
+{
+    struct Process_deleter {
+        int status = EXIT_FAILURE;
+        void operator()(std::FILE* p)
+        {
+            if (p) {
+                status = pclose(p);
+                status = WIFEXITED(status) ? WEXITSTATUS(status) : status;
+            }
+        }
+    };
+
+    std::vector<std::string> detect_gfxip(
+        const Compilation& c, const ToolChain& tc)
+    {   // Invariant: if it executes correctly, rocm_agent_enumerator returns
+        //            at least gfx000; returning only gfx000 signals that no
+        //            valid GPU agents are present.
+        // Invariant: if it executes correctly and valid GPU agents are
+        //            present, rocm_agent_enumerator returns their union
+        //            together with gfx000.
+        std::vector<std::string> r;
+
+        const char* tmp = std::getenv("ROCM_ROOT");
+        const Twine rocm = tmp ? tmp : "/opt/rocm";
+        const Twine e = rocm + "/bin/rocm_agent_enumerator";
+
+        if (!tc.getVFS().exists(e)) return r;
+
+        Process_deleter d;
+        std::unique_ptr<std::FILE, Process_deleter> pipe{
+            popen((e.str() + " --type GPU").c_str(), "r"), d};
+
+        if (!pipe) return r;
+
+        static constexpr std::size_t buf_sz = 16u;
+        std::array<char, buf_sz> buf = {{}};
+        while (std::fgets(buf.data(), buf.size(), pipe.get())) {
+            r.emplace_back(buf.data());
+        }
+
+        for (auto&& x : r) { // fgets copies the newline.
+            x.erase(std::remove(x.begin(), x.end(), '\n'), x.end());
+        }
+
+        if (r.size() > 1) {
+            std::sort(r.rbegin(), r.rend());
+            r.pop_back(); // Remove null-agent.
+        }
+
+        return r;
+    }
+
+    std::vector<std::string> detect_and_add_targets(
+        const Compilation& c, const ToolChain& tc)
+    {
+        constexpr const char null_agent[] = "gfx000";
+
+        const auto detected_targets = detect_gfxip(c, tc);
+        if (detected_targets.empty()) {
+            c.getDriver().Diag(diag::warn_amdgpu_agent_detector_failed);
+        }
+        else if (detected_targets[0] == null_agent) {
+            c.getDriver().Diag(diag::err_amdgpu_no_agent_available);
+        }
+
+        return detected_targets;
+    }
+
+    bool is_valid(const std::string& gfxip)
+    {
+        static constexpr std::array<const char*, 4u> valid = {
+            { "gfx701", "gfx803", "gfx900", "gfx906" }};
+
+        return std::find(valid.cbegin(), valid.cend(), gfxip) != valid.cend();
+    }
+
+    bool is_deprecated(const std::string& gfxip)
+    {
+        static constexpr std::array<const char*, 1u> deprecated = {{"gfx700"}};
+
+        return std::find(
+            deprecated.cbegin(), deprecated.cend(), gfxip) != deprecated.cend();
+    }
+
+    void validate_and_add_to_command(
+        const std::string& gfxip,
+        const Compilation& c,
+        const ArgList& args,
+        ArgStringList& cmd_args)
+    {
+        static constexpr const char prefix[] = "--amdgpu-target=";
+
+        if (!is_valid(gfxip)) {
+            c.getDriver().Diag(diag::warn_amdgpu_target_invalid) << gfxip;
+            return;
+        }
+
+        if (is_deprecated(gfxip)) {
+            c.getDriver().Diag(diag::warn_amdgpu_target_deprecated) << gfxip;
+        }
+        cmd_args.push_back(args.MakeArgString(prefix + gfxip));
+    }
+
+    template<typename T>
+    void split(const std::string& s, char delim, T result)
+    {
+        std::stringstream ss;
+        ss.str(s);
+        std::string item;
+        while (std::getline(ss, item, delim)) {
+            *(result++) = item;
+        }
+    }
+
+    std::vector<std::string> split_gfx_list(
+        const std::string& gfx_list,
+        char delim)
+    {
+        std::vector<std::string> elems;
+        split(gfx_list, delim, std::back_inserter(elems));
+        return elems;
+    }
+
+    template <typename T>
+    void remove_duplicate_targets(std::vector<T>& TargetVec)
+    {
+        std::sort(TargetVec.begin(), TargetVec.end());
+        TargetVec.erase(unique(TargetVec.begin(), TargetVec.end()), TargetVec.end());
+    }
+
+    void construct_amdgpu_target_cmdargs(
+        Compilation &C,
+        const ToolChain& tc,
+        const ArgList &Args,
+        ArgStringList &CmdArgs)
+    {
+        // specify AMDGPU target
+        constexpr const char auto_tgt[] = "auto";
+
+        #if !defined(HCC_AMDGPU_TARGET)
+            #define HCC_AMDGPU_TARGET auto_tgt
+        #endif
+
+        auto AMDGPUTargetVector =
+            Args.getAllArgValues(options::OPT_amdgpu_target_EQ);
+
+        if (AMDGPUTargetVector.empty()) {
+            // split HCC_AMDGPU_TARGET list up
+            AMDGPUTargetVector = split_gfx_list(HCC_AMDGPU_TARGET, ' ');
+        }
+
+        const auto cnt = std::count(
+            AMDGPUTargetVector.cbegin(), AMDGPUTargetVector.cend(), auto_tgt);
+
+        if (cnt > 1) C.getDriver().Diag(diag::warn_amdgpu_target_auto_nonsingular);
+        if (static_cast<std::size_t>(cnt) == AMDGPUTargetVector.size()) {
+            AMDGPUTargetVector = detect_and_add_targets(C, tc);
+        }
+        AMDGPUTargetVector.erase(
+            std::remove(
+                AMDGPUTargetVector.begin(), AMDGPUTargetVector.end(), auto_tgt),
+            AMDGPUTargetVector.end());
+
+        remove_duplicate_targets(AMDGPUTargetVector);
+
+        for (auto&& AMDGPUTarget : AMDGPUTargetVector) {
+            validate_and_add_to_command(AMDGPUTarget, C, Args, CmdArgs);
+        }
+    }
+}
+
+void HCC::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
+
+  ArgStringList CmdArgs;
+  for (InputInfoList::const_iterator
+         it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) {
+    const InputInfo &II = *it;
+    if (II.isFilename())
+      CmdArgs.push_back(II.getFilename());
+    else
+      II.getInputArg().renderAsInput(Args, CmdArgs);
+  }
+
+  if (Output.isFilename())
+    CmdArgs.push_back(Output.getFilename());
+  else
+    Output.getInputArg().renderAsInput(Args, CmdArgs);
+
+  if (JA.getKind() == Action::AssembleJobClass) {
+    if (!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, true)) {
+      if (Args.hasFlag(options::OPT_hc_function_calls, {}, false)) {
+        CmdArgs.push_back("--amdgpu-func-calls");
+      }
+      CmdArgs.push_back("--early_finalize");
+      // add the amdgpu target args
+      construct_amdgpu_target_cmdargs(C, getToolChain(), Args, CmdArgs);
+    }
+    const char *Exec = Args.MakeArgString(
+      getToolChain().GetProgramPath("hc-kernel-assemble"));
+    C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+  }
+}
+
+
+void HCC::CXXAMPLink::ConstructLinkerJob(
+    Compilation &C,
+    const JobAction &JA,
+    const InputInfo &Output,
+    const InputInfoList &Inputs,
+    const ArgList &Args,
+    const char *LinkingOutput,
+    ArgStringList &CmdArgs) const
+{
+    const auto &TC = static_cast<const toolchains::Generic_ELF &>(getToolChain());
+    TC.HCCInstallation.AddHCCLibArgs(Args, CmdArgs);
+
+    construct_amdgpu_target_cmdargs(C, getToolChain(), Args, CmdArgs);
+}
+
+void HCC::CXXAMPLink::ConstructJob(Compilation &C,
+                                   const JobAction &JA,
+                                   const InputInfo &Output,
+                                   const InputInfoList &Inputs,
+                                   const ArgList &Args,
+                                   const char *LinkingOutput) const {
+}
+
+/// HCC toolchain.
+/// It may operate in two modes, depending on the Environment in the Triple:
+/// - C++AMP mode:
+///   - use clamp-assemble as assembler
+///   - use clamp-link as linker
+/// - HC mode:
+///   - use hc-kernel-assemble as assembler for kernel path
+///   - use hc-host-assemble as assembler for host path
+///   - use clamp-link as linker
+
+HCCToolChain::HCCToolChain(const Driver &D, const llvm::Triple &Triple,
+                           const ToolChain &HostTC, const ArgList &Args)
+    : ToolChain(D, Triple, Args), HostTC(HostTC) {
+  getProgramPaths().push_back(getDriver().getInstalledDir());
+  if (getDriver().getInstalledDir() != getDriver().Dir)
+    getProgramPaths().push_back(getDriver().Dir);
+}
+
+void HCCToolChain::addClangTargetOptions(
+    const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+    Action::OffloadKind DeviceOffloadKind) const {
+  HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadKind);
+
+  // TBD, depends on mode set correct arguments
+}
+
+void HCCToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const {
+  HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
+}
+
+void HCCToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args, ArgStringList &CC1Args) const {
+  HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
+}
+
+void HCCToolChain::AddHCCIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const {
+  HostTC.AddHCCIncludeArgs(DriverArgs, CC1Args);
+}
+
+llvm::opt::DerivedArgList *
+HCCToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                            StringRef BoundArch,
+                            Action::OffloadKind DeviceOffloadKind) const {
+  // TBD look into what should be properly implemented
+  DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
+  const OptTable &Opts = getDriver().getOpts();
+
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_Xarch__)) {
+      // Skip this argument unless the architecture matches BoundArch
+      if (BoundArch.empty() || A->getValue(0) != BoundArch)
+        continue;
+
+      unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
+      unsigned Prev = Index;
+      std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));
+
+      // If the argument parsing failed or more than one argument was
+      // consumed, the -Xarch_ argument's parameter tried to consume
+      // extra arguments. Emit an error and ignore.
+      //
+      // We also want to disallow any options which would alter the
+      // driver behavior; that isn't going to work in our model. We
+      // use isDriverOption() as an approximation, although things
+      // like -O4 are going to slip through.
+      if (!XarchArg || Index > Prev + 1) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
+            << A->getAsString(Args);
+        continue;
+      } else if (XarchArg->getOption().hasFlag(options::DriverOption)) {
+        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
+            << A->getAsString(Args);
+        continue;
+      }
+      XarchArg->setBaseArg(A);
+      A = XarchArg.release();
+      DAL->AddSynthesizedArg(A);
+    }
+    DAL->append(A);
+  }
+
+  if (!BoundArch.empty())
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  return DAL;
+}
+
+Tool *HCCToolChain::buildAssembler() const {
+  return new tools::HCC::Assembler(*this);
+}
+
+Tool *HCCToolChain::buildLinker() const {
+  return new tools::HCC::CXXAMPLink(*this);
+}
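detect_gfxip above ties the popen'd pipe to a unique_ptr so pclose() runs on every exit path. A standalone sketch of the same RAII pattern, outside clang (the function name, buffer size, and the `--type GPU` invocation mirror the patch; everything else is illustrative):

```cpp
#include <array>
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

// Run an enumerator binary and collect one agent name per output line.
std::vector<std::string> run_enumerator(const std::string &exe) {
  auto closer = [](std::FILE *p) { if (p) pclose(p); };
  std::unique_ptr<std::FILE, decltype(closer)> pipe{
      popen((exe + " --type GPU").c_str(), "r"), closer};

  std::vector<std::string> lines;
  if (!pipe) return lines;

  std::array<char, 16> buf{};
  while (std::fgets(buf.data(), buf.size(), pipe.get())) {
    std::string s{buf.data()};
    if (!s.empty() && s.back() == '\n') s.pop_back(); // fgets keeps '\n'
    lines.push_back(std::move(s));
  }
  return lines; // pclose() runs when 'pipe' goes out of scope
}
```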
diff --git a/lib/Driver/ToolChains/Hcc.h b/lib/Driver/ToolChains/Hcc.h
new file mode 100755
index 0000000000..c851d83044
--- /dev/null
+++ b/lib/Driver/ToolChains/Hcc.h
@@ -0,0 +1,139 @@
+//===--- Hcc.h - HCC ToolChain Implementations ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_HCC_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_HCC_H
+
+#include "clang/Driver/Action.h"
+#include "clang/Driver/ToolChain.h"
+#include "clang/Driver/Tool.h"
+#include "llvm/Support/Compiler.h"
+
+extern bool FunctionCallDefault;
+
+namespace clang {
+namespace driver {
+
+class HCCInstallationDetector {
+private:
+  const Driver &D;
+  bool IsValid = false;
+
+  std::string IncPath;
+  std::string LibPath;
+
+  std::vector<const char *> SystemLibs = {"-ldl", "-lm", "-lpthread"};
+  std::vector<const char *> RuntimeLibs = {"-lhc_am", "-lmcwamp"};
+
+public:
+  HCCInstallationDetector(const Driver &D, const llvm::opt::ArgList &Args);
+
+  void AddHCCIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const;
+
+  void AddHCCLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const;
+
+  bool isValid() const { return IsValid; }
+
+  void print(raw_ostream &OS) const;
+};
+
+namespace tools {
+namespace HCC {
+
+/// \brief HC assembler tool.
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+public:
+  Assembler(const ToolChain &TC)
+      : Tool("hc-assemble", "HC assembler", TC) {}
+
+  bool hasGoodDiagnostics() const override { return true; }
+  bool hasIntegratedAssembler() const override { return false; }
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output,
+                    const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+/// \brief C++AMP linker.
+class LLVM_LIBRARY_VISIBILITY CXXAMPLink : public Tool {
+public:
+  CXXAMPLink(const ToolChain &TC) : Tool("clamp-link", "HC linker", TC) {}
+
+  bool hasGoodDiagnostics() const override { return true; }
+  bool hasIntegratedAssembler() const override { return false; }
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C,
+                    const JobAction &JA,
+                    const InputInfo &Output,
+                    const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &Args,
+                    const char *LinkingOutput) const override;
+
+  void ConstructLinkerJob(Compilation &C,
+                          const JobAction &JA,
+                          const InputInfo &Output,
+                          const InputInfoList &Inputs,
+                          const llvm::opt::ArgList &Args,
+                          const char *LinkingOutput,
+                          llvm::opt::ArgStringList &CmdArgs) const;
+};
+
+} // end namespace HCC
+} // end namespace tools
+
+namespace toolchains {
+
+class LLVM_LIBRARY_VISIBILITY HCCToolChain : public ToolChain {
+public:
+  HCCToolChain(const Driver &D, const llvm::Triple &Triple,
+               const ToolChain &HostTC, const llvm::opt::ArgList &Args);
+
+  llvm::opt::DerivedArgList *
+  TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                StringRef BoundArch,
+                Action::OffloadKind DeviceOffloadKind) const override;
+  void
+  addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                        llvm::opt::ArgStringList &CC1Args,
+                        Action::OffloadKind DeviceOffloadKind) const override;
+
+  void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override;
+
+  void AddClangCXXStdlibIncludeArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CC1Args) const override;
+
+  void AddHCCIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override;
+
+  bool useIntegratedAs() const override { return false; }
+
+  // HCC ToolChain uses DWARF version 2 by default
+  unsigned GetDefaultDwarfVersion() const override { return 2; }
+
+  // HCC ToolChain doesn't support "-pg"-style profiling yet
+  bool SupportsProfiling() const override { return false; }
+
+  bool isPICDefault() const override { return false; }
+  bool isPIEDefault() const override { return false; }
+  bool isPICDefaultForced() const override { return false; }
+
+  const ToolChain &HostTC;
+
+protected:
+  Tool *buildAssembler() const override;
+  Tool *buildLinker() const override;
+};
+
+} // end namespace toolchains
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_HCC_H
diff --git a/lib/Driver/ToolChains/Linux.cpp b/lib/Driver/ToolChains/Linux.cpp
index d900508ad9..648fc9c448 100644
--- a/lib/Driver/ToolChains/Linux.cpp
+++ b/lib/Driver/ToolChains/Linux.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Hcc.h"
 #include "Linux.h"
 #include "Arch/ARM.h"
 #include "Arch/Mips.h"
@@ -961,6 +962,11 @@ void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs,
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
+void Linux::AddHCCIncludeArgs(const ArgList &DriverArgs,
+                              ArgStringList &CC1Args) const {
+  HCCInstallation.AddHCCIncludeArgs(DriverArgs, CC1Args);
+}
+
 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                 ArgStringList &CC1Args) const {
   if (GCCInstallation.isValid()) {
diff --git a/lib/Driver/ToolChains/Linux.h b/lib/Driver/ToolChains/Linux.h
index 4c61994691..282bc41b97 100644
--- a/lib/Driver/ToolChains/Linux.h
+++ b/lib/Driver/ToolChains/Linux.h
@@ -34,6 +34,8 @@ public:
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
+  void AddHCCIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
   CXXStdlibType GetDefaultCXXStdlibType() const override;
diff --git a/lib/Driver/Types.cpp b/lib/Driver/Types.cpp
index 96937678ac..37eb5fad98 100644
--- a/lib/Driver/Types.cpp
+++ b/lib/Driver/Types.cpp
@@ -106,6 +106,9 @@ bool types::isAcceptedByClang(ID Id) {
   case TY_HIP_DEVICE:
   case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias:
   case TY_CXX: case TY_PP_CXX:
+  case TY_CXX_AMP: case TY_PP_CXX_AMP: case TY_CXX_AMP_CPU:
+  case TY_PP_CXX_AMP_CPU:
+  case TY_HC_KERNEL: case TY_HC_HOST: case TY_PP_HC_HOST:
   case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
   case TY_CHeader: case TY_PP_CHeader:
   case TY_CLHeader:
@@ -138,6 +141,9 @@ bool types::isCXX(ID Id) {
     return false;
 
   case TY_CXX: case TY_PP_CXX:
+  case TY_CXX_AMP: case TY_PP_CXX_AMP: case TY_CXX_AMP_CPU:
+  case TY_PP_CXX_AMP_CPU:
+  case TY_HC_KERNEL: case TY_HC_HOST: case TY_PP_HC_HOST:
   case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
   case TY_CXXHeader: case TY_PP_CXXHeader:
   case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader:
@@ -187,6 +193,19 @@ bool types::isHIP(ID Id) {
   }
 }
 
+bool types::isHCC(ID Id) {
+  switch (Id) {
+  default:
+    return false;
+
+  case TY_CXX_AMP:
+  case TY_CXX_AMP_CPU:
+  case TY_HC_HOST:
+  case TY_HC_KERNEL:
+    return true;
+  }
+}
+
 bool types::isSrcFile(ID Id) {
   return Id != TY_Object && getPreprocessedType(Id) != TY_INVALID;
 }
diff --git a/lib/Frontend/CompilerInstance.cpp b/lib/Frontend/CompilerInstance.cpp
index cf0267549e..ad1013fe43 100644
--- a/lib/Frontend/CompilerInstance.cpp
+++ b/lib/Frontend/CompilerInstance.cpp
@@ -897,9 +897,8 @@ bool CompilerInstance::ExecuteAction(FrontendAction &Act) {
   if (!hasTarget())
     return false;
 
-  // Create TargetInfo for the other side of CUDA and OpenMP compilation.
-  if ((getLangOpts().CUDA || getLangOpts().OpenMPIsDevice) &&
-      !getFrontendOpts().AuxTriple.empty()) {
+  // Create TargetInfo for the other side of CUDA and HCC compilation.
+  if ((getLangOpts().CUDA || getLangOpts().CPlusPlusAMP) && !getFrontendOpts().AuxTriple.empty()) {
     auto TO = std::make_shared<TargetOptions>();
     TO->Triple = llvm::Triple::normalize(getFrontendOpts().AuxTriple);
     TO->HostTriple = getTarget().getTriple().str();
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp
index bc54e38a1a..7bb1b83959 100644
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -817,6 +817,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.PreserveAsmComments = !Args.hasArg(OPT_fno_preserve_as_comments);
   Opts.AssumeSaneOperatorNew = !Args.hasArg(OPT_fno_assume_sane_operator_new);
   Opts.ObjCAutoRefCountExceptions = Args.hasArg(OPT_fobjc_arc_exceptions);
+  Opts.AMPIsDevice = Args.hasArg(OPT_famp_is_device);
+  Opts.AMPCPU = Args.hasArg(OPT_famp_cpu);
   Opts.CXAAtExit = !Args.hasArg(OPT_fno_use_cxa_atexit);
   Opts.RegisterGlobalDtorsWithAtExit =
       Args.hasArg(OPT_fregister_global_dtors_with_atexit);
@@ -853,6 +855,7 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.CorrectlyRoundedDivSqrt =
       Args.hasArg(OPT_cl_fp32_correctly_rounded_divide_sqrt);
   Opts.UniformWGSize =
+      !Args.hasArg(OPT_famp_is_device) &&
       Args.hasArg(OPT_cl_uniform_work_group_size);
   Opts.Reciprocals = Args.getAllArgValues(OPT_mrecip_EQ);
   Opts.ReciprocalMath = Args.hasArg(OPT_freciprocal_math);
@@ -1895,6 +1898,10 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
                 .Case("cuda", InputKind::CUDA)
                 .Case("hip", InputKind::HIP)
                 .Case("c++", InputKind::CXX)
+                .Case("c++amp-kernel", InputKind::CXXAMP)     // C++ AMP support
+                .Case("hc-kernel", InputKind::CXXAMP)         // HC support
+                .Case("hc-host", InputKind::CXXAMP)           // HC support
+                .Case("c++amp-kernel-cpu", InputKind::CXXAMP) // C++ AMP support
                 .Case("objective-c", InputKind::ObjC)
                 .Case("objective-c++", InputKind::ObjCXX)
                 .Case("renderscript", InputKind::RenderScript)
@@ -2151,6 +2158,7 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK,
       LangStd = LangStandard::lang_gnu11;
 #endif
       break;
+    case InputKind::CXXAMP:
     case InputKind::CXX:
     case InputKind::ObjCXX:
 #if defined(CLANG_DEFAULT_STD_CXX)
@@ -2184,6 +2192,7 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK,
   Opts.GNUInline = !Opts.C99 && !Opts.CPlusPlus;
   Opts.HexFloats = Std.hasHexFloats();
   Opts.ImplicitInt = Std.hasImplicitInt();
+  Opts.CPlusPlusAMP |= Std.isCPlusPlusAMP();
 
   // Set OpenCL Version.
   Opts.OpenCL = Std.isOpenCL();
@@ -2287,12 +2296,18 @@ static bool IsInputCompatibleWithStandard(InputKind IK,
 
   case InputKind::CXX:
   case InputKind::ObjCXX:
-    return S.getLanguage() == InputKind::CXX;
+    return S.getLanguage() == InputKind::CXX ||
+           S.getLanguage() == InputKind::CXXAMP;
+
+  case InputKind::CXXAMP:
+    return S.getLanguage() == InputKind::CXXAMP ||
+           S.getLanguage() == InputKind::CXX;
 
   case InputKind::CUDA:
     // FIXME: What -std= values should be permitted for CUDA compilations?
     return S.getLanguage() == InputKind::CUDA ||
-           S.getLanguage() == InputKind::CXX;
+           S.getLanguage() == InputKind::CXX ||
+           S.getLanguage() == InputKind::CXXAMP;
 
   case InputKind::HIP:
     return S.getLanguage() == InputKind::CXX ||
@@ -2321,6 +2336,8 @@ static const StringRef GetInputKindName(InputKind IK) {
     return "Objective-C++";
   case InputKind::OpenCL:
     return "OpenCL";
+  case InputKind::CXXAMP:
+    return "C++AMP";
   case InputKind::CUDA:
     return "CUDA";
   case InputKind::RenderScript:
@@ -2425,6 +2442,15 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
   llvm::Triple T(TargetOpts.Triple);
   CompilerInvocation::setLangDefaults(Opts, IK, T, PPOpts, LangStd);
 
+  // Add AMP features if we got -famp.
+  Opts.CPlusPlusAMP |= Args.hasArg(options::OPT_famp);
+
+  // If AMP mode is on, enable native half-precision type support.
+  if (Opts.CPlusPlusAMP) {
+    Opts.NativeHalfType = 1;
+    Opts.NativeHalfArgsAndReturns = 1;
+  }
+
   // -cl-strict-aliasing needs to emit diagnostic in the case where CL > 1.0.
   // This option should be deprecated for CL > 1.0 because
   // this option was added for compatibility with OpenCL 1.0.
@@ -2965,6 +2991,19 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
           << Opts.OMPHostIRFile;
   }
 
+  // C++ AMP: Decide host path or device path
+  Opts.DevicePath = Args.hasArg(OPT_famp_is_device);
+  Opts.AMPCPU = Args.hasArg(OPT_famp_cpu);
+  Opts.HSAExtension = Args.hasArg(OPT_fhsa_extension);
+
+  // Rules for auto-auto:
+  //   - disabled by default, or explicitly disabled by -fno-auto-auto
+  //   - enabled by -fauto-auto (unless -fno-auto-auto is also given)
+  Opts.AutoAuto = Args.hasArg(OPT_fauto_auto) && !Args.hasArg(OPT_fno_auto_auto);
+
+  // rules for auto-compile-for-accelerator:
+  Opts.AutoCompileForAccelerator = Args.hasArg(OPT_fauto_compile_for_accelerator);
+
   Opts.SYCLIsDevice = Args.hasArg(options::OPT_fsycl_is_device);
 
   // Set CUDA mode for OpenMP target NVPTX if specified in options
@@ -3403,9 +3442,13 @@ bool CompilerInvocation::CreateFromArgs(CompilerInvocation &Res,
       Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple;
   }
 
-  // Set the triple of the host for OpenMP device compile.
-  if (LangOpts.OpenMPIsDevice)
-    Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple;
+  if (LangOpts.CPlusPlusAMP) {
+    // During HCC device-side compilation, the aux triple is the
+    // triple used for host compilation
+    if (LangOpts.DevicePath) {
+      Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple;
+    }
+  }
 
   // FIXME: Override value name discarding when asan or msan is used because the
   // backend passes depend on the name of the alloca in order to print out
diff --git a/lib/Frontend/FrontendActions.cpp b/lib/Frontend/FrontendActions.cpp
index e37afae533..8d9af3d8e1 100644
--- a/lib/Frontend/FrontendActions.cpp
+++ b/lib/Frontend/FrontendActions.cpp
@@ -834,6 +834,7 @@ void PrintPreambleAction::ExecuteAction() {
   switch (getCurrentFileKind().getLanguage()) {
   case InputKind::C:
   case InputKind::CXX:
+  case InputKind::CXXAMP:
   case InputKind::ObjC:
   case InputKind::ObjCXX:
   case InputKind::OpenCL:
diff --git a/lib/Frontend/InitPreprocessor.cpp b/lib/Frontend/InitPreprocessor.cpp
index 6feb7bcbd4..8a718d2f55 100644
--- a/lib/Frontend/InitPreprocessor.cpp
+++ b/lib/Frontend/InitPreprocessor.cpp
@@ -460,6 +460,20 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     if (LangOpts.CUDAIsDevice)
       Builder.defineMacro("__HIP_DEVICE_COMPILE__");
   }
+  if (LangOpts.DevicePath) {
+    if (LangOpts.AMPCPU) {
+      Builder.defineMacro("__AMP_CPU__", "1");
+      Builder.defineMacro("__KALMAR_ACCELERATOR__", "2");
+      Builder.defineMacro("__HCC_ACCELERATOR__", "2");
+    } else {
+      Builder.defineMacro("__GPU__", "1");
+      Builder.defineMacro("__KALMAR_ACCELERATOR__", "1");
+      Builder.defineMacro("__HCC_ACCELERATOR__", "1");
+    }
+  } else {
+    Builder.defineMacro("__KALMAR_CPU__", "1");
+    Builder.defineMacro("__HCC_CPU__", "1");
+  }
 }
 
 /// Initialize the predefined C++ language feature test macros defined in
@@ -560,16 +574,35 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   // Compiler version introspection macros.
   Builder.defineMacro("__llvm__");  // LLVM Backend
   Builder.defineMacro("__clang__"); // Clang Frontend
+
+  // hcc macros
+  Builder.defineMacro("__KALMAR_CC__", "1");
+  Builder.defineMacro("__HCC__", "1");
+
 #define TOSTR2(X) #X
 #define TOSTR(X) TOSTR2(X)
   Builder.defineMacro("__clang_major__", TOSTR(CLANG_VERSION_MAJOR));
   Builder.defineMacro("__clang_minor__", TOSTR(CLANG_VERSION_MINOR));
   Builder.defineMacro("__clang_patchlevel__", TOSTR(CLANG_VERSION_PATCHLEVEL));
-#undef TOSTR
-#undef TOSTR2
   Builder.defineMacro("__clang_version__",
                       "\"" CLANG_VERSION_STRING " "
                       + getClangFullRepositoryVersion() + "\"");
+
+  // hcc version macros
+  Builder.defineMacro("__hcc_major__", TOSTR(HCC_VERSION_MAJOR));
+  Builder.defineMacro("__hcc_minor__", TOSTR(HCC_VERSION_MINOR));
+  Builder.defineMacro("__hcc_patchlevel__", TOSTR(HCC_VERSION_PATCH));
+  Builder.defineMacro("__hcc_version__", TOSTR(HCC_VERSION_STRING));
+  Builder.defineMacro("__hcc_workweek__", TOSTR(HCC_VERSION_WORKWEEK));
+
+  // hcc backend macro. possible values are:
+  // - CL : for non-HSA systems
+  // - HLC : for HLC backend
+  // - AMDGPU : for Lightning backend
+  Builder.defineMacro("__hcc_backend__", TOSTR(KALMAR_BACKEND));
+
+#undef TOSTR
+#undef TOSTR2
   if (!LangOpts.MSVCCompat) {
     // Currently claim to be compatible with GCC 4.2.1-5621, but only if we're
     // not compiling for MSVC compatibility
@@ -1110,7 +1143,7 @@ void clang::InitializePreprocessor(
   if (InitOpts.UsePredefines) {
     // FIXME: This will create multiple definitions for most of the predefined
     // macros. This is not the right way to handle this.
-    if ((LangOpts.CUDA || LangOpts.OpenMPIsDevice) && PP.getAuxTargetInfo())
+    if ((LangOpts.CUDA || LangOpts.OpenMPIsDevice || LangOpts.CPlusPlusAMP) && PP.getAuxTargetInfo())
       InitializePredefinedMacros(*PP.getAuxTargetInfo(), LangOpts, FEOpts,
                                  Builder);
 
diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp
index 73b4f50fda..fbd79c6ff4 100644
--- a/lib/Parse/ParseDecl.cpp
+++ b/lib/Parse/ParseDecl.cpp
@@ -801,6 +801,13 @@ void Parser::ParseOpenCLQualifiers(ParsedAttributes &Attrs) {
                ParsedAttr::AS_Keyword);
 }
 
+void Parser::ParseHCCQualifiers(ParsedAttributes &Attrs) {
+  IdentifierInfo *AttrName = Tok.getIdentifierInfo();
+  SourceLocation AttrNameLoc = Tok.getLocation();
+  Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+               ParsedAttr::AS_Keyword);
+}
+
 void Parser::ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs) {
   // Treat these like attributes, even though they're type specifiers.
   while (true) {
@@ -2214,6 +2221,14 @@ Decl *Parser::ParseDeclarationAfterDeclarator(
   return ParseDeclarationAfterDeclaratorAndAttributes(D, TemplateInfo);
 }
 
+// Check whether a given Declarator declares a tile_static variable.
+static bool IsTileStatic(Declarator &D) {
+  return D.getDeclSpec().hasAttributes() &&
+         D.getDeclSpec().getAttributes().hasAttribute(
+             ParsedAttr::AT_HCCTileStatic);
+}
+
 Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
     Declarator &D, const ParsedTemplateInfo &TemplateInfo, ForRangeInit *FRI) {
   // RAII type used to track whether we're inside an initializer.
@@ -2311,6 +2326,14 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
   if (isTokenEqualOrEqualTypo()) {
     SourceLocation EqualLoc = ConsumeToken();
 
+    // C++ AMP-specific
+    // tile_static variables can't be initialized.
+    if (getLangOpts().CPlusPlusAMP) {
+      if (IsTileStatic(D)) {
+        Diag(EqualLoc, diag::err_tile_static_no_init);
+      }
+    }
+
     if (Tok.is(tok::kw_delete)) {
       if (D.isFunctionDeclarator())
         Diag(ConsumeToken(), diag::err_default_delete_in_multiple_declaration)
@@ -2363,6 +2386,15 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
                                      /*DirectInit=*/false);
     }
   } else if (Tok.is(tok::l_paren)) {
+
+    // C++ AMP-specific
+    // tile_static variables can't be initialized.
+    if (getLangOpts().CPlusPlusAMP) {
+      if (IsTileStatic(D)) {
+        Diag(Tok, diag::err_tile_static_no_init);
+      }
+    }
+
     // Parse C++ direct initializer: '(' expression-list ')'
     BalancedDelimiterTracker T(*this, tok::l_paren);
     T.consumeOpen();
@@ -2418,6 +2450,15 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
     }
   } else if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace) &&
              (!CurParsedObjCImpl || !D.isFunctionDeclarator())) {
+
+    // C++ AMP-specific
+    // tile_static variables can't be initialized.
+    if (getLangOpts().CPlusPlusAMP) {
+      if (IsTileStatic(D)) {
+        Diag(Tok, diag::err_tile_static_no_init);
+      }
+    }
+
     // Parse C++0x braced-init-list.
     Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists);
 
@@ -3841,6 +3882,13 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
                                  getLangOpts());
       break;
     case tok::kw_restrict:
+      // Must distinguish between '__restrict' and 'restrict(cpu,amp)':
+      // '__restrict' is a type qualifier, while 'restrict(cpu,amp)' is a
+      // C++AMP restriction specifier.
+      if (getLangOpts().CPlusPlusAMP && NextToken().is(tok::l_paren))
+        return;
       isInvalid = DS.SetTypeQual(DeclSpec::TQ_restrict, Loc, PrevSpec, DiagID,
                                  getLangOpts());
       break;
@@ -5816,7 +5864,8 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
     D.SetRangeEnd(Tok.getLocation());
     ConsumeToken();
     goto PastIdentifier;
-  } else if (Tok.is(tok::identifier) && !D.mayHaveIdentifier()) {
+  } else if (Tok.is(tok::identifier) && !D.mayHaveIdentifier() &&
+             !getLangOpts().CPlusPlusAMP) { // Relax the rule for C++AMP
     // We're not allowed an identifier here, but we got one. Try to figure out
     // if the user was trying to attach a name to the type, or whether the name
     // is some unrelated trailing syntax.
@@ -6260,6 +6309,65 @@ void Parser::ParseFunctionDeclarator(Declarator &D,
       if (ParseRefQualifier(RefQualifierIsLValueRef, RefQualifierLoc))
         EndLoc = RefQualifierLoc;
 
+      // Parse C++AMP restriction specifier
+      unsigned cppampSpec = CPPAMP_None;
+      if (getLangOpts().CPlusPlusAMP) {
+        cppampSpec = ParseRestrictionSpecification(D, FnAttrs, EndLoc);
+        // Reset Scope's CXXAMP specifier
+        getCurScope()->setCXXAMPSpecifier(0);
+        // AMP-restricted function-only
+        if (cppampSpec & CPPAMP_AMP) {
+
+          // Check if there are incompatible parameters in the function
+          // declarator.
+          for (SmallVector<DeclaratorChunk::ParamInfo, 16>::iterator
+                   param = ParamInfo.begin();
+               param != ParamInfo.end(); ++param) {
+            ParmVarDecl *pvDecl = dyn_cast_or_null<ParmVarDecl>(param->Param);
+            if (pvDecl) {
+              QualType t = pvDecl->getOriginalType();
+              const Type *Ty = t.getTypePtrOrNull();
+              // Reject incompatible scalar types (unless the HSA extension
+              // relaxes this rule).
+              if (!getLangOpts().HSAExtension && Ty) {
+                if (Ty->isCharType() || Ty->isWideCharType() ||
+                    Ty->isSpecificBuiltinType(BuiltinType::Short) ||
+                    Ty->isSpecificBuiltinType(BuiltinType::LongLong) ||
+                    Ty->isSpecificBuiltinType(BuiltinType::LongDouble)) {
+                  Diag(param->IdentLoc, diag::err_amp_illegal_function_parameter);
+                }
+              }
+
+              // reject incompatible volatile type qualifier
+              if (t.isVolatileQualified()) {
+                Diag(param->IdentLoc, diag::err_amp_illegal_function_parameter_volatile);
+              }
+
+              // reject incompatible enum types
+              if (Ty && Ty->isEnumeralType()) {
+                const EnumType* ETy = dyn_cast<EnumType>(Ty);
+                if (ETy && ETy->getDecl()) {
+                  const Type* UTy = ETy->getDecl()->getIntegerType().getTypePtrOrNull();
+                  if (UTy && (UTy->isCharType() || UTy->isWideCharType() ||
+                              UTy->isSpecificBuiltinType(BuiltinType::Short) ||
+                              UTy->isSpecificBuiltinType(BuiltinType::LongLong) ||
+                              UTy->isSpecificBuiltinType(BuiltinType::LongDouble))) {
+                    Diag(param->IdentLoc, diag::err_amp_illegal_function_parameter);
+                    Diag(ETy->getDecl()->getBeginLoc(), diag::err_amp_illegal_function_parameter);
+                  }
+                }
+              }
+            }
+          }
+
+          // Check if the return type is incompatible.
+          if (!getLangOpts().HSAExtension && D.getDeclSpec().getTypeSpecType() == DeclSpec::TST_char) {
+            Diag(D.getBeginLoc(), diag::err_amp_illegal_function_return_char);
+          } else if (!getLangOpts().HSAExtension && D.getDeclSpec().getTypeSpecWidth() == DeclSpec::TSW_short) {
+            Diag(D.getBeginLoc(), diag::err_amp_illegal_function_return_short);
+          } else if (D.getDeclSpec().getTypeQualifiers() & DeclSpec::TQ_volatile) {
+            Diag(D.getBeginLoc(), diag::err_amp_illegal_function_return_volatile);
+          }
+
+          // check if the function is volatile-qualified
+          if (DS.getTypeQualifiers() & DeclSpec::TQ_volatile) {
+            Diag(D.getBeginLoc(), diag::err_amp_illegal_function_return_volatile);
+          }
+        }
+      }
+
       // C++11 [expr.prim.general]p3:
       //   If a declaration declares a member function or member function
       //   template of a class X, the expression this is a prvalue of type
@@ -6324,8 +6432,14 @@ void Parser::ParseFunctionDeclarator(Declarator &D,
                                                  DynamicExceptionRanges,
                                                  NoexceptExpr,
                                                  ExceptionSpecTokens);
-      if (ESpecType != EST_None)
-        EndLoc = ESpecRange.getEnd();
+      if (ESpecType != EST_None) {
+        EndLoc = ESpecRange.getEnd();
+
+        // C++AMP specific, reject exception specifiers for amp-restricted functions
+        if (getLangOpts().CPlusPlusAMP && (cppampSpec & CPPAMP_AMP)) {
+          Diag(ESpecRange.getBegin(), diag::err_amp_no_throw);
+        }
+      }
 
       // Parse attribute-specifier-seq[opt]. Per DR 979 and DR 1297, this goes
       // after the exception-specification.
@@ -6339,6 +6453,19 @@ void Parser::ParseFunctionDeclarator(Declarator &D,
           StartLoc = D.getDeclSpec().getTypeSpecTypeLoc();
         LocalEndLoc = Tok.getLocation();
         SourceRange Range;
+
+        // C++AMP specific
+        // Update the Scope to mark whether it is in an AMP, CPU or dual context.
+        // FIXME: only used in the trailing-return-type case for now
+        if (getLangOpts().CPlusPlusAMP) {
+          // Reset Scope's CXXAMP specifier
+          getCurScope()->setCXXAMPSpecifier(0);
+          if (cppampSpec & CPPAMP_AMP)
+            getCurScope()->setAMPScope();
+
+          if (cppampSpec & CPPAMP_CPU)
+            getCurScope()->setCPUScope();
+        }
         TrailingReturnType =
             ParseTrailingReturnType(Range, D.mayBeFollowedByCXXDirectInit());
         EndLoc = Range.getEnd();
diff --git a/lib/Parse/ParseDeclCXX.cpp b/lib/Parse/ParseDeclCXX.cpp
index 9c61c4da44..81a7b8a9ff 100644
--- a/lib/Parse/ParseDeclCXX.cpp
+++ b/lib/Parse/ParseDeclCXX.cpp
@@ -23,6 +23,8 @@
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
+#include "clang/Sema/SemaDiagnostic.h"
+#include "clang/Sema/Lookup.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/TimeProfiler.h"
 
@@ -4369,3 +4371,235 @@ void Parser::ParseMicrosoftIfExistsClassDeclaration(
 
   Braces.consumeClose();
 }
+
+/// CXXAMPFindRestrictionSeq - Consume tokens and store them in the passed
+/// token container until the token '(' is reached (which gets
+/// consumed/stored too, if ConsumeFinalToken) or an end character is
+/// reached. The end characters are '{', '}' and ';', which represent
+/// function definition start, function definition end, and declaration end,
+/// respectively.
+/// Returns true if the token '(' is found.
+/// NOTE: This is a specialized version of Parser::ConsumeAndStoreUntil.
+bool Parser::CXXAMPFindRestrictionSeq(CachedTokens &Toks,
+                                      bool ConsumeFinalToken) {
+  // We always want this function to consume at least one token if the first
+  // token isn't '(' and we're not at EOF.
+  while (1) {
+    // If we found one of the tokens, stop and return true.
+    if (Tok.is(tok::l_paren)) {
+      if (ConsumeFinalToken) {
+        Toks.push_back(Tok);
+        ConsumeAnyToken();
+      }
+      return true;
+    }
+
+    switch (Tok.getKind()) {
+    case tok::eof:
+      // Ran out of tokens.
+      return false;
+    // End characters.
+    case tok::l_brace:
+    case tok::r_brace:
+      return false;
+
+    case tok::code_completion:
+      Toks.push_back(Tok);
+      ConsumeCodeCompletionToken();
+      break;
+
+    case tok::string_literal:
+    case tok::wide_string_literal:
+    case tok::utf8_string_literal:
+    case tok::utf16_string_literal:
+    case tok::utf32_string_literal:
+      Toks.push_back(Tok);
+      ConsumeStringToken();
+      break;
+    case tok::semi:
+      return false;
+    default:
+      // consume this token.
+      Toks.push_back(Tok);
+      ConsumeToken();
+      break;
+    }
+  }
+}
+
+/// Parse a C++AMP restriction-specifier-seq if present, e.g. restrict(amp,cpu).
+///
+///       restriction-specifier-seq:
+///         restriction-specifier
+///         restriction-specifier-seq restriction-specifier
+///
+///       restriction-specifier:
+///         'restrict' ( restriction-seq )
+///
+///       restriction-seq:
+///         restriction
+///         restriction-seq, restriction
+///
+///       restriction:
+///         amp-restriction
+///         'cpu'
+///
+///       amp-restriction:
+///         'amp'
+unsigned Parser::ParseRestrictionSpecification(Declarator &D,
+                                               ParsedAttributes &Attrs,
+                                               SourceLocation &DeclEndLoc) {
+  unsigned retSpec = CPPAMP_None;
+  while (1) {
+    if (Tok.isNot(tok::identifier)) // 'restrict' is an identifier
+      break;
+
+    CachedTokens Ids;
+    if (!CXXAMPFindRestrictionSeq(Ids, /*ConsumeFinalToken=*/false))
+      break;
+
+    if (Ids.size() == 0) {
+      Diag(Tok, diag::err_expected_restrict);
+      DeclEndLoc = Tok.getLocation();
+      break;
+    }
+    IdentifierInfo *II = Ids[0].getIdentifierInfo();
+    if (!II || II->getName() != "restrict") {
+      Diag(Ids[0], diag::err_expected_restrict);
+      DeclEndLoc = Tok.getLocation();
+      break;
+    }
+    if (Ids.size() > 1) {
+      Diag(Ids[1], diag::err_expected_lparen_after_restriction);
+      DeclEndLoc = Tok.getLocation();
+      break;
+    }
+
+    if (Tok.isNot(tok::l_paren)) {
+      Diag(Tok, diag::err_expected_lparen_after_restriction);
+      DeclEndLoc = Tok.getLocation();
+      break;
+    }
+
+    BalancedDelimiterTracker T(*this, tok::l_paren);
+    T.consumeOpen();
+    CachedTokens Toks;
+    if (ConsumeAndStoreUntil(tok::r_paren, Toks, /*StopAtSemi=*/true,
+                             /*ConsumeFinalToken=*/false)) {
+      // We found the ')' we expected.
+      // Parse restriction-seq: 'cpu', 'amp'
+      if (Toks.size() == 0)
+        Diag(Tok, diag::err_amp_empty_restriction);
+
+      // The following usages are supported:
+      //   restrict(,), restrict(,,,,), where the number of ',' does not matter
+      //   restrict(THE_RES,), restrict(THE_RES,,,,), where THE_RES is cpu,
+      //      amp, auto or any supported restriction specifier in C++AMP
+      bool isStart = true;
+      for (unsigned i = 0; i < Toks.size(); ++i) {
+        if (Toks[i].is(tok::comma)) {
+          isStart = true;
+        } else {
+          IdentifierInfo *II = Toks[i].getIdentifierInfo();
+          if (II) {
+            if (II->getName() == "cpu" ||   // 'cpu' is an identifier
+                II->getName() == "auto" ||  // 'auto' is a keyword
+                II->getName() == "amp") {   // 'amp' is an identifier
+              // Only accept the specifier at the start of a restriction and
+              // when it is followed by ',' or the end of the restriction-seq.
+              if (isStart && ((i == Toks.size()-1) || Toks[i+1].is(tok::comma))) {
+                SourceLocation AttrNameLoc = Toks[i].getLocation();
+                Attrs.addNew(II, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+                             ParsedAttr::AS_GNU);
+
+                if (II->getName() == "cpu")
+                  retSpec |= CPPAMP_CPU;
+
+                if (II->getName() == "amp")
+                  retSpec |= CPPAMP_AMP;
+
+                if (II->getName() == "auto")
+                  retSpec |= CPPAMP_AUTO;
+              }
+            } else {
+              // Not valid specifier
+              Diag(Toks[i], diag::err_amp_unrecognized_restriction) << II->getName();
+            }
+          } else {
+            // Punctuators
+            Diag(Toks[i], diag::err_amp_unrecognized_restriction)
+                << tok::getPunctuatorSpelling(Toks[i].getKind());
+          }
+          isStart = false;
+        }
+      }
+    }
+    T.consumeClose();
+    DeclEndLoc = T.getCloseLocation();
+  }
+
+  if (retSpec & CPPAMP_AUTO) {
+    // Since it is not completely parsed, manually determine whether it has a
+    // function body.
+    // FIXME: the following can not be determined as a function body
+    //    int f() restrict(auto) const {}
+    //    int restrict(auto) f() {}
+    ParsingDeclSpec DS(*this);
+    ParsingDeclarator PD(*this, DS, D.getContext());
+    if (!isStartOfFunctionDefinition(PD)) {
+      // The 'auto' restriction is only allowed on a function definition.
+      Diag(DeclEndLoc, diag::err_amp_expected_auto_restriction_on_definition);
+    }
+    DeclarationNameInfo NameInfo = Actions.GetNameFromUnqualifiedId(D.getName());
+    LookupResult R(Actions, NameInfo, Sema::LookupUsingDeclName,
+                    Sema::ForVisibleRedeclaration);
+    if (Actions.LookupName(R, getCurScope())) {
+      Diag(DeclEndLoc, diag::err_amp_auto_restricted_function_has_other_declaration)
+        << NameInfo.getName().getAsString();
+      for (LookupResult::iterator I = R.begin(), IEnd = R.end();
+             I != IEnd; ++I)
+        Diag((*I)->getLocation(), diag::note_auto_restricted_prev_declaration);
+    }
+  }
+  // FIXME: this is an inefficient, yet effective method:
+  // try to walk up enclosing scopes and find restriction specifiers
+  if (retSpec == CPPAMP_None) {
+    Scope *scope = getCurScope();
+    while (scope) {
+      if (scope->getFlags() & Scope::FnScope) {
+        FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(static_cast<DeclContext*>(scope->getEntity()));
+        if (FD) {
+          if (FD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+            IdentifierInfo *II = &PP.getIdentifierTable().get("amp");
+            assert(II);
+            Attrs.addNew(II, DeclEndLoc, nullptr, DeclEndLoc, nullptr, 0,
+                         ParsedAttr::AS_GNU);
+            retSpec |= CPPAMP_AMP;
+          }
+          if (FD->hasAttr<CXXAMPRestrictCPUAttr>()) {
+            IdentifierInfo *II = &PP.getIdentifierTable().get("cpu");
+            assert(II);
+            Attrs.addNew(II, DeclEndLoc, nullptr, DeclEndLoc, nullptr, 0,
+                         ParsedAttr::AS_GNU);
+            retSpec |= CPPAMP_CPU;
+          }
+        }
+      }
+      scope = scope->getParent();
+    }
+  }
+
+  return retSpec;
+}
diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp
index 7a0c07bd3b..2c75930a8a 100644
--- a/lib/Parse/ParseExpr.cpp
+++ b/lib/Parse/ParseExpr.cpp
@@ -31,6 +31,19 @@
 #include "llvm/ADT/SmallVector.h"
 using namespace clang;
 
+bool Parser::IsInAMPFunction(Scope *scope) {
+  while (scope) {
+    if (scope->getFlags() & Scope::FnScope) {
+      FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(static_cast<DeclContext*>(scope->getEntity()));
+      if (FD && FD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        return true;
+      }
+    }
+    scope = scope->getParent();
+  }
+  return false;
+}
+
 /// Simple precedence-based parser for binary/ternary operators.
 ///
 /// Note: we diverge from the C99 grammar when parsing the assignment-expression
@@ -164,8 +177,15 @@ ExprResult Parser::ParseAssignmentExpression(TypeCastState isTypeCast) {
     return ExprError();
   }
 
-  if (Tok.is(tok::kw_throw))
+  if (Tok.is(tok::kw_throw)) {
+    // C++ AMP-specific, reject if we are in an AMP-restricted function
+    if (getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath && !getLangOpts().AMPCPU) {
+      if (IsInAMPFunction(getCurScope())) {
+        Diag(Tok, diag::err_amp_illegal_keyword_throw);
+      }
+    }
     return ParseThrowExpression();
+  }
   if (Tok.is(tok::kw_co_yield))
     return ParseCoyieldExpression();
 
@@ -1217,8 +1237,14 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression,
     ConsumeToken();
     return Res;
   }
-  case tok::kw_const_cast:
   case tok::kw_dynamic_cast:
+    // C++ AMP-specific, reject if we are in an AMP-restricted function
+    if (getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath) {
+      if (IsInAMPFunction(getCurScope())) {
+        Diag(Tok, diag::err_amp_illegal_keyword_dynamiccast);
+      }
+    }
+    LLVM_FALLTHROUGH;
+  case tok::kw_const_cast:
   case tok::kw_reinterpret_cast:
   case tok::kw_static_cast:
     Res = ParseCXXCasts();
@@ -1227,6 +1253,12 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression,
     Res = ParseBuiltinBitCast();
     break;
   case tok::kw_typeid:
+    // C++ AMP-specific, reject if we are in an AMP-restricted function
+    if (getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath) {
+      if (IsInAMPFunction(getCurScope())) {
+        Diag(Tok, diag::err_amp_illegal_keyword_typeid);
+      }
+    }
     Res = ParseCXXTypeid();
     break;
   case tok::kw___uuidof:
@@ -1448,6 +1480,7 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression,
     cutOffParsing();
     return ExprError();
   }
+  case tok::kw___attribute:  // HCC hack to allow __attribute__ in front of a lambda introducer
   case tok::l_square:
     if (getLangOpts().CPlusPlus11) {
       if (getLangOpts().ObjC) {
diff --git a/lib/Parse/ParseExprCXX.cpp b/lib/Parse/ParseExprCXX.cpp
index 85c7e6c6bc..8f948ada7f 100644
--- a/lib/Parse/ParseExprCXX.cpp
+++ b/lib/Parse/ParseExprCXX.cpp
@@ -689,14 +689,16 @@ ExprResult Parser::ParseCXXIdExpression(bool isAddressOfOperand) {
 ExprResult Parser::ParseLambdaExpression() {
   // Parse lambda-introducer.
   LambdaIntroducer Intro;
-  if (ParseLambdaIntroducer(Intro)) {
+  ParsedAttributes AttrIntro(AttrFactory);
+
+  if (ParseLambdaIntroducer(Intro, AttrIntro)) {
     SkipUntil(tok::r_square, StopAtSemi);
     SkipUntil(tok::l_brace, StopAtSemi);
     SkipUntil(tok::r_brace, StopAtSemi);
     return ExprError();
   }
 
-  return ParseLambdaExpressionAfterIntroducer(Intro);
+  return ParseLambdaExpressionAfterIntroducer(Intro, AttrIntro);
 }
 
 /// Use lookahead and potentially tentative parsing to determine if we are
@@ -704,9 +706,11 @@ ExprResult Parser::ParseLambdaExpression() {
 ///
 /// If we are not looking at a lambda expression, returns ExprError().
 ExprResult Parser::TryParseLambdaExpression() {
+#if 0
   assert(getLangOpts().CPlusPlus11
          && Tok.is(tok::l_square)
          && "Not at the start of a possible lambda expression.");
+#endif
 
   const Token Next = NextToken();
   if (Next.is(tok::eof)) // Nothing else to lookup here...
@@ -714,7 +718,8 @@ ExprResult Parser::TryParseLambdaExpression() {
 
   const Token After = GetLookAheadToken(2);
   // If lookahead indicates this is a lambda...
-  if (Next.is(tok::r_square) ||     // []
+  if (Next.is(tok::kw___attribute) || // __attribute
+      Next.is(tok::r_square) ||     // []
       Next.is(tok::equal) ||        // [=
       (Next.is(tok::amp) &&         // [&] or [&,
        After.isOneOf(tok::r_square, tok::comma)) ||
@@ -735,10 +740,16 @@ ExprResult Parser::TryParseLambdaExpression() {
   // writing two routines to parse a lambda introducer, just try to parse
   // a lambda introducer first, and fall back if that fails.
   LambdaIntroducer Intro;
+  ParsedAttributes AttrIntro(AttrFactory);
+  if (getLangOpts().CPlusPlusAMP) {
+    if (TryParseLambdaIntroducer(Intro, AttrIntro))
+      return ExprEmpty();
+  }
+
   {
     TentativeParsingAction TPA(*this);
     LambdaIntroducerTentativeParse Tentative;
-    if (ParseLambdaIntroducer(Intro, &Tentative)) {
+    if (ParseLambdaIntroducer(Intro, AttrIntro, &Tentative)) {
       TPA.Commit();
       return ExprError();
     }
@@ -753,7 +764,7 @@ ExprResult Parser::TryParseLambdaExpression() {
       // non-tentative parse.
       TPA.Revert();
       Intro = LambdaIntroducer();
-      if (ParseLambdaIntroducer(Intro))
+      if (ParseLambdaIntroducer(Intro, AttrIntro))
         return ExprError();
       break;
 
@@ -765,7 +776,7 @@ ExprResult Parser::TryParseLambdaExpression() {
     }
   }
 
-  return ParseLambdaExpressionAfterIntroducer(Intro);
+  return ParseLambdaExpressionAfterIntroducer(Intro, AttrIntro);
 }
 
 /// Parse a lambda introducer.
@@ -778,11 +789,22 @@ ExprResult Parser::TryParseLambdaExpression() {
 /// \return \c true if parsing (or disambiguation) failed with a diagnostic and
 ///         the caller should bail out / recover.
 bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
+                                   ParsedAttributes &AttrIntro,
                                    LambdaIntroducerTentativeParse *Tentative) {
   if (Tentative)
     *Tentative = LambdaIntroducerTentativeParse::Success;
 
+
+  // Try to parse GNU attributes before the lambda introducer.
+  if (getLangOpts().CPlusPlusAMP) {
+    SourceLocation DeclEndLoc = Intro.Range.getEnd();
+    MaybeParseGNUAttributes(AttrIntro, &DeclEndLoc);
+  }
+
+#if 0
   assert(Tok.is(tok::l_square) && "Lambda expressions begin with '['.");
+#endif
   BalancedDelimiterTracker T(*this, tok::l_square);
   T.consumeOpen();
 
@@ -1101,11 +1123,37 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
   return false;
 }
 
+/// TryParseLambdaIntroducer - Tentatively parse a lambda introducer.
+///
+/// Returns true if it hit something unexpected.
+bool Parser::TryParseLambdaIntroducer(LambdaIntroducer &Intro,
+                                      ParsedAttributes &AttrIntro) {
+  TentativeParsingAction PA1(*this);
+
+  if (ParseLambdaIntroducer(Intro, AttrIntro)) {
+    PA1.Revert();
+    return true;
+  }
+
+  // Try to parse it again, but this time parse the init-captures too.
+  PA1.Revert();
+  Intro = LambdaIntroducer();
+  AttrIntro.clear();
+  TentativeParsingAction PA2(*this);
+
+  if (!ParseLambdaIntroducer(Intro, AttrIntro)) {
+    PA2.Commit();
+    return false;
+  }
+
+  PA2.Revert();
+  return true;
+}
+
 static void tryConsumeLambdaSpecifierToken(Parser &P,
                                            SourceLocation &MutableLoc,
                                            SourceLocation &ConstexprLoc,
                                            SourceLocation &ConstevalLoc,
                                            SourceLocation &DeclEndLoc) {
   assert(MutableLoc.isInvalid());
   assert(ConstexprLoc.isInvalid());
   // Consume constexpr-opt mutable-opt in any sequence, and set the DeclEndLoc
@@ -1179,7 +1227,8 @@ static void addConstevalToLambdaDeclSpecifier(Parser &P,
 /// ParseLambdaExpressionAfterIntroducer - Parse the rest of a lambda
 /// expression.
 ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
-                     LambdaIntroducer &Intro) {
+                     LambdaIntroducer &Intro,
+                     ParsedAttributes &AttrIntro ) {
   SourceLocation LambdaBeginLoc = Intro.Range.getBegin();
   Diag(LambdaBeginLoc, diag::warn_cxx98_compat_lambda);
 
@@ -1217,6 +1266,13 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
               << A.getName()->getName();
   };
 
+  // Try to parse GNU attributes before the parameter list.
+  SourceLocation DeclEndLoc = Intro.Range.getBegin();
+  ParsedAttributes AttrPre(AttrFactory);
+  if (getLangOpts().CPlusPlusAMP) {
+    MaybeParseGNUAttributes(AttrPre, &DeclEndLoc);
+  }
+
   // FIXME: Consider allowing this as an extension for GCC compatibiblity.
   const bool HasExplicitTemplateParams = Tok.is(tok::less);
   ParseScope TemplateParamScope(this, Scope::TemplateParamScope,
@@ -1295,7 +1351,33 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
                                    ConstevalLoc, DeclEndLoc);
 
     addConstexprToLambdaDeclSpecifier(*this, ConstexprLoc, DS);
+
+    // Parse C++AMP restriction specifier
+    unsigned cppampSpec = CPPAMP_None;
+    if (getLangOpts().CPlusPlusAMP) {
+      cppampSpec = ParseRestrictionSpecification(D, Attr, DeclEndLoc);
+
+      if (getLangOpts().HSAExtension && getLangOpts().AutoAuto) {
+        // auto-auto: automatically append restrict(auto) in case no restriction specifier is found
+        if (cppampSpec == CPPAMP_None) {
+          cppampSpec = CPPAMP_AUTO;
+          IdentifierInfo *II = &PP.getIdentifierTable().get("auto");
+          assert(II);
+          Attr.addNew(II, DeclEndLoc, nullptr, DeclEndLoc, nullptr, 0,
+                      ParsedAttr::AS_GNU);
+        }
+      }
+    }
+
+    // C++AMP
+    if (getLangOpts().CPlusPlusAMP) {
+      // take all attributes parsed before the introducer
+      Attr.takeAllFrom(AttrIntro);
+      // take all attributes parsed before the parameter list
+      Attr.takeAllFrom(AttrPre);
+    }
+
     addConstevalToLambdaDeclSpecifier(*this, ConstevalLoc, DS);
+
     // Parse exception-specification[opt].
     ExceptionSpecificationType ESpecType = EST_None;
     SourceRange ESpecRange;
@@ -1310,9 +1392,15 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
                                                NoexceptExpr,
                                                ExceptionSpecTokens);
 
-    if (ESpecType != EST_None)
+    if (ESpecType != EST_None) {
       DeclEndLoc = ESpecRange.getEnd();
 
+      // C++AMP specific, reject exception specifiers for amp-restricted functions
+      if (getLangOpts().CPlusPlusAMP && (cppampSpec & CPPAMP_AMP)) {
+        Diag(ESpecRange.getBegin(), diag::err_amp_no_throw);
+      }
+    }
+
     // Parse attribute-specifier[opt].
     MaybeParseCXX11Attributes(Attr, &DeclEndLoc);
 
@@ -1412,6 +1500,23 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
                       /*DeclsInPrototype=*/None, DeclLoc, DeclEndLoc, D,
                       TrailingReturnType),
                   std::move(Attr), DeclEndLoc);
+  } else if (Tok.is(tok::l_brace)) {
+    // Next is a compound-statement.
+    // Parse the C++AMP restriction specifier even though the lambda has no
+    // parameter list, so that the context inside the lambda's
+    // compound-statement is distinguished as cpu code or amp code, and the
+    // lambda's call operator is attached with the same restrictions as its
+    // parent function's, if any. Such a lambda expression looks like:
+    //   [] {
+    //     // The compound-statement
+    //   };
+    if (getLangOpts().CPlusPlusAMP) {
+      // Place restriction after r_square
+      SourceLocation LambdaEndLoc = Intro.Range.getEnd();
+      ParsedAttributes Attr(AttrFactory);
+      ParseRestrictionSpecification(D, Attr, LambdaEndLoc);
+      D.getAttributes().addAll(Attr.begin(), Attr.end());
+      D.getAttributePool().takeAllFrom(Attr.getPool());
+    }
   }
 
   // FIXME: Rename BlockScope -> ClosureScope if we decide to continue using
diff --git a/lib/Parse/ParseInit.cpp b/lib/Parse/ParseInit.cpp
index 7a455484b9..55f13609a2 100644
--- a/lib/Parse/ParseInit.cpp
+++ b/lib/Parse/ParseInit.cpp
@@ -68,8 +68,9 @@ bool Parser::MayBeDesignationStart() {
   RevertingTentativeParsingAction Tentative(*this);
 
   LambdaIntroducer Intro;
+  ParsedAttributes AttrIntro(AttrFactory);
   LambdaIntroducerTentativeParse ParseResult;
-  if (ParseLambdaIntroducer(Intro, &ParseResult)) {
+  if (ParseLambdaIntroducer(Intro, AttrIntro, &ParseResult)) {
     // Hit and diagnosed an error in a lambda.
     // FIXME: Tell the caller this happened so they can recover.
     return true;
diff --git a/lib/Parse/ParseStmt.cpp b/lib/Parse/ParseStmt.cpp
index bf04253ab7..718b7b033e 100644
--- a/lib/Parse/ParseStmt.cpp
+++ b/lib/Parse/ParseStmt.cpp
@@ -250,6 +250,12 @@ Retry:
     return ParseForStatement(TrailingElseLoc);
 
   case tok::kw_goto:                // C99 6.8.6.1: goto-statement
+    // C++ AMP-specific, reject if we are in an AMP-restricted function
+    if (getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath) {
+      if (!getLangOpts().HSAExtension && IsInAMPFunction(getCurScope())) {
+        Diag(Tok, diag::err_amp_illegal_keyword_goto);
+      }
+    }
     Res = ParseGotoStatement();
     SemiError = "goto";
     break;
@@ -271,6 +277,15 @@ Retry:
     break;
 
   case tok::kw_asm: {
+
+    // C++ AMP-specific, reject if we are in an AMP-restricted function
+    if (getLangOpts().CPlusPlusAMP &&
+        !getLangOpts().HSAExtension &&
+        getLangOpts().DevicePath) {
+      if (IsInAMPFunction(getCurScope())) {
+        Diag(Tok, diag::err_amp_illegal_keyword_asm);
+      }
+    }
     ProhibitAttributes(Attrs);
     bool msAsm = false;
     Res = ParseAsmStatement(msAsm);
@@ -289,6 +304,12 @@ Retry:
     return StmtEmpty();
 
   case tok::kw_try:                 // C++ 15: try-block
+    // C++ AMP-specific, reject if we are in an AMP-restricted function
+    if (getLangOpts().CPlusPlusAMP && getLangOpts().DevicePath) {
+      if (IsInAMPFunction(getCurScope())) {
+        Diag(Tok, diag::err_amp_illegal_keyword_trycatch);
+      }
+    }
     return ParseCXXTryBlock();
 
   case tok::kw___try:
diff --git a/lib/Parse/ParseTentative.cpp b/lib/Parse/ParseTentative.cpp
index a413f9a941..222b893b1b 100644
--- a/lib/Parse/ParseTentative.cpp
+++ b/lib/Parse/ParseTentative.cpp
@@ -683,11 +683,22 @@ Parser::isCXX11AttributeSpecifier(bool Disambiguate,
   // Check to see if this is a lambda-expression.
   // FIXME: If this disambiguation is too slow, fold the tentative lambda parse
   // into the tentative attribute parse below.
+  LambdaIntroducer Intro;
+  ParsedAttributes AttrIntro(AttrFactory);
+  if (!TryParseLambdaIntroducer(Intro, AttrIntro)) {
+    // A lambda cannot end with ']]', and an attribute must.
+    bool IsAttribute = Tok.is(tok::r_square);
+
+    if (IsAttribute)
+      // Case 1: C++11 attribute.
+      return CAK_AttributeSpecifier;
+  }
+
   {
     RevertingTentativeParsingAction LambdaTPA(*this);
     LambdaIntroducer Intro;
     LambdaIntroducerTentativeParse Tentative;
-    if (ParseLambdaIntroducer(Intro, &Tentative)) {
+    if (ParseLambdaIntroducer(Intro, AttrIntro, &Tentative)) {
       // We hit a hard error after deciding this was not an attribute.
       // FIXME: Don't parse and annotate expressions when disambiguating
       // against an attribute.
@@ -2035,6 +2046,17 @@ Parser::TPResult Parser::TryParseFunctionDeclarator() {
   // ref-qualifier[opt]
   if (Tok.isOneOf(tok::amp, tok::ampamp))
     ConsumeToken();
+
+  // C++AMP
+  // 'restrict' is an identifier, not a keyword
+  if (getLangOpts().CPlusPlusAMP && Tok.is(tok::identifier) &&
+      Tok.getIdentifierInfo()->getName() == "restrict") {
+    ConsumeToken();
+    if (Tok.isNot(tok::l_paren))
+      return TPResult::Error;
+    ConsumeParen();
+    if (!SkipUntil(tok::r_paren))
+      return TPResult::Error;
+  }
 
   // exception-specification
   if (Tok.is(tok::kw_throw)) {
diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp
index 9124f15586..5a539f5c3d 100644
--- a/lib/Parse/Parser.cpp
+++ b/lib/Parse/Parser.cpp
@@ -958,7 +958,12 @@ bool Parser::isDeclarationAfterDeclarator() {
 /// Determine whether the current token, if it occurs after a
 /// declarator, indicates the start of a function definition.
 bool Parser::isStartOfFunctionDefinition(const ParsingDeclarator &Declarator) {
-  assert(Declarator.isFunctionDeclarator() && "Isn't a function declarator");
+  // Relax the rule for C++AMP: there may not necessarily be a function
+  // declarator here when this is used to parse restrict(auto).
+  if (!Actions.getLangOpts().CPlusPlusAMP) {
+    assert(Declarator.isFunctionDeclarator() && "Isn't a function declarator");
+  }
   if (Tok.is(tok::l_brace))   // int X() {}
     return true;
 
@@ -1976,7 +1981,8 @@ bool Parser::TryAnnotateTypeOrScopeTokenAfterScopeSpec(CXXScopeSpec &SS,
 /// Note that this routine emits an error if you call it with ::new or ::delete
 /// as the current tokens, so only call it in contexts where these are invalid.
 bool Parser::TryAnnotateCXXScopeToken(bool EnteringContext) {
-  assert(getLangOpts().CPlusPlus &&
+  // C++AMP
+  assert((getLangOpts().CPlusPlus || getLangOpts().CPlusPlusAMP) &&
          "Call sites of this function should be guarded by checking for C++");
   assert((Tok.is(tok::identifier) || Tok.is(tok::coloncolon) ||
           (Tok.is(tok::annot_template_id) && NextToken().is(tok::coloncolon)) ||
diff --git a/lib/Sema/CMakeLists.txt b/lib/Sema/CMakeLists.txt
index 742343583d..33371af0ee 100644
--- a/lib/Sema/CMakeLists.txt
+++ b/lib/Sema/CMakeLists.txt
@@ -62,6 +62,7 @@ add_clang_library(clangSema
   SemaTemplateInstantiateDecl.cpp
   SemaTemplateVariadic.cpp
   SemaType.cpp
+  StmtResInfer.cpp
   TypeLocBuilder.cpp
 
   DEPENDS
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp
index 203c09c571..67b697922c 100644
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -499,7 +499,7 @@ void Sema::checkAllowedCUDAInitializer(VarDecl *VD) {
 
     if (!AllowedInit) {
       Diag(VD->getLocation(), VD->hasAttr<CUDASharedAttr>()
-                                  ? diag::err_shared_var_init
+                                  ? diag::warn_shared_var_init
                                   : diag::err_dynamic_var_init)
           << Init->getSourceRange();
       VD->setInvalidDecl();
diff --git a/lib/Sema/SemaCast.cpp b/lib/Sema/SemaCast.cpp
index f184eda2f2..ffd9906012 100644
--- a/lib/Sema/SemaCast.cpp
+++ b/lib/Sema/SemaCast.cpp
@@ -1964,6 +1964,20 @@ static void checkIntToPointerCast(bool CStyle, SourceLocation Loc,
                                   Sema &Self) {
   QualType SrcType = SrcExpr->getType();
 
+  // C++AMP-specific rule checks
+  if (Self.getLangOpts().CPlusPlusAMP && Self.IsInAMPRestricted() &&
+      !Self.getLangOpts().HSAExtension && !CStyle &&
+      SrcType->isIntegralType(Self.Context) && !SrcType->isBooleanType() &&
+      !SrcType->isEnumeralType() &&
+      !SrcExpr->isIntegerConstantExpr(Self.Context) &&
+      Self.Context.getTypeSize(DestType) > Self.Context.getTypeSize(SrcType)) {
+    Self.Diag(Loc, diag::err_amp_int_to_pointer_cast) << SrcType << DestType;
+    return;
+  }
+
   // Not warning on reinterpret_cast, boolean, constant expressions, etc
   // are not explicit design choices, but consistent with GCC's behavior.
   // Feel free to modify them if you've reason/evidence for an alternative.
@@ -1973,6 +1987,15 @@ static void checkIntToPointerCast(bool CStyle, SourceLocation Loc,
       && !SrcExpr->isIntegerConstantExpr(Self.Context)
       && Self.Context.getTypeSize(DestType) >
          Self.Context.getTypeSize(SrcType)) {
+
+    // C++AMP
+    if (Self.getLangOpts().CPlusPlusAMP && Self.IsInAMPRestricted() &&
+        !Self.getLangOpts().HSAExtension) {
+      Self.Diag(Loc, diag::err_amp_int_to_pointer_cast) << SrcType << DestType;
+      return;
+    }
+
     // Separate between casts to void* and non-void* pointers.
     // Some APIs use (abuse) void* for something like a user context,
     // and often that value is an integer even if it isn't a pointer itself.
@@ -2072,6 +2095,23 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
       return TC_NotApplicable;
     }
 
+    // C++AMP
+    if (Self.getLangOpts().CPlusPlusAMP && Self.IsInAMPRestricted() &&
+        !Self.getLangOpts().HSAExtension &&
+        SrcType->isIntegralOrEnumerationType() &&
+        !Self.Context.getPointerType(SrcType).isNull()) {
+      // The expression,
+      //     int foo;
+      //     int *& r = (int*&)foo;  // Error
+      // where at this point,
+      //    SrcType is 'int', which will be converted to 'int*'
+      //    DestType is 'int*&', which will be converted to 'int**'
+      // Note that the trick is that 'int*&' is taken as 'int**'.
+      Self.Diag(OpRange.getBegin(), diag::err_amp_int_to_pointer_cast)
+          << SrcType << DestType;
+    }
+
     // This code does this transformation for the checked types.
     DestType = Self.Context.getPointerType(DestTypeTmp->getPointeeType());
     SrcType = Self.Context.getPointerType(SrcType);
@@ -2119,6 +2159,15 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
     // A valid member pointer cast.
     assert(!IsLValueCast);
     Kind = CK_ReinterpretMemberPointer;
+
+    // C++AMP
+    if(Self.getLangOpts().CPlusPlusAMP && Self.IsInAnyExplicitRestricted()) {
+      // FIXME: It is not clear whether it is necessary to reject this, since the usage of
+      // DestType is unknown in the current context, e.g. there may be only a definition without any use.
+      msg = diag::err_amp_bad_reinterpret_cast_from_pointer_to_functionptr;
+      return TC_Failed;
+    }
+
     return TC_Success;
   }
 
@@ -2199,6 +2248,15 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
 
   if (DestType->isIntegralType(Self.Context)) {
     assert(srcIsPtr && "One type must be a pointer");
+
+    // C++AMP
+    if(Self.getLangOpts().CPlusPlusAMP && !Self.getLangOpts().HSAExtension) {
+      if(Self.IsInAnyExplicitRestricted()) {
+        msg = diag::err_amp_bad_reinterpret_cast_from_pointer_to_int;
+        return TC_Failed;
+      }
+    }
+
     // C++ 5.2.10p4: A pointer can be explicitly converted to any integral
     //   type large enough to hold it; except in Microsoft mode, where the
     //   integral type size doesn't matter (except we don't allow bool).
@@ -2245,7 +2303,7 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
                              /*CheckObjCLifetime=*/CStyle))
     SuccessResult = getCastAwayConstnessCastKind(CACK, msg);
 
-  if (IsAddressSpaceConversion(SrcType, DestType)) {
+  if (!Self.getLangOpts().CPlusPlusAMP && IsAddressSpaceConversion(SrcType, DestType)) {
     Kind = CK_AddressSpaceConversion;
     assert(SrcType->isPointerType() && DestType->isPointerType());
     if (!CStyle &&
@@ -2890,6 +2948,28 @@ ExprResult Sema::BuildCStyleCastExpr(SourceLocation LPLoc,
   if (Op.SrcExpr.isInvalid())
     return ExprError();
 
+  // C++AMP [2.4.1.2.1]
+  if(getLangOpts().CPlusPlusAMP) {
+    if(IsInAMPRestricted()) {
+      if (IntegerLiteral *I = dyn_cast<IntegerLiteral>(CastExpr->IgnoreParenCasts())) {
+        // Case by case
+        //    int xxxn = (int) 0x2ffffffffLL;  // Error
+        //    int xxxn = (int) 0xffffffffLL;    // Correct
+        QualType TargetType = CastTypeInfo->getType();
+        llvm::APInt ResultVal = I->getValue();
+        if(TargetType->isSpecificBuiltinType(BuiltinType::Int) ||
+          TargetType->isSpecificBuiltinType(BuiltinType::UInt)) {
+          unsigned IntSize = Context.getTargetInfo().getIntWidth();
+          // Does the value need more bits than the target 'int' provides?
+          // If so, the literal would silently use a higher-precision
+          // integer type, so reject it.
+          if (ResultVal.getActiveBits() > IntSize) {
+            Diag(CastExpr->getExprLoc(), diag::err_amp_constant_too_big);
+          }
+        }
+      }
+    }
+  }
   // -Wcast-qual
   DiagnoseCastQual(Op.Self, Op.SrcExpr, Op.DestType);
 
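
Taken together, the SemaCast.cpp hunks above reject integer-to-pointer and pointer-to-integer reinterpretation inside amp-restricted code unless the HSA extension is active. A hedged sketch of the user-visible effect, using the restrict(amp) spelling that appears in this patch's own comments (the exact cast forms that trigger each path may differ):

```cpp
void f(int n, int *p) restrict(amp) {
  void *q = reinterpret_cast<void *>(n); // err_amp_int_to_pointer_cast:
                                         // widening a non-constant integer
  long v = reinterpret_cast<long>(p);    // err_amp_bad_reinterpret_cast_from_pointer_to_int
                                         // in an explicitly restricted context
  int a = (int)0x2ffffffffLL;            // err_amp_constant_too_big: needs 34 bits
  int b = (int)0xffffffffLL;             // accepted: fits in 32 bits
}
```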
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp
index f9f82cdeef..df5585203c 100644
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -11430,11 +11430,20 @@ CheckImplicitConversion(Sema &S, Expr *E, QualType T, SourceLocation CC,
       std::string PrettySourceValue = Value.toString(10);
       std::string PrettyTargetValue = PrettyPrintInRange(Value, TargetRange);
 
+      // C++AMP [2.4.1.2.1]
+      //   int xxxn = 0x2ffffffffLL;  // Error: needs a higher-precision integer
+      //   int xxxn = 0xffffffffLL;   // Correct: TargetRange == SourceRange == 32
+      if(S.getLangOpts().CPlusPlusAMP) {
+        // In amp-restricted code escalate to an error; otherwise suppress the warning
+        if(S.IsInAMPRestricted())
+          S.DiagRuntimeBehavior(E->getExprLoc(), E, S.PDiag(diag::err_amp_constant_too_big));
+      } else {
       S.DiagRuntimeBehavior(
           E->getExprLoc(), E,
           S.PDiag(diag::warn_impcast_integer_precision_constant)
               << PrettySourceValue << PrettyTargetValue << E->getType() << T
               << E->getSourceRange() << clang::SourceRange(CC));
+      }
       return;
     }
 
@@ -12841,6 +12850,14 @@ void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr,
       }
     }
 
+    // C++AMP
+    // Error on accessing a char string literal, since 'char' is not an
+    // amp-compatible type, e.g. the StringLiteral in "Hello"[0]
+    if (getLangOpts().CPlusPlusAMP && isa<StringLiteral>(BaseExpr) &&
+        IsInAMPRestricted()) {
+      Diag(BaseExpr->getExprLoc(), diag::err_amp_unsupported_string_literals);
+    }
+
     if (size.getBitWidth() > index.getBitWidth())
       index = index.zext(size.getBitWidth());
     else if (size.getBitWidth() < index.getBitWidth())
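
The SemaChecking.cpp changes above mirror the cast checks at the implicit-conversion level: in amp-restricted code the usual precision warning becomes a hard error, and indexing into a string literal is rejected because 'char' is not an amp-compatible type. A brief sketch reusing the examples from the comments:

```cpp
void g() restrict(amp) {
  int a = 0x2ffffffffLL; // err_amp_constant_too_big instead of
                         // warn_impcast_integer_precision_constant
  int b = 0xffffffffLL;  // fine: TargetRange == SourceRange == 32
  char c = "Hello"[0];   // err_amp_unsupported_string_literals
}
```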
diff --git a/lib/Sema/SemaDecl.cpp b/lib/Sema/SemaDecl.cpp
index 8f19edbc4f..a5a78ae543 100644
--- a/lib/Sema/SemaDecl.cpp
+++ b/lib/Sema/SemaDecl.cpp
@@ -47,6 +47,7 @@
 #include <algorithm>
 #include <cstring>
 #include <functional>
+#include <set>
 
 using namespace clang;
 using namespace sema;
@@ -4518,6 +4519,20 @@ Sema::ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS, DeclSpec &DS,
     }
   }
 
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP && TagD) {
+    // If TagD is not null, it represents a CXXRecordDecl
+    if(CXXRecordDecl* RD = dyn_cast<CXXRecordDecl>(TagD)) {
+      for (CXXRecordDecl::method_iterator MethodIt = RD->method_begin(),
+             MethodItE = RD->method_end(); MethodIt != MethodItE; ++MethodIt) {
+        if(MethodIt->isUserProvided() && MethodIt->hasAttr<CXXAMPRestrictAMPAttr>() &&
+           MethodIt->isVirtual()) {
+          Diag(MethodIt->getSourceRange().getBegin(), diag::err_amp_virtual_member_function);
+        }
+      }
+    }
+  }
+
   return TagD;
 }
 
@@ -5250,10 +5265,631 @@ static bool RebuildDeclaratorInCurrentInstantiation(Sema &S, Declarator &D,
   return false;
 }
 
+static void Track4ByteAligned(const CXXRecordDecl* RDecl, Sema& S, Declarator &D,
+                                             std::vector<FieldDecl*>&FoundVec, bool& Aligned)
+{
+  // Walk the fields of this record
+  for (CXXRecordDecl::field_iterator It = RDecl->field_begin(),
+        ItE = RDecl->field_end(); It != ItE; ++It) {
+    const FieldDecl *FD = *It;
+    if(!FD)
+      continue;
+    const RecordType *RT = S.Context.getBaseElementType(FD->getType())->getAs<RecordType>();
+    // The field is itself of record type
+    if (RT) {
+      Aligned = true;
+      break;
+    }
+
+    QualType FieldType = FD->getType();
+    // Array: check the element type instead
+    if(const ArrayType* ArrayTy = dyn_cast<ArrayType>(FieldType)) {
+      FieldType = ArrayTy->getElementType();
+      if( FieldType->getAs<RecordType>()) {
+        Aligned = true;
+        break;
+      }
+    }
+
+    if(const Type* Ty = FieldType.getTypePtrOrNull()) {
+      if(Ty->isBooleanType()) {
+        Aligned = false;
+        // Tentatively record it; reported only when the record is not aligned
+        FoundVec.push_back(const_cast<FieldDecl*>(FD));
+      } else
+        Aligned = true;
+    } else
+      Aligned = true;
+  }
+
+  if(RDecl->getDefinition()) {
+    RDecl = RDecl->getDefinition();
+    for(CXXRecordDecl::base_class_const_iterator BaseIt = RDecl->bases_begin();
+        BaseIt!=RDecl->bases_end(); BaseIt++) {
+      const CXXRecordDecl *BaseRDecl =
+          cast<CXXRecordDecl>(BaseIt->getType()->getAs<RecordType>()->getDecl());
+      Track4ByteAligned(BaseRDecl, S, D, FoundVec, Aligned);
+    }
+  }
+  return;
+}
+
+static bool IsIncompatibleScalarType(const Type* Ty, bool HSAExtension = false) {
+  assert(Ty);
+  if (HSAExtension) {
+    return false;
+  } else {
+    return Ty->isCharType() ||
+           Ty->isWideCharType() ||
+           Ty->isSpecificBuiltinType(BuiltinType::Short) ||
+           Ty->isSpecificBuiltinType(BuiltinType::LongLong) ||
+           Ty->isSpecificBuiltinType(BuiltinType::LongDouble);
+  }
+}
+
+static inline bool IsCompatibleScalarType(const Type* Ty, bool HSAExtension = false) {
+  assert(Ty);
+  if (HSAExtension) {
+    return Ty->isVoidType() ||
+           Ty->isCharType() ||
+           Ty->isBooleanType() ||
+           Ty->isWideCharType() ||
+           Ty->isSpecificBuiltinType(BuiltinType::Int) ||
+           Ty->isSpecificBuiltinType(BuiltinType::Long) ||
+           Ty->isSpecificBuiltinType(BuiltinType::Short) ||
+           Ty->isSpecificBuiltinType(BuiltinType::Float) ||
+           Ty->isSpecificBuiltinType(BuiltinType::Double) ||
+           Ty->isSpecificBuiltinType(BuiltinType::LongLong) ||
+           Ty->isSpecificBuiltinType(BuiltinType::LongDouble);
+  } else {
+    return Ty->isVoidType() ||
+           Ty->isBooleanType() ||
+           Ty->isSpecificBuiltinType(BuiltinType::Int) ||
+           Ty->isSpecificBuiltinType(BuiltinType::Long) ||
+           Ty->isSpecificBuiltinType(BuiltinType::Float) ||
+           Ty->isSpecificBuiltinType(BuiltinType::Double);
+  }
+}
+
+static inline bool hasUnsupportedTypeQualifier(QualType Ty, bool HSAExtension = false) {
+  return (HSAExtension) ? false : Ty.isVolatileQualified();
+}
+
+bool Sema::IsCXXAMPUnsupportedReferenceType(const Type* Ty,
+  bool CheckContainer, bool IsInfer) {
+  assert(Ty);
+
+  // relax all reference types as ok in HSA extension mode
+  if (getLangOpts().HSAExtension) {
+    return false;
+  }
+
+  const ReferenceType* RTy = dyn_cast<ReferenceType>(Ty);
+  if(!RTy)
+    return false;
+
+  // Recursively test its pointee type
+  QualType PointeeType = RTy->getPointeeType();
+  const Type* TargetTy = PointeeType.getTypePtrOrNull();
+  assert(TargetTy);
+
+  // reject reference to volatile type
+  if(hasUnsupportedTypeQualifier(PointeeType, getLangOpts().HSAExtension))
+    return true;
+
+  // References (lvalue and rvalue) shall refer only to amp-compatible types
+  if(IsIncompatibleScalarType(TargetTy, getLangOpts().HSAExtension))
+    return true;
+  if(IsCompatibleScalarType(TargetTy, getLangOpts().HSAExtension))
+    return false;
+
+  // reject reference to function type
+  if (TargetTy->isFunctionType())
+    return true;
+
+  // No reference type is considered amp-compatible
+  if (TargetTy->isReferenceType())
+    return true;
+
+  // Reference to std::nullptr_t is not allowed
+  // Needs to be ahead of the PointerType check
+  if(TargetTy->isNullPtrType() && PointeeType.getAsString().find("std::nullptr_t") != std::string::npos)
+    return true;
+
+  // Additionally, references to pointers are supported as long as the pointer type is itself
+  // supported
+  if(TargetTy->isPointerType() || TargetTy->isMemberPointerType())
+    return IsCXXAMPUnsupportedPointerType(TargetTy, CheckContainer, IsInfer);
+
+  if (TargetTy->isRecordType()) {
+    // Support reference to concurrency::array and/or concurrency::graphics::texture
+    if(CXXRecordDecl* RD = TargetTy->getAsCXXRecordDecl()) {
+      if((RD->getName() == "array" && PointeeType.getAsString().find("Concurrency::array") != std::string::npos) ||
+        (RD->getName() == "texture" && PointeeType.getAsString().find("graphics::texture") != std::string::npos) ||
+        RD->getQualifiedNameAsString().find("std::")!=std::string::npos)
+        return false;
+      else
+        return IsIncompatibleType(TargetTy, CheckContainer, IsInfer);
+    }
+  }
+
+  // TODO: References are only supported as local variables
+  // and/or function parameters
+  // and/or function return types
+  return false;
+}
+
+// Check PointerType & MemberPointerType
+bool Sema::IsCXXAMPUnsupportedPointerType(const Type* Ty,
+  bool CheckContainer, bool IsInfer) {
+  assert(Ty);
+
+  // relax all pointer types as ok in HSA extension mode
+  if (getLangOpts().HSAExtension) {
+    return false;
+  }
+
+  // reject incompatible function pointer types
+  if (Ty->isFunctionPointerType() || Ty->isMemberFunctionPointerType())
+    return true;
+  // Pointers to members (C++11 8.3.3) shall only refer to non-static data members.
+  if(Ty->isMemberPointerType()) {
+    // FIXME: cannot tell non-static data members apart here, so both
+    // branches conservatively reject the member pointer
+    if(Ty->isMemberDataPointerType())
+      return true;
+    return true;
+  }
+
+  const PointerType* PTy = dyn_cast<PointerType>(Ty);
+  if(!PTy)
+    return false;
+
+  // Recursively test its pointee type
+  QualType PointeeType = PTy->getPointeeType();
+  const Type* TargetTy = PointeeType.getTypePtrOrNull();
+  assert(TargetTy);
+
+  // std::nullptr_t is treated as a pointer type, so a pointer to it is rejected
+  if(TargetTy->isNullPtrType() && PointeeType.getAsString().find("std::nullptr_t") != std::string::npos)
+    return true;
+
+  // reject pointer to volatile type
+  if(hasUnsupportedTypeQualifier(PointeeType, getLangOpts().HSAExtension))
+    return true;
+
+  // Pointers shall only point to amp-compatible types
+  if(IsIncompatibleScalarType(TargetTy, getLangOpts().HSAExtension))
+    return true;
+  if(IsCompatibleScalarType(TargetTy, getLangOpts().HSAExtension)) {
+    // FIXME: member data pointers are rejected even for compatible scalars
+    if(Ty->isMemberDataPointerType())
+      return true;
+    else
+      return false;
+  }
+  // reject pointer to pointer type
+  // No pointer type is considered amp-compatible
+  if (TargetTy->isPointerType() || TargetTy->isMemberPointerType())
+    return true;
+
+  // test pointer to class type
+  if (TargetTy->isRecordType()) {
+    // Pointers can point to amp-compatible types or
+    // concurrency::array or concurrency::graphics::texture
+    if(CXXRecordDecl* RD = TargetTy->getAsCXXRecordDecl()) {
+      if((RD->getName() == "array" && PointeeType.getAsString().find("Concurrency::array") != std::string::npos) ||
+        (RD->getName() == "texture" && PointeeType.getAsString().find("graphics::texture") != std::string::npos) ||
+        RD->getQualifiedNameAsString().find("std::")!=std::string::npos)
+        return false;
+      else
+        return IsIncompatibleType(TargetTy, CheckContainer, IsInfer);
+    }
+  }
+
+  // TODO: Pointers are only supported as local variables
+  // and/or function parameters
+  // and/or function return types
+  return false;
+}
+
+bool Sema::DiagnoseCXXAMPDecl(Decl* Dcl, bool CheckContainer, bool IsInfer) {
+  if(!Dcl)
+    return false;
+
+  if(const CXXRecordDecl* RDecl = dyn_cast<CXXRecordDecl>(Dcl)) {
+    // Check nested containers, e.g.
+    //   array<array<T, 2>, 2>
+    //   array<array_view<T,3>, 3>
+    if(CheckContainer) {
+      if(RDecl->getName() == "array" || RDecl->getName() == "array_view")
+        return true;
+    }
+    // bypass array and array_view class
+    if (RDecl->getName() == "array" || RDecl->getName() == "array_view" ||
+        RDecl->getName() == "extent" || RDecl->getName() == "index" ||
+        RDecl->getName() == "accelerator_view" || RDecl->getName() == "accelerator" ||
+        // FIXME: Strictly skip checking of public APIs and other underlying code
+        RDecl->getQualifiedNameAsString().find("std::")!=std::string::npos ||
+        // Allow customized impl.
+        // TODO: Need a user code scope
+        RDecl->getName() == "Serialize" || RDecl->getName().find("__gmac") != std::string::npos ||
+        RDecl->getName().find("Gmac") != std::string::npos)
+      return false;
+
+    if(RDecl->getName().empty()) {
+      // FIXME: need to consider the 'typedef' case
+    }
+
+    // Check if the record decl* is 4-byte aligned
+    if (!getLangOpts().HSAExtension) {
+      if(RDecl->hasDefinition() && RDecl->isStruct() && !RDecl->isLambda()) {
+        CXXRecordDecl* DefRDecl = RDecl->getDefinition();
+        if(const TypeDecl* TD = dyn_cast<TypeDecl>(Dcl)) {
+          const Type* Ty = TD->getTypeForDecl();
+          if(!Ty->isIncompleteType()) {
+            Type::TypeClass TC = Ty->getTypeClass();
+            if(TC == Type::Elaborated || TC == Type::InjectedClassName ||
+              (TC == Type::TemplateSpecialization && Context.getCanonicalType(Ty) == Ty)){
+            // FIXME: The following TypeClasses might cause an endless loop; just skip them for now
+            // TypeClass: Elaborated, e.g., struct obj_N identifier
+            //   template <int N>
+            //   struct obj_N {
+            //     char m;
+            //     int i;
+            //   };
+            //
+            // TypeClass: InjectedClassName, e.g., obj_N_T<N, T> identifier
+            //   template <int N, typename T>
+            //   struct obj_N_T {
+            //     T m;
+            //     int i;
+            //   };
+            } else {
+              unsigned Alignment = Context.getTypeAlignInChars(Ty).getQuantity();
+              bool isPowerOf2 = Context.getTypeSizeInChars(Ty).isPowerOfTwo();
+              if(!isPowerOf2 && (Alignment & 0x3)) {
+                if(!IsInfer)
+                  Diag(DefRDecl->getBeginLoc(), diag::err_amp_data_member_offset_not_natural_alignment);
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    if(RDecl->getDefinition()) {
+      RDecl = RDecl->getDefinition();
+      // Walk through base classes
+      for(CXXRecordDecl::base_class_const_iterator BaseIt = RDecl->bases_begin();
+           BaseIt!=RDecl->bases_end(); BaseIt++) {
+        // it shall not have virtual base classes, and virtual member functions
+        if(!getLangOpts().HSAExtension && BaseIt->isVirtual()) {
+          if(!IsInfer)
+            Diag(BaseIt->getBeginLoc(), diag::err_amp_incompatible);
+          return true;
+        }
+        if(const RecordType* RT = BaseIt->getType()->getAs<RecordType>()) {
+          const CXXRecordDecl *BaseRDecl = cast<CXXRecordDecl>(RT->getDecl());
+          if(!BaseRDecl)
+            continue;
+          return DiagnoseCXXAMPDecl(const_cast<CXXRecordDecl*>(BaseRDecl), CheckContainer, IsInfer);
+        }
+      }
+    }
+
+    // traverse each field, reject incompatible field
+    for (CXXRecordDecl::field_iterator It = RDecl->field_begin(), ItE = RDecl->field_end(); It != ItE; ++It) {
+      const FieldDecl *FD = *It;
+      QualType FieldType = FD->getType();
+      const Type* FTy = FieldType.getTypePtrOrNull();
+      // At this point, float* is not diagnosed
+      if (hasUnsupportedTypeQualifier(FieldType, getLangOpts().HSAExtension) || IsIncompatibleType(FTy, CheckContainer, IsInfer)) {
+        if(!IsInfer)
+          Diag(FD->getBeginLoc(), diag::err_amp_incompatible);
+        return true;
+      }
+
+      // no bitfield is amp-compatible
+      if (!getLangOpts().HSAExtension && FD->isBitField()) {
+        if(!IsInfer)
+          Diag(FD->getBeginLoc(), diag::err_amp_incompatible);
+        return true;
+      }
+      // no pointer type is amp-compatible
+      // no reference type is amp-compatible
+      // At this point, float* is diagnosed when we know it is a member data pointer
+      if (!getLangOpts().HSAExtension && (FTy->isPointerType() || FTy->isReferenceType())) {
+        // pointer or reference is not allowed as pointed-to
+        //    type, array element type or data member type
+        //    (except reference to concurrency::array/texture)
+        QualType PointeeType = FTy->getPointeeType();
+        const Type* TargetTy = PointeeType.getTypePtrOrNull();
+        bool is = true;
+        // Test pointer type
+        if(FTy->isPointerType())
+          is = IsIncompatibleType(TargetTy, CheckContainer, IsInfer);
+
+        // Handle special case in struct
+        // struct A {
+        //   float* m;  // not allowed
+        // }
+        if(RDecl->isStruct())
+          is = true;
+
+        // Handle special case in lambda
+        // parallel_for_each[]() {
+        //   float* m;     // is allowed
+        //   float** m1; // not allowed
+        // }
+        #if 0
+        if(RDecl->isClass() && RDecl->isLambda()) {
+         // do nothing
+        }
+        #endif
+
+        // Handle special case
+        if (FTy->isReferenceType() && TargetTy && TargetTy->isRecordType()) {
+          if(CXXRecordDecl* RD = TargetTy->getAsCXXRecordDecl()) {
+            if((RD->getName() == "array" && PointeeType.getAsString().find("Concurrency::array") != std::string::npos) ||
+               (RD->getName() == "texture" && PointeeType.getAsString().find("graphics::texture") != std::string::npos))
+              is = false;
+          }
+        }
+        if(is) {
+          if(!IsInfer)
+            Diag(FD->getBeginLoc(), diag::err_amp_incompatible);
+          return true;
+        }
+      }
+
+    // It is a typename: nothing further to check here
+
+    }
+
+    // traverse each member function, reject incompatible member function
+    for (CXXRecordDecl::method_iterator It = RDecl->method_begin(),
+      ItE = RDecl->method_end(); It != ItE; ++It) {
+      const CXXMethodDecl *MD = *It;
+      if (MD->isVirtual()) {
+        if(!IsInfer)
+          Diag(MD->getBeginLoc(), diag::err_amp_incompatible);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+namespace {
+  std::set<const Type*> checkedType;
+}
+
+bool Sema::IsIncompatibleType(const Type* Ty, bool CheckContainer, bool IsInfer) {
+  assert(Ty);
+
+  // check if the type has already been checked
+  if (checkedType.find(Ty) == checkedType.end()) {
+    // if not, place it into the set
+    checkedType.insert(Ty);
+  } else {
+    // if the type has already been checked, simply return false here
+    // if the type were really incompatible, it would have been found in the previous invocation
+    return false;
+  }
+
+  // reject incompatible scalar types
+  if(IsIncompatibleScalarType(Ty, getLangOpts().HSAExtension))
+    return true;
+
+  if(IsCompatibleScalarType(Ty, getLangOpts().HSAExtension))
+    return false;
+
+  // Check EnumeralType
+  if(const EnumType* ET = dyn_cast<EnumType>(Ty)) {
+    // Enumeration types shall have underlying types consisting of
+    //       int, unsigned int, long, or unsigned long.
+    EnumDecl * EDcl = ET->getDecl();
+    assert(EDcl);
+    const Type* ETy = EDcl->getIntegerType().getTypePtrOrNull();
+    return !(ETy->isSpecificBuiltinType(BuiltinType::Int) ||
+      ETy->isSpecificBuiltinType(BuiltinType::UInt) ||
+      ETy->isSpecificBuiltinType(BuiltinType::Long) ||
+      ETy->isSpecificBuiltinType(BuiltinType::ULong));
+  }
+
+  // Check reference type
+  if (Ty->isReferenceType())
+    return IsCXXAMPUnsupportedReferenceType(Ty, CheckContainer, IsInfer);
+
+  // Check pointer type
+  if (Ty->isPointerType() || Ty->isMemberPointerType())
+    return IsCXXAMPUnsupportedPointerType(Ty, CheckContainer, IsInfer);
+
+  // reject incompatible array types
+  if (Ty->isArrayType()) {
+    if (const ArrayType* ATy = dyn_cast<ArrayType>(Ty)) {
+      if (const Type* ETy = ATy->getElementType().getTypePtrOrNull()) {
+        // reject array of pointer
+        if (!getLangOpts().HSAExtension && ETy->isPointerType()) {
+          return true;
+        }
+
+        // reject array of incompatible scalar type
+        if (!getLangOpts().HSAExtension && (IsIncompatibleScalarType(ETy) || ETy->isBooleanType())) { // an array of bool is not 4-byte aligned
+          return true;
+        }
+
+        // test array of class type
+        if (ETy->isClassType()) {
+          return IsIncompatibleType(ETy, CheckContainer, IsInfer);
+        }
+      }
+    }
+  }
+
+  // Check if it is a TemplateSpecializationType
+  if(const TemplateSpecializationType* TST = Ty->getAs<TemplateSpecializationType>()) {
+    // FIXME: should consider alias templates
+    // Get its underlying template decl
+    if(ClassTemplateDecl* CTDecl = dyn_cast_or_null<ClassTemplateDecl>(
+        TST->getTemplateName().getAsTemplateDecl())) {
+      if(CTDecl->getTemplatedDecl())
+        return DiagnoseCXXAMPDecl(CTDecl->getTemplatedDecl(), CheckContainer, IsInfer);
+    }
+  }
+
+  // reject incompatible class types
+  if (Ty->isRecordType()) {
+    return DiagnoseCXXAMPDecl(Ty->getAsCXXRecordDecl(), CheckContainer, IsInfer);
+  }
+
+  return false;
+}
+
+bool Sema::IsCXXAMPTileStatic(Declarator &D) {
+  if (D.getDeclSpec().hasAttributes()) {
+    return D.getDeclSpec().getAttributes().hasAttribute(ParsedAttr::AT_HCCTileStatic);
+  }
+  return false;
+}
+
+void Sema::DiagnosticCXXAMPTileStatic(Declarator &D, Decl *Dcl) {
+  if(!IsInAMPRestricted())
+    Diag(D.getIdentifierLoc(), diag::err_amp_tile_static_unsupported_usage);
+
+  if(!Dcl)
+    return;
+
+  if (getLangOpts().HSAExtension)
+    return;
+
+  clang::Decl::Kind DK = Dcl->getKind();
+  if(DK == clang::Decl::Var) {
+    if (const ValueDecl *VD = dyn_cast<ValueDecl>(Dcl)) {
+      const Type* Ty = VD->getType().getTypePtrOrNull();
+      if(Ty && (Ty->isPointerType() || Ty->isReferenceType()))
+        Diag(D.getIdentifierLoc(), diag::err_amp_tile_static_pointer_or_reference);
+      // std::nullptr_t is not a base type
+      if (!VD->getType().getBaseTypeIdentifier()) {
+        QualType Child = VD->getType().IgnoreParens();
+        if(Child.getAsString().find("std::nullptr_t")!=std::string::npos)
+          Diag(D.getIdentifierLoc(), diag::err_amp_using_nullptr_in_tile_static)
+          << Child.getAsString();
+      }
+    }
+  }
+}
+
+
 Decl *Sema::ActOnDeclarator(Scope *S, Declarator &D) {
   D.setFunctionDefinitionKind(FDK_Declaration);
   Decl *Dcl = HandleDeclarator(S, D, MultiTemplateParamsArg());
 
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP) {
+    // handle incompatible types for variables
+    if(Dcl && IsInAMPRestricted()) {
+      clang::Decl::Kind DK = Dcl->getKind();
+
+      if(DK == clang::Decl::Var) {
+        if (const ValueDecl *VD = dyn_cast<ValueDecl>(Dcl)) {
+          QualType QTy = VD->getType();
+          const Type* Ty = QTy.getTypePtrOrNull();
+
+          // reject incompatible types
+          // If the type is without name, e.g.
+          //   typedef struct {
+          //   }
+          //  the IdentiferInfo is null. So use QualType itself to skip
+          if(QTy.getAsString().find("ecl_accelerator_info")!=std::string::npos ||
+            QTy.getAsString().find("std::")!=std::string::npos) {
+            // Skip own implementation
+          } else {
+            if (hasUnsupportedTypeQualifier(QTy, getLangOpts().HSAExtension) || IsIncompatibleType(Ty))
+              Diag(D.getDeclSpec().getBeginLoc(), diag::err_amp_type_unsupported)
+                << QTy.getAsString();
+          }
+        }
+      }
+    }
+
+    // Diagnose tile_static
+    if(IsCXXAMPTileStatic(D)) {
+      DiagnosticCXXAMPTileStatic(D, Dcl);
+    } else {
+      // Not tile_static
+     if(Dcl && IsInAMPRestricted()) {
+      clang::Decl::Kind DK = Dcl->getKind();
+
+      // base class, data member or array element must be at least 4-byte aligned
+      //
+      //     parallel_for_each(arr.get_extent(), [&](index<1> idx) restrict(amp) {
+      //             struct A_base
+      //            {
+      //               bool m1;             // Only a bool member, so not 4-byte aligned
+      //            };
+      //            class A : A_base
+      //            {
+      //            };
+      //            A local_array[10];  // is not allowed
+      //      });
+      //
+
+
+      // FIXME: the following cases should also be handled
+      //(1)          class A3    // supported for amp since m2 now has 32-bit alignment
+      //              {
+      //                 bool m1;
+      //                 __declspec(align(4)) bool m2;
+      //              };
+      //
+      //(2) __declspec(align(4)) bool a1[10]; // not supported: the alignment applies to the array object, not to each element
+      //(3)     typedef __declspec(align(4)) struct S{ bool m;} ALIGNED_BOOL;
+      //          ALIGNED_BOOL a2[10]; // supported since each array element is now 32-bit aligned
+      //
+      if(DK == clang::Decl::Var) {
+        if (const ValueDecl *VD = dyn_cast<ValueDecl>(Dcl)) {
+          QualType DestType = VD->getType();
+          if(const ArrayType* CA = dyn_cast<ArrayType>(DestType))
+            DestType = CA->getElementType();
+
+          const RecordType *DestRecordType = DestType->getAs<RecordType>();
+          // Only need struct/union/class which has ctors
+          if (DestRecordType) {
+            CXXRecordDecl *DestRecordDecl = cast<CXXRecordDecl>(DestRecordType->getDecl());
+            assert(DestRecordDecl && "Should have constructor initialization!");
+
+            // FIXME: There is a clang bug in ClassTemplateSpecializationDecl that
+            // prevents iterating over its base classes
+            if(!getLangOpts().HSAExtension &&
+               !dyn_cast<ClassTemplateSpecializationDecl>(DestRecordDecl)) {
+              // Empty class type of array element
+              if(DestRecordDecl && DestRecordDecl->isEmpty() && dyn_cast<ArrayType>(VD->getType()))
+                Diag(Dcl->getLocation(), diag::err_amp_need_4_byte_aligned);
+
+              // Recursively walk up base class for amp_need_4_byte_aligned
+              std::vector<FieldDecl*> FoundVec;
+              bool Aligned = true;
+              Track4ByteAligned(DestRecordDecl, *this, D, FoundVec, Aligned);
+              if(!Aligned) {
+                for (unsigned i=0; i<FoundVec.size(); i++)
+                  if(FoundVec[i])
+                    Diag(FoundVec[i]->getInnerLocStart(), diag::err_amp_need_4_byte_aligned);
+
+                // The problematic VarDecl
+                Diag(Dcl->getLocation(), diag::err_amp_incompatible);
+              }
+            }
+          }
+        }
+      }
+     }
+    }
+  }
+
   if (OriginalLexicalContext && OriginalLexicalContext->isObjCContainer() &&
       Dcl && Dcl->getDeclContext()->isFileContext())
     Dcl->setTopLevelDeclInObjCContainer();
@@ -6815,6 +7451,13 @@ NamedDecl *Sema::ActOnVariableDeclarator(
     }
   }
 
+  if (getLangOpts().CPlusPlusAMP) {
+    if (SC == SC_None && S->getFnParent() != nullptr &&
+        (NewVD->hasAttr<HCCTileStaticAttr>())) {
+      NewVD->setStorageClass(SC_Static);
+    }
+  }
+
   // Ensure that dllimport globals without explicit storage class are treated as
   // extern. The storage class is set above using parsed attributes. Now we can
   // check the VarDecl itself.
@@ -7831,6 +8474,13 @@ static NamedDecl *DiagnoseInvalidRedeclaration(
                                   : Sema::LookupOrdinaryName,
                     Sema::ForVisibleRedeclaration);
 
+  // C++ AMP-specific
+  //
+  // relax the rule to allow out-of-line definitions of CPU-restricted member functions or operators
+  if (SemaRef.getLangOpts().CPlusPlusAMP && NewFD->hasAttr<CXXAMPRestrictCPUAttr>()) {
+    return nullptr;
+  }
+
   NewFD->setInvalidDecl();
   if (IsLocalFriend)
     SemaRef.LookupName(Prev, S);
@@ -8056,6 +8706,16 @@ static FunctionDecl* CreateNewFunctionDecl(Sema &SemaRef, Declarator &D,
                                     NameInfo, R, TInfo, isInline,
                                     /*isImplicitlyDeclared=*/false);
 
+      // C++AMP-specific
+      if (SemaRef.getLangOpts().CPlusPlusAMP) {
+        bool isAMP = Record->hasAttr<CXXAMPRestrictAMPAttr>();
+        bool isCPU = Record->hasAttr<CXXAMPRestrictCPUAttr>();
+        if (isAMP ^ isCPU) {
+          SemaRef.Diag(D.getIdentifierLoc(), diag::err_amp_destructor_overloading);
+          return nullptr;
+        }
+      }
+
       // If the destructor needs an implicit exception specification, set it
       // now. FIXME: It'd be nice to be able to create the right type to start
       // with, but the type needs to reference the destructor declaration.
@@ -8796,7 +9456,43 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
 
         if (Param->isInvalidDecl())
           NewFD->setInvalidDecl();
-      }
+
+        // C++AMP
+        if(getLangOpts().CPlusPlusAMP) {
+          QualType PType = Param->getOriginalType();
+          if(dyn_cast<ReferenceType>(PType)) {
+            if(PType->getPointeeType()->isFunctionProtoType()) {
+              //       int amp_function(int x, int y) __GPU_ONLY
+              //       {
+              //            return x + y;
+              //       }
+              //       int test(int (&p)(int, int) __GPU_ONLY)
+              //       {
+              //            // can't call an amp function through a pointer
+              //            return 1;
+              //       }
+              if(Param->hasAttr<CXXAMPRestrictAMPAttr>())
+                Diag(Param->getLocation(), diag::err_amp_bad_reinterpret_cast_from_pointer_to_functionptr);
+            }
+          }
+          if(PType->isFunctionPointerType()) {
+            if(PType->getPointeeType()->isFunctionProtoType()) {
+              // int amp_function(int x, int y) __GPU_ONLY {
+              //   return x + y;
+              // }
+              // int amp_function(int x, int y) __CPU_ONLY {
+              //    return 2 * x + y;
+              // }
+              // int test(int (*p)(int, int) __GPU) { // Error: overloaded param
+              //    return p(3, 4); // Error in the call
+              // }
+              if(Param->hasAttr<CXXAMPRestrictAMPAttr>())
+                // FIXME: is it overloaded?
+                Diag(Param->getLocation(), diag::err_amp_bad_reinterpret_cast_from_pointer_to_functionptr);
+            }
+          }
+        }
+      }
     }
 
     if (!getLangOpts().CPlusPlus) {
@@ -8916,6 +9612,32 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
     }
   }
 
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP && (NewFD->getType().getAddressSpace() == LangAS::hcc_tilestatic)) {
+    Diag(D.getIdentifierLoc(), diag::err_amp_tile_static_on_function_return_result);
+  }
+  if (getLangOpts().CPlusPlusAMP && NewFD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+    DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo();
+    if(!getLangOpts().HSAExtension && FTI.getEllipsisLoc().isValid()) {
+      Diag(FTI.getEllipsisLoc(), diag::err_amp_ellipsis_param_on_function_declarator);
+    }
+    // C++AMP
+    // Check the function params if the function is amp-restricted
+    {
+      for(unsigned i = 0; i < Params.size(); i++) {
+        QualType PType = Params[i]->getOriginalType();
+        if(dyn_cast<ReferenceType>(PType)) {
+          if(const RecordType *DestRecordType = PType->getPointeeType()->getAs<RecordType>()) {
+            // Only need struct/union/class which has ctors
+            if(cast<CXXRecordDecl>(DestRecordType->getDecl())) {
+              IsIncompatibleType(PType->getPointeeType().getTypePtrOrNull());
+            }
+          }
+        }
+      }
+    }
+  }
+
   if (!getLangOpts().CPlusPlus) {
     // Perform semantic checking on the function declaration.
     if (!NewFD->isInvalidDecl() && NewFD->isMain())
@@ -10131,10 +10853,53 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
     }
   }
 
+  // C++AMP
+  // Apply this routine only when we have a function definition.
+  // FIXME: This should be applied after all C++/C++11 semantic checks,
+  //        and the following assumptions should be revisited if they turn out incorrect:
+  // (1) No AMP-specific restrictions in the signature
+  // (2) Do not merge or overload if the AMP restriction is not intersected correctly
+  // (3) No intersections of all overloaded functions, only Ovl_Match
+  if (getLangOpts().CPlusPlusAMP && !Previous.empty() &&
+    AllowOverloadingOfFunction(Previous, Context, NewFD) &&
+    NewFD->hasBody() &&
+    CheckOverload(S, NewFD, Previous, OldDecl,
+                            /*NewIsUsingDecl*/ false) == Ovl_Match) {
+    // Using different modifiers on a declaration and its definition:
+    //               struct S
+    //               {
+    //                  int test() __GPU;
+    //               };
+    //
+    //               int S::test() __GPU_ONLY  // Error: the 'matched' decl (not the
+    //                                         // 'overload') has an incorrect AMP attr intersection
+    //              {
+    //                  return 1;
+    //              }
+    // Ugly code
+    bool CurCPU = NewFD->hasAttr<CXXAMPRestrictCPUAttr>();
+    bool CurAMP = NewFD->hasAttr<CXXAMPRestrictAMPAttr>();
+    bool PreviousCPU = false;
+    bool PreviousAMP = false;
+    for(LookupResult::iterator PreDecl = Previous.begin(); PreDecl!=Previous.end(); PreDecl++) {
+      PreviousCPU |= (*PreDecl)->hasAttr<CXXAMPRestrictCPUAttr>();
+      PreviousAMP |= (*PreDecl)->hasAttr<CXXAMPRestrictAMPAttr>();
+    }
+    // Case by Case
+    if(PreviousCPU && PreviousAMP && CurAMP && !CurCPU) {
+      // Previous __GPU, current __GPU_ONLY
+      Diag(NewFD->getLocation(), diag::err_amp_function_redefinition)
+        << NewFD->getNameInfo().getAsString();
+      for(LookupResult::iterator PreDecl = Previous.begin(); PreDecl!=Previous.end(); PreDecl++)
+        Diag((*PreDecl)->getLocation(), diag::note_member_def_close_match);
+    }
+  }
+
   if (CheckMultiVersionFunction(*this, NewFD, Redeclaration, OldDecl,
                                 MergeTypeWithPrevious, Previous))
     return Redeclaration;
 
+
   // C++11 [dcl.constexpr]p8:
   //   A constexpr specifier for a non-static member function that is not
   //   a constructor declares that member function to be const.
@@ -10399,6 +11164,22 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
     if (!Redeclaration && LangOpts.CUDA)
       checkCUDATargetOverload(NewFD, Previous);
   }
+
+  // C++AMP
+  // C linkage functions can't have multiple restriction specifiers
+  //   extern "C" void foo() restrict(amp, cpu) {}  // Error
+  // However, for HIP __global__ functions we allow it orthogonally to the
+  // linkage specifier, since our mechanism for implementing restrictions is not
+  // in any way impacting mangling, unlike what the original C++AMP had.
+  // TODO: this is too verbose, should be split up into separate functions.
+  if (getLangOpts().CPlusPlusAMP && NewFD->isExternC() &&
+    NewFD->hasAttr<CXXAMPRestrictCPUAttr>() &&
+    NewFD->hasAttr<CXXAMPRestrictAMPAttr>() &&
+    (!NewFD->hasAttr<AnnotateAttr>() ||
+     NewFD->getAttr<AnnotateAttr>()->getAnnotation() != "__HIP_global_function__"))
+    Diag(NewFD->getLocation(), diag::err_amp_c_linkage_function_has_multiple_restrictions)
+          << NewFD->getDeclName();
+
   return Redeclaration;
 }
 
@@ -12152,6 +12933,47 @@ Sema::ActOnCXXForRangeIdentifier(Scope *S, SourceLocation IdentLoc,
                        AttrEnd.isValid() ? AttrEnd : IdentLoc);
 }
 
+/// True if the expression is valid for the initialization expression of a
+/// C++AMP global array.
+static bool checkCXXAMPGlobalArrayInitExpr(Stmt *E) {
+  if (auto *CE = dyn_cast<CXXConstructExpr>(E)) {
+    auto *CD = CE->getConstructor();
+    if (CD->hasAttr<CXXAMPRestrictAMPAttr>() &&
+        !CD->hasAttr<CXXAMPRestrictCPUAttr>())
+      return false;
+    for (auto I : CE->arguments()) {
+      if (!checkCXXAMPGlobalArrayInitExpr(I))
+        return false;
+    }
+    for (auto I : CD->inits()) {
+      if (!checkCXXAMPGlobalArrayInitExpr(I->getInit()))
+        return false;
+    }
+    return true;
+  } else if (auto *EWC = dyn_cast<ExprWithCleanups>(E)) {
+    for (auto I : EWC->children()) {
+      if (!checkCXXAMPGlobalArrayInitExpr(I))
+        return false;
+    }
+    return true;
+  } else if (auto *IL = dyn_cast<InitListExpr>(E)) {
+    for (auto I : IL->children()) {
+      if (!checkCXXAMPGlobalArrayInitExpr(I))
+        return false;
+    }
+    return true;
+  } else if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E)) {
+    return checkCXXAMPGlobalArrayInitExpr(MTE->GetTemporaryExpr());
+  } else if (auto *CBE = dyn_cast<CXXBindTemporaryExpr>(E)) {
+    return checkCXXAMPGlobalArrayInitExpr(CBE->getSubExpr());
+  } else if (auto *IC = dyn_cast<ImplicitCastExpr>(E)) {
+    return checkCXXAMPGlobalArrayInitExpr(IC->getSubExpr());
+  } else if (auto *FC = dyn_cast<CXXFunctionalCastExpr>(E)) {
+    return checkCXXAMPGlobalArrayInitExpr(FC->getSubExpr());
+  }
+  return true;
+}
+
 void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
   if (var->isInvalidDecl()) return;
 
@@ -12298,7 +13120,31 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
   bool IsGlobal = GlobalStorage && !var->isStaticLocal();
   QualType baseType = Context.getBaseElementType(type);
 
-  if (Init && !Init->isValueDependent()) {
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP && IsGlobal && isa<ArrayType>(type)) {
+    const ArrayType* AT = cast<ArrayType>(type);
+    const RecordType *RT = AT->getElementType()->getAs<RecordType>();
+    if(RT) {
+      CXXRecordDecl *RDecl = cast<CXXRecordDecl>(RT->getDecl());
+      // Case by case
+      //     struct A
+      //     {
+      //        int var;
+      //        A() restrict(amp) { }
+      //     };
+      //
+      //     A arr[5];   // Error: Array initialization in global scope, CPU restricted by default
+      //
+      if(RDecl && RDecl->hasUserDeclaredConstructor()) {
+        if (!checkCXXAMPGlobalArrayInitExpr(var->getInit()))
+          Diag(var->getLocation(), diag::err_amp_call_from_cpu_to_amp);
+      }
+    }
+  }
+
+  if (!var->getDeclContext()->isDependentContext() &&
+      Init && !Init->isValueDependent()) {
+
     if (var->isConstexpr()) {
       SmallVector<PartialDiagnosticAt, 8> Notes;
       if (!var->evaluateValue(Notes) || !var->isInitICE()) {
@@ -13084,6 +13930,24 @@ Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Declarator &D,
 
   D.setFunctionDefinitionKind(FDK_Definition);
   Decl *DP = HandleDeclarator(ParentScope, D, TemplateParameterLists);
+
+  if (LangOpts.CPlusPlusAMP && SkipBody && DP) {
+    const bool IsHC = DP->hasAttr<CXXAMPRestrictAMPAttr>();
+    const bool IsCPU = DP->hasAttr<CXXAMPRestrictCPUAttr>();
+
+    SkipBody->ShouldSkip = LangOpts.DevicePath ? (!IsHC && IsCPU) :
+                                                 (IsHC && !IsCPU);
+
+    if (SkipBody->ShouldSkip) {
+      auto Empty = new (getASTContext()) NullStmt{DP->getLocation()};
+      cast<FunctionDecl>(DP)->setBody(Empty);
+      cast<FunctionDecl>(DP)->addAttr(
+        CXX11NoReturnAttr::CreateImplicit(getASTContext()));
+
+      return DP;
+    }
+  }
+
   return ActOnStartOfFunctionDef(FnBodyScope, DP, SkipBody);
 }
 
@@ -13235,12 +14099,23 @@ Sema::CheckForFunctionRedefinition(FunctionDecl *FD,
     return;
   }
 
-  if (getLangOpts().GNUMode && Definition->isInlineSpecified() &&
-      Definition->getStorageClass() == SC_Extern)
+  // C++AMP
+  // FIXME: Remove && Definition ?
+  if (getLangOpts().CPlusPlusAMP && FD->isExternC() && Definition) {
+    // Mangling is removed, so the linker would see two definitions of the same function
+    //   extern "C" void foo() restrict(amp) { }
+    //   extern "C" void foo() restrict(cpu) { } // Error
+    if (FD->hasAttr<CXXAMPRestrictAMPAttr>()!=Definition->hasAttr<CXXAMPRestrictAMPAttr>() ||
+        FD->hasAttr<CXXAMPRestrictCPUAttr>()!=Definition->hasAttr<CXXAMPRestrictCPUAttr>())
+      Diag(Definition->getLocation(), diag::err_amp_has_second_c_linkage_overloaded_function)
+        << FD->getDeclName();
+  } else if (getLangOpts().GNUMode && Definition->isInlineSpecified() &&
+             Definition->getStorageClass() == SC_Extern) {
     Diag(FD->getLocation(), diag::err_redefinition_extern_inline)
-        << FD->getDeclName() << getLangOpts().CPlusPlus;
-  else
+         << FD->getDeclName() << getLangOpts().CPlusPlus;
+  } else {
     Diag(FD->getLocation(), diag::err_redefinition) << FD->getDeclName();
+  }
 
   Diag(Definition->getLocation(), diag::note_previous_definition);
   FD->setInvalidDecl();
@@ -13524,6 +14399,62 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *D, Stmt *BodyArg) {
   return ActOnFinishFunctionBody(D, BodyArg, false);
 }
 
+// FIXME: Not all conditions are covered
+void TrackMemoryOperator(const Stmt *S, std::vector <Expr*>& FoundVec) {
+  // VisitDeclStmt
+  if (const DeclStmt *DS = dyn_cast<DeclStmt>(S)) {
+    for (DeclStmt::const_decl_iterator I = DS->decl_begin(),E = DS->decl_end();
+        I != E; ++I) {
+      VarDecl* VD = dyn_cast<VarDecl>(*I);
+      if (VD) {
+        if(Expr* EP = const_cast<Expr*>(VD->getAnyInitializer())) {
+          if(CXXNewExpr* NE = dyn_cast<CXXNewExpr>(EP))
+            FoundVec.push_back(NE);
+          if(CXXDeleteExpr* DE = dyn_cast<CXXDeleteExpr>(EP))
+            FoundVec.push_back(DE);
+          // CStyle
+          if(CStyleCastExpr* CStyleCE = dyn_cast<CStyleCastExpr>(EP)) {
+            if(CallExpr* CE = dyn_cast<CallExpr>(CStyleCE->getSubExpr())) {
+              if(CE->getDirectCallee() &&
+                 CE->getDirectCallee()->getNameInfo().getAsString() == "malloc")
+                FoundVec.push_back(CE);
+            }
+          }
+          if(CallExpr* CE = dyn_cast<CallExpr>(EP)) {
+            if(CE->getDirectCallee() &&
+               CE->getDirectCallee()->getNameInfo().getAsString() == "malloc")
+              FoundVec.push_back(CE);
+          }
+        }
+      }
+    }
+    return;
+  }
+
+  if (const Expr *CEP = dyn_cast<Expr>(S)) {
+    Expr* EP = const_cast<Expr*>(CEP);
+    if(CXXNewExpr* NE = dyn_cast<CXXNewExpr>(EP)) {
+      FoundVec.push_back(NE);
+    }
+    if(CXXDeleteExpr* DE = dyn_cast<CXXDeleteExpr>(EP)) {
+      FoundVec.push_back(DE);
+    }
+    if(CallExpr* CE = dyn_cast<CallExpr>(EP)) {
+      if(CE->getDirectCallee() &&
+         CE->getDirectCallee()->getNameInfo().getAsString() == "free")
+        FoundVec.push_back(CE);
+    }
+    return;
+  }
+
+  for (Stmt::const_child_iterator CI = S->child_begin(); CI != S->child_end(); ++CI) {
+    if (*CI) {
+      TrackMemoryOperator(*CI, FoundVec);
+    }
+  }
+
+}
+
 /// RAII object that pops an ExpressionEvaluationContext when exiting a function
 /// body.
 class ExitFunctionBodyRAII {
@@ -13734,6 +14665,22 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body,
       }
     }
 
+    // C++AMP
+    if(getLangOpts().CPlusPlusAMP && !getLangOpts().HSAExtension &&
+      ((getCurLambda() && (FD == getCurLambda()->CallOperator)) ||
+      FD->isGlobal())) {
+      if (FD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        std::vector<Expr*> FoundVec;
+        TrackMemoryOperator(Body, FoundVec);
+        if(FoundVec.size()) {
+          for (unsigned i = 0; i < FoundVec.size(); i++) {
+            if(FoundVec[i])
+              Diag(FoundVec[i]->getExprLoc(), diag::err_amp_memory_operation);
+          }
+        }
+      }
+    }
+
     assert((FD == getCurFunctionDecl() || getCurLambda()->CallOperator == FD) &&
            "Function parsing confused");
   } else if (ObjCMethodDecl *MD = dyn_cast_or_null<ObjCMethodDecl>(dcl)) {
@@ -13886,6 +14833,9 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body,
     DiscardCleanupsInEvaluationContext();
   }
 
+  // C++AMP: try the restriction-specifier inference logic
+  TryCXXAMPRestrictionInferring(dcl, Body);
+
   return dcl;
 }
 
@@ -16503,6 +17453,24 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl,
           }
         }
       }
+
+      // C++AMP
+      if (getLangOpts().CPlusPlusAMP && CXXRecord->hasUserDeclaredDestructor()) {
+        bool hasAMP = false;
+        bool hasCPU = false;
+        for (CXXRecordDecl::ctor_iterator CtorIt = CXXRecord->ctor_begin(),
+                                          CtorE = CXXRecord->ctor_end();
+             CtorIt != CtorE; ++CtorIt) {
+          hasAMP |= CtorIt->hasAttr<CXXAMPRestrictAMPAttr>();
+          hasCPU |= CtorIt->hasAttr<CXXAMPRestrictCPUAttr>();
+        }
+        CXXDestructorDecl *Dtor = CXXRecord->getDestructor();
+        if ((hasAMP && !Dtor->hasAttr<CXXAMPRestrictAMPAttr>()) ||
+            (hasCPU && !Dtor->hasAttr<CXXAMPRestrictCPUAttr>())) {
+          Diag(Dtor->getLocation(), diag::err_amp_dtor_rest_cover_all_ctor);
+          Record->setInvalidDecl();
+        }
+      }
     }
 
     if (!Completed)
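
Among the SemaDecl.cpp additions, the final ActOnFields hunk requires a user-declared destructor to cover the union of the restriction specifiers found on the constructors. A sketch of what is now accepted and rejected (spellings as in this patch's comments; not taken verbatim from a test):

```cpp
struct S {
  S() restrict(cpu) {}
  S() restrict(amp) {}
  ~S() restrict(cpu) {}       // err_amp_dtor_rest_cover_all_ctor:
                              // an amp ctor exists but the dtor is cpu-only
};

struct T {
  T() restrict(cpu, amp) {}
  ~T() restrict(cpu, amp) {}  // accepted: dtor covers every ctor restriction
};
```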
diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp
index ee06f8ae51..7434ce17ca 100644
--- a/lib/Sema/SemaDeclAttr.cpp
+++ b/lib/Sema/SemaDeclAttr.cpp
@@ -4369,6 +4369,12 @@ static void handleSharedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
       AL.getRange(), S.Context, AL.getAttributeSpellingListIndex()));
 }
 
+static void handleHCCTileStaticAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  // FIXME: more checks to follow
+  D->addAttr(::new (S.Context) HCCTileStaticAttr(
+      AL.getRange(), S.Context, AL.getAttributeSpellingListIndex()));
+}
+
 static void handleGlobalAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   if (checkAttrMutualExclusion<CUDADeviceAttr>(S, D, AL) ||
       checkAttrMutualExclusion<CUDAHostAttr>(S, D, AL)) {
@@ -5906,6 +5912,117 @@ static void handleInterruptAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   }
 }
 
+class AMDGPUISAVersionChecker {
+public:
+  AMDGPUISAVersionChecker(Sema &S) : S(S) {
+    GPU = S.getASTContext().getTargetInfo().getTargetOpts().CPU;
+  }
+
+  /// Check the GPU ISA version parameter of AMDGPU attributes.
+  /// The ISA version parameter must be a three-digit ISA version string
+  /// prefixed by "gfx", e.g. "gfx810"; a case insensitive device name such
+  /// as "Fiji" is understood by the parser but rejected for this parameter.
+  /// \param Attr is AMDGPU attribute containing ISA version parameter.
+  /// \param Index is the index for the ISA version parameter.
+  /// \param ISA will contain the ISA version string if returning true.
+  /// \return true if the attribute does not contain the ISA version parameter,
+  ///         or the ISA version parameter is empty, or the ISA version
+  ///         parameter and the target GPU have the same ISA version.
+  bool checkAMDGPUISAVersion(const ParsedAttr &Attr, unsigned Index,
+      StringRef &ISA) {
+    if (Attr.getNumArgs() <= Index) {
+      ISA = "";
+      return true;
+    }
+
+    if (!S.checkStringLiteralArgumentAttr(Attr, Index, ISA))
+      return false;
+
+    if (ISA.empty())
+      return true;
+    auto ISAVer = parseAMDGPUISAVersion(ISA);
+    if (ISAVer == AMDGPU_ISA_NONE || !ISA.startswith("gfx")) {
+      S.Diag(Attr.getLoc(), diag::err_attribute_amdgpu_invalid_isa_version)
+        << ISA;
+      return false;
+    }
+    return ISAVer == parseAMDGPUISAVersion(GPU);
+  }
+
+private:
+  Sema &S;
+  StringRef GPU; // target GPU specified by -target-cpu option
+
+  /// \brief The ISA version of AMD GPU.
+  enum AMDGPUISAVersion {
+    AMDGPU_ISA_NONE,
+    AMDGPU_ISA_600,
+    AMDGPU_ISA_700,
+    AMDGPU_ISA_701,
+    AMDGPU_ISA_800,
+    AMDGPU_ISA_801,
+    AMDGPU_ISA_802,
+    AMDGPU_ISA_803,
+  };
+
+  // Parse AMDGPU ISA version string.
+  // \param ISA is either a three-digit ISA version string with prefix "gfx",
+  //        e.g. "gfx810", or a case insensitive device name, e.g. "Fiji".
+  // \return AMDGPU ISA version.
+  // ToDo: The cases need to cover both ISA version parameter of register
+  // control attributes and -target-cpu option. We should phase out using GPU
+  // code names in -target-cpu option and remove them from below.
+  AMDGPUISAVersion parseAMDGPUISAVersion(StringRef ISA) {
+    return llvm::StringSwitch<AMDGPUISAVersion>(ISA.lower())
+      .Case("",          AMDGPU_ISA_600)
+      .Case("gfx600",    AMDGPU_ISA_600)
+      .Case("gfx700",    AMDGPU_ISA_700)
+      .Case("gfx701",    AMDGPU_ISA_701)
+      .Case("gfx800",    AMDGPU_ISA_800)
+      .Case("gfx801",    AMDGPU_ISA_801)
+      .Case("gfx802",    AMDGPU_ISA_802)
+      .Case("gfx803",    AMDGPU_ISA_803)
+      .Case("tahiti",    AMDGPU_ISA_600)
+      .Case("pitcairn",  AMDGPU_ISA_600)
+      .Case("verde",     AMDGPU_ISA_600)
+      .Case("oland",     AMDGPU_ISA_600)
+      .Case("hainan",    AMDGPU_ISA_600)
+      .Case("bonaire",   AMDGPU_ISA_700)
+      .Case("kabini",    AMDGPU_ISA_700)
+      .Case("kaveri",    AMDGPU_ISA_700)
+      .Case("hawaii",    AMDGPU_ISA_701)
+      .Case("mullins",   AMDGPU_ISA_700)
+      .Case("tonga",     AMDGPU_ISA_802)
+      .Case("iceland",   AMDGPU_ISA_800)
+      .Case("carrizo",   AMDGPU_ISA_801)
+      .Case("fiji",      AMDGPU_ISA_803)
+      .Case("stoney",    AMDGPU_ISA_801)
+      .Case("polaris10", AMDGPU_ISA_803)
+      .Case("polaris11", AMDGPU_ISA_803)
+      .Default(AMDGPU_ISA_NONE);
+  }
+};
+
+namespace
+{
+  inline
+  bool checkAllAreIntegral(const ParsedAttr &Attr, Sema &S) {
+    for (auto i = 0u; i != Attr.getNumArgs(); ++i) {
+      auto e = Attr.getArgAsExpr(i);
+      if (e && !e->getType()->isIntegralOrEnumerationType()) {
+        S.Diag(getAttrLoc(Attr), diag::err_attribute_argument_n_type)
+          << Attr << i << AANT_ArgumentIntegerConstant
+          << e->getSourceRange();
+
+        return false;
+      }
+    }
+
+    return true;
+  }
+}
+
+
 static bool
 checkAMDGPUFlatWorkGroupSizeArguments(Sema &S, Expr *MinExpr, Expr *MaxExpr,
                                       const AMDGPUFlatWorkGroupSizeAttr &Attr) {
@@ -5915,11 +6032,11 @@ checkAMDGPUFlatWorkGroupSizeArguments(Sema &S, Expr *MinExpr, Expr *MaxExpr,
     return false;
 
   uint32_t Min = 0;
-  if (!checkUInt32Argument(S, Attr, MinExpr, Min, 0))
+  if (MinExpr->isEvaluatable(S.Context) && !checkUInt32Argument(S, Attr, MinExpr, Min, 0))
     return true;
 
   uint32_t Max = 0;
-  if (!checkUInt32Argument(S, Attr, MaxExpr, Max, 1))
+  if (MaxExpr->isEvaluatable(S.Context) && !checkUInt32Argument(S, Attr, MaxExpr, Max, 1))
     return true;
 
   if (Min == 0 && Max != 0) {
@@ -5952,7 +6069,9 @@ void Sema::addAMDGPUFlatWorkGroupSizeAttr(SourceRange AttrRange, Decl *D,
 static void handleAMDGPUFlatWorkGroupSizeAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   Expr *MinExpr = AL.getArgAsExpr(0);
-  Expr *MaxExpr = AL.getArgAsExpr(1);
+  Expr *MaxExpr = MinExpr;
+  if (AL.getNumArgs() > 1)
+    MaxExpr = AL.getArgAsExpr(1);
 
   S.addAMDGPUFlatWorkGroupSizeAttr(AL.getRange(), D, MinExpr, MaxExpr,
                                    AL.getAttributeSpellingListIndex());
@@ -6018,27 +6137,71 @@ static void handleAMDGPUWavesPerEUAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 }
 
 static void handleAMDGPUNumSGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+
+  if (!checkAllAreIntegral(AL, S))
+    return;
+
   uint32_t NumSGPR = 0;
   Expr *NumSGPRExpr = AL.getArgAsExpr(0);
-  if (!checkUInt32Argument(S, AL, NumSGPRExpr, NumSGPR))
+  if (NumSGPRExpr->isEvaluatable(S.Context) &&
+      !checkUInt32Argument(S, AL, NumSGPRExpr, NumSGPR))
     return;
 
   D->addAttr(::new (S.Context)
-             AMDGPUNumSGPRAttr(AL.getLoc(), S.Context, NumSGPR,
+             AMDGPUNumSGPRAttr(AL.getLoc(), S.Context, NumSGPRExpr,
                                AL.getAttributeSpellingListIndex()));
 }
 
 static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+
+  if (!checkAllAreIntegral(AL, S))
+    return;
+
   uint32_t NumVGPR = 0;
   Expr *NumVGPRExpr = AL.getArgAsExpr(0);
-  if (!checkUInt32Argument(S, AL, NumVGPRExpr, NumVGPR))
+  if (NumVGPRExpr->isEvaluatable(S.Context) &&
+      !checkUInt32Argument(S, AL, NumVGPRExpr, NumVGPR))
     return;
 
   D->addAttr(::new (S.Context)
-             AMDGPUNumVGPRAttr(AL.getLoc(), S.Context, NumVGPR,
+             AMDGPUNumVGPRAttr(AL.getLoc(), S.Context, NumVGPRExpr,
                                AL.getAttributeSpellingListIndex()));
 }
 
+static void handleAMDGPUMaxWorkGroupDimAttr(Sema &S, Decl *D,
+                                            const ParsedAttr &Attr) {
+  if (!checkAllAreIntegral(Attr, S))
+    return;
+  if (!checkAttributeAtLeastNumArgs(S, Attr, 3))
+    return;
+
+  uint32_t X = 0;
+  Expr *XExpr = Attr.getArgAsExpr(0);
+  if (XExpr->isEvaluatable(S.Context) &&
+      !checkUInt32Argument(S, Attr, XExpr, X))
+    return;
+
+  uint32_t Y = 0;
+  Expr *YExpr = Attr.getArgAsExpr(1);
+  if (YExpr->isEvaluatable(S.Context) &&
+      !checkUInt32Argument(S, Attr, YExpr, Y))
+    return;
+
+  uint32_t Z = 0;
+  Expr *ZExpr = Attr.getArgAsExpr(2);
+  if (ZExpr->isEvaluatable(S.Context) &&
+      !checkUInt32Argument(S, Attr, ZExpr, Z))
+    return;
+
+  AMDGPUISAVersionChecker VC(S);
+  StringRef ISA;
+  if (VC.checkAMDGPUISAVersion(Attr, 3, ISA))
+    D->addAttr(::new (S.Context)
+         AMDGPUMaxWorkGroupDimAttr(Attr.getLoc(), S.Context, XExpr, YExpr,
+                                   ZExpr, ISA,
+                                   Attr.getAttributeSpellingListIndex()));
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -6593,6 +6756,72 @@ static void handleMSAllocatorAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   handleSimpleAttribute<MSAllocatorAttr>(S, D, AL);
 }
 
+//===----------------------------------------------------------------------===//
+// C++ AMP specific attribute handlers.
+// FIXME: Merge these handlers with handleSimpleAttribute
+//===----------------------------------------------------------------------===//
+
+static void handleAutoAttr(Sema &S, Decl *D, const ParsedAttr &Attr) {
+  if (S.LangOpts.CUDA) {
+    // No support for now
+  } else if (S.LangOpts.CPlusPlusAMP) {
+    D->addAttr(::new (S.Context) AlwaysInlineAttr(Attr.getRange(),
+          S.Context, Attr.getAttributeSpellingListIndex()));
+    D->addAttr(::new (S.Context) CXXAMPRestrictAUTOAttr(Attr.getRange(),
+          S.Context, Attr.getAttributeSpellingListIndex()));
+  } else {
+    S.Diag(Attr.getLoc(), diag::warn_attribute_ignored) << "auto";
+  }
+}
+
+static void handleDeviceAttr(Sema &S, Decl *D, const ParsedAttr &Attr) {
+  if (S.LangOpts.CUDA) {
+    // check the attribute arguments.
+    if (Attr.getNumArgs() != 0) {
+      S.Diag(Attr.getLoc(), diag::err_attribute_wrong_number_arguments) << 0;
+      return;
+    }
+
+    if (!isa<FunctionDecl>(D) && !isa<VarDecl>(D)) {
+      S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type)
+        << Attr.getName() << ExpectedVariableOrFunction;
+      return;
+    }
+
+    D->addAttr(::new (S.Context)
+               CUDADeviceAttr(Attr.getRange(), S.Context,
+                              Attr.getAttributeSpellingListIndex()));
+  } else if (S.LangOpts.CPlusPlusAMP) {
+    D->addAttr(::new (S.Context) CXXAMPRestrictAMPAttr(Attr.getRange(),
+          S.Context, Attr.getAttributeSpellingListIndex()));
+  } else {
+    S.Diag(Attr.getLoc(), diag::warn_attribute_ignored) << "device";
+  }
+}
+
+static void handleHostAttr(Sema &S, Decl *D, const ParsedAttr &Attr) {
+  if (S.LangOpts.CUDA) {
+    // check the attribute arguments.
+    if (!checkAttributeNumArgs(S, Attr, 0))
+      return;
+
+    if (!isa<FunctionDecl>(D)) {
+      S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type)
+        << Attr.getName() << ExpectedFunction;
+      return;
+    }
+
+    D->addAttr(::new (S.Context)
+               CUDAHostAttr(Attr.getRange(), S.Context,
+                            Attr.getAttributeSpellingListIndex()));
+  } else if (S.LangOpts.CPlusPlusAMP) {
+    D->addAttr(::new (S.Context) CXXAMPRestrictCPUAttr(Attr.getRange(),
+          S.Context, Attr.getAttributeSpellingListIndex()));
+  } else {
+    S.Diag(Attr.getLoc(), diag::warn_attribute_ignored) << "host";
+  }
+}
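In C++AMP mode these handlers lower the restriction specifiers to the `CXXAMPRestrict*` attributes, with `auto` additionally forcing `always_inline`; under CUDA the same spellings fall back to the CUDA attributes. An illustrative sketch (the `[[hc]]`/`[[cpu]]` spellings are inferred from `AT_HC_HC` and `AT_HC_CPU`):

```cpp
int on_gpu(int x) [[hc]];  // handleDeviceAttr -> CXXAMPRestrictAMPAttr
int on_cpu(int x) [[cpu]]; // handleHostAttr   -> CXXAMPRestrictCPUAttr
```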
+
 //===----------------------------------------------------------------------===//
 // Top Level Sema Entry Points
 //===----------------------------------------------------------------------===//
@@ -6608,7 +6837,12 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
 
   // Ignore C++11 attributes on declarator chunks: they appertain to the type
   // instead.
-  if (AL.isCXX11Attribute() && !IncludeCXX11Attributes)
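+  // HC and AMDGPU attributes are exempted below: even when written as C++11
+  // attributes on a declarator chunk they must still appertain to the
+  // declaration (e.g. [[hc]] after a lambda's parameter list).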
+  if (AL.isCXX11Attribute() && !IncludeCXX11Attributes &&
+      AL.getKind() != ParsedAttr::AT_HC_HC &&
+      AL.getKind() != ParsedAttr::AT_HC_CPU &&
+      AL.getKind() != ParsedAttr::AT_AMDGPUWavesPerEU &&
+      AL.getKind() != ParsedAttr::AT_AMDGPUFlatWorkGroupSize &&
+      AL.getKind() != ParsedAttr::AT_AMDGPUMaxWorkGroupDim)
     return;
 
   // Unknown attributes are automatically warned on. Target-specific attributes
@@ -6680,6 +6914,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUMaxWorkGroupDim:
+    handleAMDGPUMaxWorkGroupDimAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
@@ -6798,9 +7035,19 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
   case ParsedAttr::AT_CUDAGlobal:
     handleGlobalAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_HC_HC:
+  case ParsedAttr::AT_CXXAMPRestrictAMP:
+    handleDeviceAttr(S, D, AL);
+    break;
+  case ParsedAttr::AT_HC_CPU:
+  case ParsedAttr::AT_CXXAMPRestrictCPU:
+    handleHostAttr(S, D, AL);
+    break;
+  case ParsedAttr::AT_CXXAMPRestrictAUTO:
+    handleAutoAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_CUDADevice:
-    handleSimpleAttributeWithExclusions<CUDADeviceAttr, CUDAGlobalAttr>(S, D,
-                                                                        AL);
+    handleSimpleAttributeWithExclusions<CUDADeviceAttr, CUDAGlobalAttr>(S, D, AL);
     break;
   case ParsedAttr::AT_CUDAHost:
     handleSimpleAttributeWithExclusions<CUDAHostAttr, CUDAGlobalAttr>(S, D, AL);
@@ -6885,6 +7132,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
   case ParsedAttr::AT_CUDAShared:
     handleSharedAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_HCCTileStatic:
+    handleHCCTileStaticAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_VecReturn:
     handleVecReturnAttr(S, D, AL);
     break;
@@ -7386,7 +7636,7 @@ void Sema::ProcessDeclAttributeList(Scope *S, Decl *D,
     } else if (const auto *A = D->getAttr<OpenCLIntelReqdSubGroupSizeAttr>()) {
       Diag(D->getLocation(), diag::err_opencl_kernel_attr) << A;
       D->setInvalidDecl();
-    } else if (!D->hasAttr<CUDAGlobalAttr>()) {
+    } else if (!D->hasAttr<CUDAGlobalAttr>() && !D->hasAttr<CXXAMPRestrictAMPAttr>()) {
       if (const auto *A = D->getAttr<AMDGPUFlatWorkGroupSizeAttr>()) {
         Diag(D->getLocation(), diag::err_attribute_wrong_decl_type)
             << A << ExpectedKernelFunction;
diff --git a/lib/Sema/SemaDeclCXX.cpp b/lib/Sema/SemaDeclCXX.cpp
index 2f9e4f961f..bb45d0a28f 100644
--- a/lib/Sema/SemaDeclCXX.cpp
+++ b/lib/Sema/SemaDeclCXX.cpp
@@ -3896,6 +3896,50 @@ Sema::BuildMemInitializer(Decl *ConstructorD,
   // Look for a member, first.
   if (ValueDecl *Member = tryLookupCtorInitMemberDecl(
           ClassDecl, SS, TemplateTypeTy, MemberOrBase)) {
+
+        // C++AMP
+        // FIXME: need to consider non-member initializer cases
+        if(getLangOpts().CPlusPlusAMP && ClassDecl->isStruct()
+          && (Constructor->hasAttr<CXXAMPRestrictAMPAttr>() ||
+          Constructor->hasAttr<CXXAMPRestrictCPUAttr>())) {
+          // Can't use IsIncompatibleType
+          const Type* Ty  = Member->getType().getTypePtrOrNull();
+          QualType TheType = Member->getType();
+          if(Ty) {
+            // Case by case
+            if(Ty->isPointerType())
+              TheType = Ty->getPointeeType();
+            if(Ty->isArrayType())
+              TheType = dyn_cast<ArrayType>(Ty)->getElementType();
+            if(!TheType.isNull() && TheType->isRecordType()) {
+              CXXRecordDecl* RDecl = TheType->getAsCXXRecordDecl();
+                if (RDecl->getName() == "array")
+                  Diag(Member->getBeginLoc(), diag::err_amp_incompatible);
+            }
+          }
+          // Check whether the member is a reference or pointer to array_view
+          if(Ty && (Ty->isPointerType() || Ty->isReferenceType())) {
+            const Type* TargetTy = Ty->getPointeeType().getTypePtrOrNull();
+            if(const TemplateSpecializationType* TST = TargetTy->getAs<TemplateSpecializationType>()) {
+              // Check if it is a TemplateSpecializationType
+              // FIXME: should consider alias Template
+              // Get its underlying template decl*
+              if(ClassTemplateDecl* CTDecl = dyn_cast_or_null<ClassTemplateDecl>(
+                TST->getTemplateName().getAsTemplateDecl())) {
+                if(CXXRecordDecl* RDecl = CTDecl->getTemplatedDecl())
+                  if(RDecl->getName() == "array_view") {
+                    #if 0
+                    Diag(ClassDecl->getBeginLoc(), diag::err_amp_type_unsupported)
+                      << ClassDecl->getName();
+                    #endif
+                    Diag(Member->getLocation(), diag::err_amp_unsupported_reference_or_pointer);
+                  }
+              }
+            }
+          }
+        }
+
     if (EllipsisLoc.isValid())
       Diag(EllipsisLoc, diag::err_pack_expansion_member_init)
           << MemberOrBase
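The added check rejects `restrict(amp)`/`restrict(cpu)` constructors of structs whose member initializers touch an `array` member or a pointer/reference to `array_view`. A hedged sketch of code this diagnoses (names hypothetical):

```cpp
struct Functor {
  concurrency::array_view<int> *view;
  Functor(concurrency::array_view<int> *v) restrict(amp)
      : view(v) {} // err_amp_unsupported_reference_or_pointer
};
```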
@@ -7976,6 +8020,266 @@ void Sema::ActOnFinishCXXMemberSpecification(
   CheckCompletedCXXClass(cast<CXXRecordDecl>(TagDecl));
 }
 
+/// FIXME: O(n)
+bool Sema::NeedAMPDeserializer(CXXRecordDecl *ClassDecl) {
+#if 0
+  //FIXME(Ray) have problem supporting templates
+  if (ClassTemplateDecl *Template = ClassDecl->getDescribedClassTemplate())
+    return false;
+#endif
+  bool HasRestrict = false, HasDeserializerDecl = false;
+  for (CXXRecordDecl::method_iterator CI = ClassDecl->method_begin(),
+       CE = ClassDecl->method_end(); CI != CE; ++CI) {
+    if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(*CI)) {
+      if (MD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        HasRestrict = true;
+        if (MD->hasAttr<AnnotateAttr>() &&
+            (MD->getAttr<AnnotateAttr>()->getAnnotation().find("deserialize") != StringRef::npos))
+          HasDeserializerDecl = true;
+      }
+    }
+  }
+  return HasRestrict && !HasDeserializerDecl;
+}
+
+//FIXME: Refactor this
+bool Sema::HasDeclaredAMPDeserializer(CXXRecordDecl *ClassDecl) {
+  return ClassDecl->getCXXAMPDeserializationConstructor() != NULL;
+}
+
+void Sema::DeclareAMPSerializer(CXXRecordDecl *ClassDecl, DeclarationName Name) {
+  SourceLocation CurrentLocation = ClassDecl->getLocation();
+  for (CXXRecordDecl::method_iterator Method = ClassDecl->method_begin(),
+      MethodEnd = ClassDecl->method_end();
+      Method != MethodEnd; ++Method) {
+    if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(*Method)) {
+      if (MD->hasAttr<AnnotateAttr>() &&
+        MD->getAttr<AnnotateAttr>()->getAnnotation() == "serialize") {
+        return;
+      }
+    }
+  }
+
+  DeclarationNameInfo DNI(Name, CurrentLocation);
+  const FunctionProtoType *SerializeType;
+  FunctionProtoType::ExtProtoInfo ExtInfo;
+  const CXXRecordDecl *ParmClassDecl;
+
+  AMPDeserializerArgs LocalArgs;
+  // Scan the types known to the ASTContext for the `Serialize` record and
+  // use an lvalue reference to it as the single parameter type.
+  auto &Types = ClassDecl->getASTContext().getTypes();
+  for (unsigned I = 0; I != Types.size(); ++I) {
+    const auto *it = Types[I];
+    if (it->isRecordType()) {
+      ParmClassDecl = it->getAsCXXRecordDecl();
+      std::string RecordName = ParmClassDecl->getDeclName().getAsString();
+      if (RecordName == "Serialize") {
+        QualType ArgType = Context.getTypeDeclType(ParmClassDecl);
+        ArgType = Context.getLValueReferenceType(ArgType);
+        LocalArgs.push_back(ArgType);
+        break;
+      }
+    }
+  }
+
+  ExtInfo.TypeQuals.addConst();
+  SerializeType = dyn_cast<FunctionProtoType>(Context.getFunctionType(Context.VoidTy,
+                                              LocalArgs,
+                                              ExtInfo).getTypePtr());
+  assert(SerializeType != NULL);
+  TypeSourceInfo *TI = Context.getTrivialTypeSourceInfo(QualType(SerializeType, 0),
+                                                        CurrentLocation);
+  // Set correct parameter information for templates
+  TypeLoc TL = TI->getTypeLoc();
+  FunctionProtoTypeLoc ProtoTL = TL.getAs<FunctionProtoTypeLoc>();
+
+  CXXMethodDecl * SerializeFunc = CXXMethodDecl::Create(Context,
+                                                        ClassDecl,
+                                                        CurrentLocation,
+                                                        DNI,
+                                                    QualType(SerializeType, 0),
+                                                        /*TInfo=*/TI,
+                                                        SC_None,
+                                                        /*Inline=*/false,
+                                                        CSK_unspecified, // /*isConstExpr*/false,
+                                                        CurrentLocation);
+  SerializeFunc->setAccess(AS_public);
+
+  int i = 0;
+  SmallVector<ParmVarDecl *, 4> FieldAsArgs;
+  for (AMPDeserializerArgs::iterator it = LocalArgs.begin(),
+       e = LocalArgs.end(); it != e; ++it, ++i) {
+    ParmVarDecl *PVD = ParmVarDecl::Create(Context, SerializeFunc,
+                                           CurrentLocation,
+                                           CurrentLocation,
+                                           /*Id=*/0,
+                                           *it,
+    /*TypeSourceInfo*/ Context.getTrivialTypeSourceInfo(*it, CurrentLocation),
+                                           SC_None, 0);
+    PVD->setScopeInfo(0, i);
+    FieldAsArgs.push_back(PVD);
+    ProtoTL.setParam(i, PVD);
+  }
+  SerializeFunc->setParams(FieldAsArgs);
+  // Set appropriate attributes for AMP
+  if (getLangOpts().AMPCPU)
+      SerializeFunc->addAttr(new (Context)
+                             CXXAMPRestrictAMPAttr(CurrentLocation, Context, 0));
+  SerializeFunc->addAttr(::new (Context)
+                         CXXAMPRestrictCPUAttr(CurrentLocation, Context, 0));
+  SerializeFunc->addAttr(::new (Context)
+                         AnnotateAttr(CurrentLocation, Context, "serialize", 0));
+  ClassDecl->addDecl(SerializeFunc);
+  // Now we've obtained a valid Name. Use that to recursively declare
+  // __cxxamp_serialize() for member classes. TBD: base classes?
+  for (CXXRecordDecl::field_iterator Field = ClassDecl->field_begin(),
+       FieldEnd = ClassDecl->field_end(); Field != FieldEnd; ++Field) {
+    QualType FieldType = Field->getType().getNonReferenceType();
+    if (const RecordType *RecordTy = FieldType->getAs<RecordType>()) {
+      const CXXRecordDecl *MemberClassDecl = dyn_cast<const CXXRecordDecl>(
+                                                RecordTy->getDecl());
+      if (!MemberClassDecl)
+        continue;
+      DeclareAMPSerializer(const_cast<CXXRecordDecl*>(MemberClassDecl), Name);
+    }
+  }
+  for (CXXRecordDecl::base_class_iterator Base = ClassDecl->bases_begin(),
+       BaseEnd = ClassDecl->bases_end(); Base != BaseEnd; ++Base) {
+      QualType BaseType = Base->getType().getNonReferenceType();
+      if (const RecordType *RecordTy = BaseType->getAs<RecordType>()) {
+          const CXXRecordDecl *MemberClassDecl = dyn_cast<const CXXRecordDecl>(RecordTy->getDecl());
+          if (!MemberClassDecl)
+              continue;
+          DeclareAMPSerializer(const_cast<CXXRecordDecl*>(MemberClassDecl), Name);
+      }
+  }
+  MarkFunctionReferenced(CurrentLocation, SerializeFunc);
+}
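For orientation, `DeclareAMPSerializer` effectively injects a declaration of roughly this shape into the class (a sketch; the method name follows the `__cxxamp_serialize` lookups later in this patch, and the parameter is a reference to the `Serialize` record found in the ASTContext):

```cpp
class Functor {
  // ...
  // annotate("serialize"), restrict(cpu) (and restrict(amp) under AMPCPU)
  void __cxxamp_serialize(Serialize &s) const;
};
```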
+
+void Sema::DeclareAMPDeserializer(CXXRecordDecl *ClassDecl, AMPDeserializerArgs *Args) {
+  // Create the deserializer declaration.
+  CanQualType ClassType
+    = Context.getCanonicalType(Context.getTypeDeclType(ClassDecl));
+  SourceLocation ClassLoc = ClassDecl->getLocation();
+  // Build up a function type for this particular constructor.
+  const Type *NewCtorType;
+  AMPDeserializerArgs LocalArgs;
+  //Recursively declare base-class deserializers
+  for (CXXRecordDecl::base_class_iterator B = ClassDecl->bases_begin(),
+                 BEnd = ClassDecl->bases_end(); B!=BEnd; ++B ) {
+    if (const RecordType *BaseType = B->getType()->getAs<RecordType>()) {
+       CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
+      if (!HasDeclaredAMPDeserializer(BaseClassDecl)) {
+        DeclareAMPDeserializer(BaseClassDecl, &LocalArgs);
+      } else if (CXXMethodDecl *MD =
+                     BaseClassDecl->getCXXAMPDeserializationConstructor()) {
+        for (CXXMethodDecl::param_iterator CPI = MD->param_begin(),
+             CPE = MD->param_end(); CPI != CPE; ++CPI) {
+          LocalArgs.push_back((*CPI)->getType());
+        }
+      }
+    }
+  }
+  for (CXXRecordDecl::field_iterator Field = ClassDecl->field_begin(),
+                                  FieldEnd = ClassDecl->field_end();
+        Field != FieldEnd; Field++) {
+    // Skip fields that are not supposed to be marshalled to GPU space
+    if (Field->getType()->isArrayType() &&
+        !Field->getType().isVolatileQualified()) {
+      if (ClassDecl->getQualifiedNameAsString().find("std::") == std::string::npos) {
+        if (getLangOpts().HSAExtension) {
+          // relax this rule in HSA to allow capturing raw pointers
+        } else {
+          const ArrayType* AT = dyn_cast<ArrayType>(Field->getType());
+          if(IsIncompatibleType(AT->getElementType().getTypePtrOrNull())) {
+            Diag(Field->getLocation(), diag::err_amp_incompatible);
+            return;
+          }
+        }
+      }
+    }
+    if (Field->hasAttr<CXXAMPRestrictCPUAttr>() &&
+        !Field->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      continue;
+    }
+    if (Field->isUnnamedBitfield())
+      continue;
+    if (Field->getType()->isRecordType()) {
+      CXXRecordDecl *FieldClassDecl = Field->getType()->getAsCXXRecordDecl();
+      if (!HasDeclaredAMPDeserializer(FieldClassDecl)) {
+        DeclareAMPDeserializer(FieldClassDecl, &LocalArgs);
+      } else if (CXXMethodDecl *MD =
+                     FieldClassDecl->getCXXAMPDeserializationConstructor()) {
+        for (CXXMethodDecl::param_iterator CPI = MD->param_begin(),
+             CPE = MD->param_end(); CPI != CPE; ++CPI) {
+          LocalArgs.push_back((*CPI)->getType());
+        }
+      }
+      continue;
+    }
+    LocalArgs.push_back(Field->getType());
+  }
+  if (LocalArgs.empty())
+    return;
+  if(Args)
+    Args->insert(Args->end(), LocalArgs.begin(), LocalArgs.end());
+
+  FunctionProtoType::ExtProtoInfo ExtInfo;
+  NewCtorType = Context.getFunctionType(Context.VoidTy,
+      LocalArgs, ExtInfo)
+    .getTypePtr();
+
+  DeclarationName Name
+    = Context.DeclarationNames.getCXXConstructorName(ClassType);
+  DeclarationNameInfo NameInfo(Name, ClassLoc);
+  TypeSourceInfo *TI = Context.getTrivialTypeSourceInfo(QualType(NewCtorType, 0),ClassLoc);
+  // Set correct parameter information for templates
+  TypeLoc TL = TI->getTypeLoc();
+  FunctionProtoTypeLoc ProtoTL = TL.getAs<FunctionProtoTypeLoc>();
+  assert(ProtoTL && "Missing prototype?");
+  CXXConstructorDecl *Constructor =
+      CXXConstructorDecl::Create(Context, ClassDecl, ClassLoc, NameInfo,
+                                 QualType(NewCtorType, 0),
+                                 /*TInfo=*/TI,
+                                 ExplicitSpecifier(),
+                                 /*isInline=*/true,
+                                 /*isImplicitlyDeclared=*/true,
+                                 CSK_unspecified);
+  Constructor->setAccess(AS_public);
+  SmallVector<ParmVarDecl *, 4> FieldAsArgs;
+  // Compute arguments needed
+  int i = 0;
+  for (AMPDeserializerArgs::iterator it = LocalArgs.begin(),
+       e = LocalArgs.end(); it != e; ++it, ++i) {
+    ParmVarDecl *PVD = ParmVarDecl::Create(Context, Constructor,
+                                               ClassLoc, ClassLoc, /*Id=*/0,
+                                               *it,
+  /*TypeSourceInfo*/ Context.getTrivialTypeSourceInfo(*it, ClassLoc),
+                                               SC_None, 0);
+    PVD->setScopeInfo(0, i);
+    FieldAsArgs.push_back(PVD);
+    ProtoTL.setParam(i, PVD);
+  }
+  // Populate arguments
+  Constructor->setParams(FieldAsArgs);
+  // Set appropriate attributes for AMP
+  if (getLangOpts().AMPCPU)
+      Constructor->addAttr(new (Context) CXXAMPRestrictCPUAttr(ClassLoc, Context, 0));
+  Constructor->addAttr(::new (Context) CXXAMPRestrictAMPAttr(ClassLoc, Context, 0));
+  Constructor->addAttr(::new (Context)
+    AnnotateAttr(ClassLoc, Context, "auto_deserialize", 0));
+  // Introduce this constructor into its scope.
+  if (Scope *S = getScopeForContext(ClassDecl))
+    PushOnScopeChains(Constructor, S, false);
+  ClassDecl->addDecl(Constructor);
+}
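The synthesized deserialization constructor flattens base-class and member deserializer parameters plus the remaining POD fields into a single parameter list. Roughly, for a functor capturing `int n` and `float *p`, it amounts to the following (a sketch, not literal compiler output):

```cpp
class Kernel {
  int n;
  float *p;
public:
  // implicit, inline, restrict(amp), annotate("auto_deserialize")
  Kernel(int n, float *p);
};
```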
+
 /// AddImplicitlyDeclaredMembersToClass - Adds any implicitly-declared
 /// special functions, such as the default constructor, copy
 /// constructor, or destructor, to the given C++ class (C++
@@ -12228,6 +12532,957 @@ void Sema::DefineImplicitCopyAssignment(SourceLocation CurrentLocation,
   }
 }
 
+void Sema::DeclareAMPTrampolineName(CXXRecordDecl *ClassDecl, DeclarationName Name) {
+  SourceLocation CurrentLocation = ClassDecl->getLocation();
+  CXXMethodDecl *Trampoline = NULL;
+  for (CXXRecordDecl::method_iterator Method = ClassDecl->method_begin(),
+      MethodEnd = ClassDecl->method_end();
+      Method != MethodEnd; ++Method) {
+    if (Method->hasAttr<AnnotateAttr>() &&
+      Method->getAttr<AnnotateAttr>()->getAnnotation() == "__cxxamp_trampoline_name") {
+      return;
+    }
+  }
+  DeclarationNameInfo DNI(Name, CurrentLocation);
+
+  const Type *TrampolineType;
+  FunctionProtoType::ExtProtoInfo ExtInfo;
+  TypeSourceInfo *TI;
+  TypeLoc TL;
+  FunctionProtoTypeLoc ProtoTL;
+  // Generate name lookup routine: char *__cxxamp_trampoline_name(void)
+  TrampolineType = Context.getFunctionType(
+      Context.getPointerType(Context.CharTy),
+      None,
+      ExtInfo).getTypePtr();
+  TI = Context.getTrivialTypeSourceInfo(QualType(TrampolineType, 0),
+    CurrentLocation);
+  // Set correct parameter information for templates
+  TL = TI->getTypeLoc();
+  ProtoTL = TL.getAs<FunctionProtoTypeLoc>();
+  assert(ProtoTL && "Missing prototype?");
+  Trampoline = CXXMethodDecl::Create(
+      Context, ClassDecl, CurrentLocation, DNI, QualType(TrampolineType, 0),
+      /*TInfo=*/TI,
+      SC_Static,
+      /*Inline=*/false,
+      CSK_unspecified, // /*isConstExpr*/false,
+      CurrentLocation
+      );
+  Trampoline->setAccess(AS_public);
+  // Set appropriate attributes for AMP
+  if (getLangOpts().AMPCPU)
+    Trampoline->addAttr(new (Context) CXXAMPRestrictAMPAttr(CurrentLocation, Context, 0));
+  Trampoline->addAttr(new (Context) CXXAMPRestrictCPUAttr(CurrentLocation, Context, 0));
+  Trampoline->addAttr(new (Context) AnnotateAttr(CurrentLocation, Context, "__cxxamp_trampoline_name", 0));
+  ClassDecl->addDecl(Trampoline);
+  // Generate definition
+  MarkFunctionReferenced(CurrentLocation, Trampoline);
+}
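The helper declared here is a static member the runtime can call to obtain the mangled kernel name, roughly (sketch):

```cpp
class Functor {
  // ...
  // annotate("__cxxamp_trampoline_name"), restrict(cpu)
  static char *__cxxamp_trampoline_name();
};
```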
+
+void CreateDummyAMPTrampoline(Sema& S, DeclarationName Name, CXXRecordDecl *&ClassDecl, CXXMethodDecl *&Trampoline) {
+  if(Trampoline)
+    return;
+  SourceLocation CurrentLocation = ClassDecl->getLocation();
+  DeclarationNameInfo DNI(Name, CurrentLocation);
+  ASTContext& Context = S.Context;
+  Sema::AMPDeserializerArgs LocalArgs;
+  FunctionProtoType::ExtProtoInfo ExtInfo;
+
+  // Generate a dummy trampoline: void __cxxamp_trampoline(void)
+  const Type *TrampolineType = Context.getFunctionType(Context.VoidTy,
+      LocalArgs,
+      ExtInfo).getTypePtr();
+  TypeSourceInfo *TI = Context.getTrivialTypeSourceInfo(QualType(TrampolineType, 0),
+      CurrentLocation);
+  Trampoline = CXXMethodDecl::Create(
+        Context, ClassDecl, CurrentLocation, DNI, QualType(TrampolineType, 0),
+        /*TInfo=*/TI,
+        SC_Static,
+        /*Inline=*/false,
+        CSK_unspecified, // /*isConstExpr*/false,
+        CurrentLocation
+        );
+  Trampoline->setAccess(AS_public);
+
+  // No arguments to populate.
+  // FIXME: side effects on the CPU side
+  #if 0
+  Trampoline->setParams(TrampolineParams);
+  #endif
+
+  // Set appropriate attributes for AMP
+  Trampoline->addAttr(::new (Context) OpenCLKernelAttr(CurrentLocation, Context, 0));
+  Trampoline->addAttr(::new (Context) CXXAMPRestrictAMPAttr(CurrentLocation, Context, 0));
+
+  // In CPU compilation mode, we need an empty implementation of the
+  // trampoline so that parallel_for_each can find it.
+  Trampoline->addAttr(::new (Context) CXXAMPRestrictCPUAttr(CurrentLocation, Context, 0));
+  Trampoline->addAttr(::new (Context) AnnotateAttr(CurrentLocation, Context, "__cxxamp_trampoline", 0));
+  // Manually add this annotation at this stage so that
+  // ClassDecl->getCXXAMPDeserializationConstructor() does not return NULL.
+  Trampoline->addAttr(::new (Context) AnnotateAttr(CurrentLocation, Context, "dummy_deserialize", 0));
+  ClassDecl->addDecl(Trampoline);
+}
+
+void Sema::DeclareAMPTrampoline(CXXRecordDecl *ClassDecl,
+  DeclarationName Name) {
+  SourceLocation CurrentLocation = ClassDecl->getLocation();
+  CXXMethodDecl *Trampoline = NULL;
+  // The deserializer is declared lazily, on first lookup;
+  // if that has not happened yet, declare it now.
+  if (!HasDeclaredAMPDeserializer(ClassDecl)) {
+    DeclareAMPDeserializer(ClassDecl, NULL);
+  }
+  for (CXXRecordDecl::method_iterator Method = ClassDecl->method_begin(),
+      MethodEnd = ClassDecl->method_end();
+      Method != MethodEnd; ++Method) {
+    if (Method->hasAttr<AnnotateAttr>() &&
+      Method->getAttr<AnnotateAttr>()->getAnnotation() == "__cxxamp_trampoline") {
+      return;
+    }
+  }
+  // The AMP deserialization constructor might be NULL if the actual
+  // declaration is invalid. Return at this point so the compiler still
+  // emits the expected compilation errors.
+  if (ClassDecl->getCXXAMPDeserializationConstructor() == NULL) {
+    CreateDummyAMPTrampoline(*this, Name, ClassDecl, Trampoline);
+    return;
+  }
+  CXXConstructorDecl *DeserializeConstructor =
+      dyn_cast<CXXConstructorDecl>(
+          ClassDecl->getCXXAMPDeserializationConstructor());
+  #if 0
+  assert(DeserializeConstructor);
+  #endif
+  DeclarationNameInfo DNI(Name, CurrentLocation);
+
+  const Type *TrampolineType;
+  FunctionProtoType::ExtProtoInfo ExtInfo;
+  // Now collect the constructors that we already have in the current class.
+  AMPDeserializerArgs LocalArgs;
+  for (CXXMethodDecl::param_iterator CPI = DeserializeConstructor->param_begin(),
+      CPE = DeserializeConstructor->param_end(); CPI!=CPE; CPI++) {
+    // References are only allowed one level deep; i.e. a captured reference
+    // must not point at a class that itself holds a reference member.
+    QualType MemberType = (*CPI)->getType().getNonReferenceType();
+    if (MemberType != (*CPI)->getType()) {
+      if (!getLangOpts().HSAExtension) {
+        if (!MemberType.getTypePtr()->isClassType()) {
+          Diag((*CPI)->getLocation(), diag::err_amp_incompatible);
+        } else {
+          assert(MemberType.getTypePtr()->isClassType() == true &&
+                 "Only supporting taking reference of classes");
+          CXXRecordDecl *MemberClass = MemberType.getTypePtr()->getAsCXXRecordDecl();
+          if (!HasDeclaredAMPDeserializer(MemberClass)) {
+            DeclareAMPDeserializer(MemberClass, NULL);
+          }
+          CXXMethodDecl *MemberDeserializer =
+            MemberClass->getCXXAMPDeserializationConstructor();
+          if (!MemberDeserializer) {
+            Diag((*CPI)->getLocation(), diag::err_amp_incompatible);
+          } else {
+            assert(MemberDeserializer);
+            for (CXXMethodDecl::param_iterator CPI = MemberDeserializer->param_begin(),
+                 CPE = MemberDeserializer->param_end(); CPI!=CPE; CPI++) {
+              LocalArgs.push_back((*CPI)->getType());
+            }
+          }
+        }
+      } else { // HSA extension check
+        if (MemberType.getTypePtr()->isClassType()) {
+          // hc::array should still be serialized as traditional C++AMP objects
+          if (MemberType.getTypePtr()->isGPUArrayType()) {
+            CXXRecordDecl *MemberClass = MemberType.getTypePtr()->getAsCXXRecordDecl();
+            if (!HasDeclaredAMPDeserializer(MemberClass)) {
+              DeclareAMPDeserializer(MemberClass, NULL);
+            }
+            CXXMethodDecl *MemberDeserializer =
+              MemberClass->getCXXAMPDeserializationConstructor();
+            if (!MemberDeserializer) {
+              Diag((*CPI)->getLocation(), diag::err_amp_incompatible);
+            } else {
+              assert(MemberDeserializer);
+              for (CXXMethodDecl::param_iterator CPI = MemberDeserializer->param_begin(),
+                   CPE = MemberDeserializer->param_end(); CPI!=CPE; CPI++) {
+                LocalArgs.push_back((*CPI)->getType());
+              }
+            }
+          } else {
+            // In HSA extension mode, capture by reference is simply a pointer
+            LocalArgs.push_back(Context.getPointerType(MemberType));
+          }
+        } else {
+          // In HSA extension mode, capture by reference is simply a pointer
+          LocalArgs.push_back(Context.getPointerType(MemberType));
+        }
+      } // HSA extension check
+    } else {
+      // OpenCL kernel arguments do not allow ABI-dependent built-in types
+      // such as bool, so the following trick replaces the _Bool type with
+      // char. The alternative would be to do the replacement in
+      // ActOnVariableDeclarator (line 4876), before the NewVD is created.
+      if ((*CPI)->getType()->isScalarType() &&
+          Type::STK_Bool == (*CPI)->getType()->getScalarTypeKind())
+        (*CPI)->setType(Context.CharTy);
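+      // e.g. a captured `bool flag` is marshalled as a `char` kernel
+      // argument from this point on (illustrative).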
+
+      LocalArgs.push_back((*CPI)->getType());
+    }
+  }
+  TrampolineType = Context.getFunctionType(Context.VoidTy,
+      LocalArgs,
+      ExtInfo).getTypePtr();
+  TypeSourceInfo *TI = Context.getTrivialTypeSourceInfo(QualType(TrampolineType, 0),
+    CurrentLocation);
+  // Set correct parameter information for templates
+  Trampoline = CXXMethodDecl::Create(
+      Context, ClassDecl, CurrentLocation, DNI, QualType(TrampolineType, 0),
+      /*TInfo=*/TI,
+      SC_Static,
+      /*Inline=*/false,
+      CSK_unspecified, // /*isConstExpr*/false,
+      CurrentLocation
+      );
+  Trampoline->setAccess(AS_public);
+  SmallVector<ParmVarDecl *, 4> TrampolineParams;
+  for (CXXConstructorDecl::param_iterator it = DeserializeConstructor->param_begin();
+    it != DeserializeConstructor->param_end(); it++) {
+    QualType MemberType = (*it)->getType().getNonReferenceType();
+    if (MemberType != (*it)->getType()) {
+      if (!getLangOpts().HSAExtension) {
+        if (!MemberType.getTypePtr()->isClassType()) {
+          Diag((*it)->getLocation(), diag::err_amp_incompatible);
+        } else {
+          assert(MemberType.getTypePtr()->isClassType() == true &&
+                 "Only supporting taking reference of classes");
+          CXXRecordDecl *MemberClass =
+              MemberType.getTypePtr()->getAsCXXRecordDecl();
+          CXXMethodDecl *MemberDeserializer =
+            MemberClass->getCXXAMPDeserializationConstructor();
+          if (!MemberDeserializer) {
+            Diag((*it)->getLocation(), diag::err_amp_incompatible);
+          } else {
+            assert(MemberDeserializer);
+            for (CXXMethodDecl::param_iterator CPI =
+                     MemberDeserializer->param_begin(),
+                 CPE = MemberDeserializer->param_end(); CPI!=CPE; CPI++) {
+              ParmVarDecl *FromParam = ParmVarDecl::Create(Context, Trampoline,
+                                                     CurrentLocation, CurrentLocation,
+                                                     (*CPI)->getIdentifier(),
+                                                     (*CPI)->getType(),
+                                                     /*TInfo=*/0,
+                                                     SC_None, 0);
+              TrampolineParams.push_back(FromParam);
+            }
+          }
+        }
+      } else { // HSA extension check
+        if (MemberType.getTypePtr()->isClassType()) {
+          // hc::array should still be serialized as traditional C++AMP objects
+          if (MemberType.getTypePtr()->isGPUArrayType()) {
+            CXXRecordDecl *MemberClass =
+              MemberType.getTypePtr()->getAsCXXRecordDecl();
+            CXXMethodDecl *MemberDeserializer =
+              MemberClass->getCXXAMPDeserializationConstructor();
+            if (!MemberDeserializer) {
+              Diag((*it)->getLocation(), diag::err_amp_incompatible);
+            } else {
+              assert(MemberDeserializer);
+              for (CXXMethodDecl::param_iterator CPI =
+                   MemberDeserializer->param_begin(),
+                   CPE = MemberDeserializer->param_end(); CPI!=CPE; CPI++) {
+                ParmVarDecl *FromParam = ParmVarDecl::Create(Context, Trampoline,
+                                                       CurrentLocation, CurrentLocation,
+                                                       (*CPI)->getIdentifier(),
+                                                       (*CPI)->getType(),
+                                                       /*TInfo=*/0,
+                                                       SC_None, 0);
+                TrampolineParams.push_back(FromParam);
+              }
+            }
+          } else {
+            ParmVarDecl *FromParam = ParmVarDecl::Create(Context, Trampoline,
+                                                   CurrentLocation, CurrentLocation,
+                                                   (*it)->getIdentifier(),
+                                                   Context.getPointerType(MemberType),
+                                                   /*TInfo=*/0,
+                                                   SC_None, 0);
+            TrampolineParams.push_back(FromParam);
+          }
+        } else {
+          ParmVarDecl *FromParam = ParmVarDecl::Create(Context, Trampoline,
+                                                 CurrentLocation, CurrentLocation,
+                                                 (*it)->getIdentifier(),
+                                                 Context.getPointerType(MemberType),
+                                                 /*TInfo=*/0,
+                                                 SC_None, 0);
+          TrampolineParams.push_back(FromParam);
+        }
+      } // HSA extension check
+    } else {
+      TrampolineParams.push_back(*it);
+    }
+  }
+  // Popluate arguments
+  Trampoline->setParams(TrampolineParams);
+  // Set appropriate attributes for AMP
+  Trampoline->addAttr(::new (Context) OpenCLKernelAttr(CurrentLocation, Context, 0));
+  Trampoline->addAttr(::new (Context) CXXAMPRestrictAMPAttr(CurrentLocation, Context, 0));
+  // In CPU compilation mode, we need an empty implementation of trampoline
+  // so that parallel_for_each can find it.
+  Trampoline->addAttr(::new (Context) CXXAMPRestrictCPUAttr(CurrentLocation, Context, 0));
+  Trampoline->addAttr(::new (Context) AnnotateAttr(CurrentLocation, Context, "__cxxamp_trampoline", 0));
+  ClassDecl->addDecl(Trampoline);
+  // Generate definition
+  MarkFunctionReferenced(CurrentLocation, Trampoline);
+  MarkFunctionReferenced(CurrentLocation, DeserializeConstructor);
+}
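Putting the pieces together, the trampoline declared here is the kernel entry point: a static method whose parameters mirror the deserialization constructor, with captured references lowered to pointers in HSA mode and `bool` rewritten to `char`. Roughly (sketch, reusing the hypothetical `Kernel` above):

```cpp
class Kernel {
  // ...
  // OpenCL kernel, restrict(amp)+restrict(cpu), annotate("__cxxamp_trampoline")
  static void __cxxamp_trampoline(int n, float *p);
};
```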
+
+
+/// Generate an empty body for the trampoline so that CodeGenFunction is
+/// invoked on it; the actual definition is emitted during the CodeGen phase.
+void Sema::DefineAMPTrampoline(SourceLocation CurrentLocation,
+                               CXXMethodDecl *Trampoline) {
+  StmtResult Body; // Populate an empty Compound statement body
+  {
+    CompoundScopeRAII CompoundScope(*this);
+    Body = ActOnCompoundStmt(CurrentLocation, CurrentLocation,
+        MultiStmtArg(),
+        /*isStmtExpr=*/false);
+    assert(!Body.isInvalid() && "Compound statement creation cannot fail");
+  }
+  Trampoline->setBody(Body.getAs<Stmt>());
+  if (ASTMutationListener *L = getASTMutationListener()) {
+    L->CompletedImplicitDefinition(Trampoline);
+  }
+  // The constructors for reference types are only referenced at the
+  // CodeGen stage; mark them as referenced here so that their code
+  // can be generated.
+  CXXMethodDecl *DeserializeConstructor =
+    Trampoline->getParent()->getCXXAMPDeserializationConstructor();
+  // The AMP deserialization constructor might be NULL if the actual
+  // definition is empty. Return at this point so the compiler still emits
+  // the expected compilation errors.
+  if (!DeserializeConstructor) {
+    return;
+  }
+  #if 0
+  assert(DeserializeConstructor &&
+    "Trampoline assumes deserialization constructor");
+  #endif
+  for (CXXConstructorDecl::param_iterator it =
+      DeserializeConstructor->param_begin();
+    it != DeserializeConstructor->param_end(); it++) {
+    QualType MemberType = (*it)->getType().getNonReferenceType();
+    if (MemberType != (*it)->getType()) {
+      if (!getLangOpts().HSAExtension) {
+        if (!MemberType.getTypePtr()->isClassType()) {
+          Diag((*it)->getLocation(), diag::err_amp_incompatible);
+        } else {
+          assert(MemberType.getTypePtr()->isClassType() == true &&
+                 "Only supporting taking reference of classes");
+          CXXRecordDecl *MemberClass =
+              MemberType.getTypePtr()->getAsCXXRecordDecl();
+          CXXMethodDecl *MemberDeserializer =
+            MemberClass->getCXXAMPDeserializationConstructor();
+          if (!MemberDeserializer) {
+            Diag((*it)->getLocation(), diag::err_amp_incompatible);
+          } else {
+            assert(MemberDeserializer);
+            MarkFunctionReferenced(CurrentLocation, MemberDeserializer);
+          }
+        }
+      } else {
+        if (MemberType.getTypePtr()->isClassType()) {
+          // hc::array should still be serialized as traditional C++AMP objects
+          if (MemberType.getTypePtr()->isGPUArrayType()) {
+            CXXRecordDecl *MemberClass =
+                MemberType.getTypePtr()->getAsCXXRecordDecl();
+            CXXMethodDecl *MemberDeserializer =
+              MemberClass->getCXXAMPDeserializationConstructor();
+            if (!MemberDeserializer) {
+              Diag((*it)->getLocation(), diag::err_amp_incompatible);
+            } else {
+              assert(MemberDeserializer);
+              MarkFunctionReferenced(CurrentLocation, MemberDeserializer);
+            }
+          }
+        }
+      } // HSA extension check
+    }
+  }
+
+}
+
+/// GPU-side deserialization constructor. Pairs arguments with each member,
+/// recursing when a member is of a compound type.
+void Sema::DefineAmpGpuDeSerializeFunction(SourceLocation CurrentLocation,
+                                         CXXMethodDecl *Deserialization) {
+  if (Deserialization->hasBody()
+      || Deserialization->hasInlineBody()
+      || Deserialization->isOutOfLine()
+      || (Deserialization->hasAttr<AnnotateAttr>() &&
+          Deserialization->getAttr<AnnotateAttr>()->getAnnotation()
+          == "user_deserialize"))
+    return;
+  SourceLocation Loc = Deserialization->getLocation();
+  if (CXXConstructorDecl *Constructor =
+    dyn_cast<CXXConstructorDecl>(Deserialization)) {
+    Deserialization->setIsUsed();
+
+    CXXRecordDecl *ClassDecl = Deserialization->getParent();
+
+    if (ClassDecl->isInvalidDecl() || Deserialization->isInvalidDecl()) {
+      Deserialization->setInvalidDecl();
+      return;
+    }
+
+    SmallVector<CXXCtorInitializer*, 4> NewInits;
+    CXXCtorInitializer *CCI;
+    // Assign non-static members.
+    int i = 0;
+
+    // Call direct base-class deserialize.
+    for (CXXRecordDecl::base_class_iterator B = ClassDecl->bases_begin(),
+                                        BEnd = ClassDecl->bases_end();
+         B != BEnd; ++B) {
+      if (const RecordType *BaseType = B->getType()->getAs<RecordType>()) {
+        CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
+        SmallVector<Expr*, 4> BaseCtorArgs;
+        int NumArgs = 0;
+        bool FoundBaseClassCtor = false;
+        for (CXXRecordDecl::ctor_iterator CI = BaseClassDecl->ctor_begin(),
+             CE = BaseClassDecl->ctor_end(); CI!=CE; CI++) {
+          CXXMethodDecl *MD = *CI;
+          if (MD->hasAttr<CXXAMPRestrictAMPAttr>() &&
+              MD->hasAttr<AnnotateAttr>() &&
+              MD->getAttr<AnnotateAttr>()->getAnnotation()
+                .find("deserialize") != StringRef::npos) {
+              NumArgs += MD->getNumParams();
+              FoundBaseClassCtor = true;
+              break;
+          }
+        }
+        // Skip base classes that do not have a deserializer defined.
+        if (!FoundBaseClassCtor)
+          continue;
+        for (int j=0; j < NumArgs; j++) {
+          ParmVarDecl *Param = Constructor->getParamDecl(i+j);
+          QualType ParamType = Param->getType().getNonReferenceType();
+          Expr *MemberExprBase =
+            DeclRefExpr::Create(Context, NestedNameSpecifierLoc(),
+                SourceLocation(), Param, false,
+                Loc, ParamType, VK_LValue, 0);
+          Param->setIsUsed();
+          BaseCtorArgs.push_back(MemberExprBase);
+        }
+        i += NumArgs;
+        InitializedEntity InitEntity
+          = InitializedEntity::InitializeBase(Context, B, B->isVirtual());
+        InitializationKind InitKind =
+          InitializationKind::CreateDirect(Loc, Loc, Loc);
+        InitializationSequence InitSeq(*this, InitEntity, InitKind,
+                                       MultiExprArg(BaseCtorArgs.data(), NumArgs));
+        ExprResult BaseInit =
+          InitSeq.Perform(*this, InitEntity, InitKind,
+              MultiExprArg(BaseCtorArgs.data(), NumArgs));
+        BaseInit = MaybeCreateExprWithCleanups(BaseInit);
+        assert (!BaseInit.isInvalid() && "Base initialization failure");
+        CCI = new (Context) CXXCtorInitializer(Context,
+            B->getTypeSourceInfo(), B->isVirtual(), CurrentLocation,
+            BaseInit.get(), CurrentLocation, CurrentLocation);
+        NewInits.push_back(CCI);
+      }
+    }
+    for (CXXRecordDecl::field_iterator Field = ClassDecl->field_begin(),
+        FieldEnd = ClassDecl->field_end();
+        Field != FieldEnd; ++Field) {
+      if (Field->hasAttr<CXXAMPRestrictCPUAttr>() &&
+          !Field->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        // if any data members of the class is restrict(cpu),
+        // skip that field
+        continue;
+      }
+      if (Field->isUnnamedBitfield())
+        continue;
+      // Suppress assigning zero-width bitfields.
+      if (Field->isBitField() && Field->getBitWidthValue(Context) == 0)
+        continue;
+
+      QualType FieldType = Field->getType();
+      if (FieldType->isIncompleteArrayType()) {
+        assert(ClassDecl->hasFlexibleArrayMember() &&
+               "Incomplete array type is not valid");
+        continue;
+      }
+
+      if (FieldType->isRecordType()) {
+        // Locate the deserialization constructor of the member class,
+        // find # of arguments required.
+        int NumArgs = 0;
+        SmallVector<Expr*, 4> FieldArgs;
+
+        if (CXXRecordDecl *FieldClassDecl = FieldType->getAsCXXRecordDecl()) {
+          CXXMethodDecl *FieldDes =
+              FieldClassDecl->getCXXAMPDeserializationConstructor();
+          // Skip member classes that do not have a deserializer defined.
+          if (!FieldDes)
+            continue;
+          NumArgs = FieldDes->getNumParams();
+          for (int j=0; j < NumArgs; j++) {
+            ParmVarDecl *Param = Constructor->getParamDecl(i+j);
+            QualType ParamType = Param->getType().getNonReferenceType();
+            Expr *MemberExprBase =
+              DeclRefExpr::Create(Context, NestedNameSpecifierLoc(),
+                  SourceLocation(), Param, false,
+                  Loc, ParamType, VK_LValue, 0);
+            Param->setIsUsed();
+            FieldArgs.push_back(MemberExprBase);
+          }
+          i+=NumArgs;
+
+          InitializedEntity InitEntity
+            = InitializedEntity::InitializeMember(*Field);
+          InitializationKind InitKind =
+            InitializationKind::CreateDirect(Loc, Loc, Loc);
+          InitializationSequence InitSeq(*this, InitEntity, InitKind,
+                                         MultiExprArg(FieldArgs.data(), NumArgs));
+          ExprResult MemberInit =
+            InitSeq.Perform(*this, InitEntity, InitKind,
+                MultiExprArg(FieldArgs.data(), NumArgs));
+
+          MemberInit = MaybeCreateExprWithCleanups(MemberInit);
+          assert (!MemberInit.isInvalid() && "Member initialization failure");
+
+          CCI = new (Context) CXXCtorInitializer(Context,
+              *Field, CurrentLocation, CurrentLocation, MemberInit.get(),
+              CurrentLocation);
+        } else {
+          assert(0);
+        }
+      } else { // POD member
+        ParmVarDecl *Param = Constructor->getParamDecl(i++);
+        QualType ParamType = Param->getType().getNonReferenceType();
+
+        Expr *MemberExprBase =
+          DeclRefExpr::Create(Context, NestedNameSpecifierLoc(),
+              SourceLocation(), Param, false,
+              Loc, ParamType, VK_LValue, 0);
+        Param->setIsUsed();
+        CCI = new (Context) CXXCtorInitializer(Context, *Field,
+            CurrentLocation, CurrentLocation, MemberExprBase, CurrentLocation);
+      }
+      NewInits.push_back(CCI);
+    }
+    SetCtorInitializers(Constructor, false, NewInits);
+  }
+
+  StmtResult Body; // Populate an empty Compound statement body
+  {
+    CompoundScopeRAII CompoundScope(*this);
+    Body = ActOnCompoundStmt(Loc, Loc, MultiStmtArg(),
+                             /*isStmtExpr=*/false);
+    assert(!Body.isInvalid() && "Compound statement creation cannot fail");
+  }
+  Deserialization->setBody(Body.getAs<Stmt>());
+  Deserialization->setImplicitlyInline();
+
+  if (ASTMutationListener *L = getASTMutationListener()) {
+    L->CompletedImplicitDefinition(Deserialization);
+  }
+}
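The synthesized body pairs the flattened constructor arguments back onto bases and fields in declaration order. A sketch under the assumptions above, where a hypothetical `Base`'s deserializer takes a single `float`:

```cpp
struct S : Base {
  int x;
  // synthesized: S(float f, int x) restrict(amp) : Base(f), x(x) {}
};
```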
+
+void Sema::DefineAmpCpuSerializeFunction(SourceLocation CurrentLocation,
+                                          CXXMethodDecl *Serialization) {
+  Serialization->setIsUsed();
+  // Avoid overriding user-defined serialization
+  if (Serialization->hasBody()||Serialization->hasInlineBody()||
+      Serialization->isOutOfLine()) {
+    return;
+  }
+  SourceLocation Loc = Serialization->getLocation();
+  SynthesizedFunctionScope Scope(*this, Serialization);
+  CXXRecordDecl *ClassDecl = Serialization->getParent();
+
+  // The statements that form the synthesized function body.
+  SmallVector<Stmt*, 32> Statements;
+
+  // Construct the "this" pointer.
+  Expr *This = ActOnCXXThis(Loc).getAs<Expr>();
+
+  // Construct the Expr of parameter.
+  ParmVarDecl *S = Serialization->getParamDecl(0);
+  QualType SRefType = S->getType();
+  if (const LValueReferenceType *SRef = SRefType->getAs<LValueReferenceType>()) {
+    SRefType = SRef->getPointeeType();
+  }
+  Expr *SRef = BuildDeclRefExpr(S, SRefType, VK_LValue, Loc);
+  assert(SRef && "Reference to parameter cannot fail!");
+
+  // Call direct base-class serialize.
+  for (CXXRecordDecl::base_class_iterator B = ClassDecl->bases_begin(),
+                                        BEnd = ClassDecl->bases_end();
+       B != BEnd; ++B) {
+    if (B->isVirtual()) // Cannot handle virtual base specifiers.
+      continue;
+    if (const RecordType *BaseType = B->getType()->getAs<RecordType>()) {
+      CXXRecordDecl *BaseClassDecl = cast<CXXRecordDecl>(BaseType->getDecl());
+      for (CXXRecordDecl::method_iterator Method = BaseClassDecl->method_begin(),
+           MethodEnd = BaseClassDecl->method_end();
+           Method != MethodEnd; ++Method) {
+        if((*Method)->getNameInfo().getAsString() == "__cxxamp_serialize") {
+          LookupResult Methodlookup(*this,
+                                    (*Method)->getNameInfo(),LookupOrdinaryName);
+          LookupQualifiedName(Methodlookup, BaseClassDecl, false);
+          CXXCastPath BasePath;
+          BasePath.push_back(B);
+
+          // Dereference "this".
+          ExprResult Base = CreateBuiltinUnaryOp(Loc, UO_Deref, This);
+
+          // Implicitly cast "this" to the appropriately-qualified base type.
+          Base = ImpCastExprToType(Base.get(),
+                                   Context.getQualifiedType(B->getType().getUnqualifiedType(),
+                                             Serialization->getMethodQualifiers()),
+                                   CK_UncheckedDerivedToBase,
+                                   VK_LValue, &BasePath);
+          CXXScopeSpec SS;
+          ExprResult BaseSerializeFunctionRef
+            = BuildMemberReferenceExpr(Base.getAs<Expr>(),
+                                       B->getType().getNonReferenceType(), Loc,
+                                       /*isArrow=*/false, SS,
+                                       /*TemplateKWLoc=*/SourceLocation(),
+                                       /*FirstQualifierInScope=*/0,
+                                       Methodlookup,
+                                       /*TemplateArgs=*/nullptr,/*S*/nullptr);
+          assert(!BaseSerializeFunctionRef.isInvalid() &&
+                 "BaseSerializeFunctionRef cannot fail");
+          MultiExprArg MEArg(&SRef, 1);
+          ExprResult Call = BuildCallToMemberFunction(/*Scope=*/0,
+                                                      BaseSerializeFunctionRef.getAs<Expr>(),
+                                                      Loc, /*&SRef*/MEArg, /*1,*/ Loc);
+          StmtResult SerializeResult = Call.getAs<Stmt>();
+          Statements.push_back(SerializeResult.getAs<Expr>());
+        }
+      }
+    }
+  }
+
+  //Do the fields serialize.
+  for (CXXRecordDecl::field_iterator Field = ClassDecl->field_begin(),
+       FieldEnd = ClassDecl->field_end(); Field != FieldEnd; ++Field) {
+    QualType FieldType = Field->getType().getNonReferenceType();
+
+    // Skip fields that are not supposed to be marshalled to GPU space
+    if (Field->hasAttr<CXXAMPRestrictCPUAttr>() &&
+        !Field->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      continue;
+    }
+    const RecordType *RecordTy = FieldType->getAs<RecordType>();
+
+    // hc::array shall be serialized as normal C++AMP objects even in HC mode
+    if (!getLangOpts().HSAExtension || FieldType.getTypePtr()->isGPUArrayType()) {
+
+      if (RecordTy) {
+        CXXRecordDecl *FieldClassDecl = cast<CXXRecordDecl>(RecordTy->getDecl());
+        for (CXXRecordDecl::method_iterator Method = FieldClassDecl->method_begin(),
+             MethodEnd = FieldClassDecl->method_end();
+             Method != MethodEnd; ++Method) {
+          if((*Method)->getNameInfo().getAsString() == "__cxxamp_serialize") {
+            // Intentionally empty
+            CXXScopeSpec CSS;
+            LookupResult MemberLookup(*this, Field->getDeclName(), Loc,
+                                      LookupMemberName);
+            MemberLookup.addDecl(*Field);
+            MemberLookup.resolveKind();
+            ExprResult FRef = BuildMemberReferenceExpr(This, This->getType(),
+                                                       Loc, /*IsArrow=*/true,
+                                                       CSS, SourceLocation(), 0,
+                                                       MemberLookup,
+                                                       /*TemplateArgs=*/nullptr,/*S*/nullptr);
+            LookupResult Methodlookup(*this,
+                                      (*Method)->getNameInfo(),LookupOrdinaryName);
+            LookupQualifiedName(Methodlookup, FieldClassDecl, false);
+            CXXScopeSpec SS;
+            const Type *CanonicalT = Context.getCanonicalType(FieldType.getTypePtr());
+            SS.MakeTrivial(Context,
+                           NestedNameSpecifier::Create(Context, 0,
+                                                       false, CanonicalT),
+                          Loc);
+            ExprResult SerializeFunctionRef
+              = BuildMemberReferenceExpr(FRef.get(), FieldType, Loc,
+                                         /*isArrow=*/false, SS,
+                                         /*TemplateKWLoc=*/SourceLocation(),
+                                         /*FirstQualifierInScope=*/0,
+                                         Methodlookup,
+                                         /*TemplateArgs=*/nullptr,/*S*/nullptr);
+            assert(!SerializeFunctionRef.isInvalid() &&
+                   "SerializeFunctionRef cannot fail");
+            MultiExprArg MEArg(&SRef, 1);
+            ExprResult Call = BuildCallToMemberFunction(/*Scope=*/0,
+                                                        SerializeFunctionRef.getAs<Expr>(),
+                                                        Loc, /*&SRef*/MEArg, /*1,*/ Loc);
+            StmtResult SerializeResult = Call.getAs<Stmt>();
+            Statements.push_back(SerializeResult.getAs<Expr>());
+          }
+        }
+      }
+      else if (FieldType->isScalarType()) {
+        if (Field->getType()->isReferenceType()) {
+          Diag(ClassDecl->getLocation(), diag::err_uninitialized_member_for_assign)
+            << Context.getTagDeclType(ClassDecl) << 0 << Field->getDeclName();
+          Diag(Field->getLocation(), diag::note_declared_at);
+          Diag(CurrentLocation, diag::note_member_synthesized_at)
+            << Serialization << Context.getTagDeclType(ClassDecl);
+          continue;
+        }
+        const RecordType *SRecordTy
+          = (S->getType().getNonReferenceType())->getAs<RecordType>();
+        CXXRecordDecl *SClassDecl = cast<CXXRecordDecl>(SRecordTy->getDecl());
+        assert(SClassDecl && "SClassDecl cannot fail!");
+        for (CXXRecordDecl::method_iterator Method = SClassDecl->method_begin(),
+             MethodEnd = SClassDecl->method_end();
+             Method != MethodEnd; ++Method) {
+          if((*Method)->getNameInfo().getAsString() == "Append") {
+            CXXScopeSpec CSS;
+            LookupResult MemberLookup(*this, Field->getDeclName(), Loc,
+                                      LookupMemberName);
+            MemberLookup.addDecl(*Field);
+            MemberLookup.resolveKind();
+            ExprResult FRef = BuildMemberReferenceExpr(This, This->getType(),
+                                                       Loc, /*IsArrow=*/true,
+                                                       CSS, SourceLocation(), 0,
+                                                       MemberLookup,
+                                                       /*TemplateArgs=*/nullptr,/*S*/nullptr);
+  
+            // Construct the parameter "const void *" for Append.
+            ExprResult T = CreateBuiltinUnaryOp(Loc, UO_AddrOf, FRef.getAs<Expr>());
+  
+            // Construct the parameter "size_t" for Append.
+            QualType SizeType = Context.getSizeType();
+            llvm::APInt Size(Context.getTypeSize(SizeType),
+                          Context.getTypeSizeInChars(Field->getType()).getQuantity());
+  
+            LookupResult Methodlookup(*this,
+                                      (*Method)->getNameInfo(),LookupOrdinaryName);
+            LookupQualifiedName(Methodlookup, SClassDecl, false);
+            CXXScopeSpec SS;
+            const Type *CanonicalT = Context.getCanonicalType(SRefType.getTypePtr());
+            SS.MakeTrivial(Context,
+                           NestedNameSpecifier::Create(Context, 0,
+                                                       false, CanonicalT),
+                           Loc);
+            ExprResult AppendFunctionRef
+              = BuildMemberReferenceExpr(SRef, SRefType, Loc,
+                                         /*isArrow=*/false, SS,
+                                         /*TemplateKWLoc=*/SourceLocation(),
+                                         /*FirstQualifierInScope=*/0,
+                                         Methodlookup,
+                                         /*TemplateArgs=*/nullptr,/*S*/nullptr);
+            Expr* CallArgs[2]
+              = {IntegerLiteral::Create(Context, Size, SizeType, Loc),T.getAs<Expr>()};
+            MultiExprArg MEArg(CallArgs, 2);
+            ExprResult Call = BuildCallToMemberFunction(/*Scope=*/0,
+                                                        AppendFunctionRef.getAs<Expr>(),
+                                                        Loc, /*CallArgs*/MEArg, /*2,*/ Loc);
+            Statements.push_back(Call/*.getAs<Stmt>()*/.getAs<Expr>());
+          }
+        }
+      }
+
+    } else { // HSA extension check
+
+      const RecordType *HSARecordTy = Field->getType()->getAs<RecordType>();
+      CXXRecordDecl *FieldClassDecl = NULL;
+      NamespaceDecl* FieldNamespaceDecl = NULL;
+      if (RecordTy) {
+        FieldClassDecl = cast<CXXRecordDecl>(RecordTy->getDecl());
+        FieldNamespaceDecl = dyn_cast<NamespaceDecl>(FieldClassDecl->getEnclosingNamespaceContext());
+      }
+
+      // Call __cxxamp_serialize in the following cases:
+      //   a) the field's class lives in the Concurrency namespace
+      //   b) the field is itself a lambda/class/union (RecordType)
+      // For all other classes, push the pointer directly as a kernel argument.
+      if (HSARecordTy &&
+          ( (FieldNamespaceDecl && FieldNamespaceDecl->getName() == "Concurrency") ||
+            (FieldClassDecl) ) ) {
+        CXXRecordDecl *FieldClassDecl = cast<CXXRecordDecl>(HSARecordTy->getDecl());
+        for (CXXRecordDecl::method_iterator Method = FieldClassDecl->method_begin(),
+             MethodEnd = FieldClassDecl->method_end();
+             Method != MethodEnd; ++Method) {
+          if((*Method)->getNameInfo().getAsString() == "__cxxamp_serialize") {
+            // Found __cxxamp_serialize; build a call to it for this field.
+            CXXScopeSpec CSS;
+            LookupResult MemberLookup(*this, Field->getDeclName(), Loc,
+                                      LookupMemberName);
+            MemberLookup.addDecl(*Field);
+            MemberLookup.resolveKind();
+            ExprResult FRef = BuildMemberReferenceExpr(This, This->getType(),
+                                                       Loc, /*IsArrow=*/true,
+                                                       CSS, SourceLocation(), 0,
+                                                       MemberLookup,
+                                                       /*TemplateArgs=*/nullptr,/*S*/nullptr);
+            LookupResult Methodlookup(*this,
+                                      (*Method)->getNameInfo(),LookupOrdinaryName);
+            LookupQualifiedName(Methodlookup, FieldClassDecl, false);
+            CXXScopeSpec SS;
+            const Type *CanonicalT = Context.getCanonicalType(FieldType.getTypePtr());
+            SS.MakeTrivial(Context,
+                           NestedNameSpecifier::Create(Context, 0,
+                                                       false, CanonicalT),
+                          Loc);
+            ExprResult SerializeFunctionRef
+              = BuildMemberReferenceExpr(FRef.get(), FieldType, Loc,
+                                         /*isArrow=*/false, SS,
+                                         /*TemplateKWLoc=*/SourceLocation(),
+                                         /*FirstQualifierInScope=*/0,
+                                         Methodlookup,
+                                         /*TemplateArgs=*/nullptr,/*S*/nullptr);
+            assert(!SerializeFunctionRef.isInvalid() &&
+                   "SerializeFunctionRef cannot fail");
+            MultiExprArg MEArg(&SRef, 1);
+            ExprResult Call = BuildCallToMemberFunction(/*Scope=*/0,
+                                                        SerializeFunctionRef.getAs<Expr>(),
+                                                        Loc, /*&SRef*/MEArg, /*1,*/ Loc);
+            StmtResult SerializeResult = Call.getAs<Stmt>();
+            Statements.push_back(SerializeResult.getAs<Expr>());
+          }
+        }
+      } else {
+        // HSA capture by reference
+        // use AppendPtr to push pointer to the object
+        const RecordType *SRecordTy
+          = (S->getType().getNonReferenceType())->getAs<RecordType>();
+
+        CXXRecordDecl *SClassDecl = cast<CXXRecordDecl>(SRecordTy->getDecl());
+        assert(SClassDecl && "SClassDecl cannot fail!");
+
+        std::string AppendFuncName;
+        if (Field->getType()->isReferenceType()) {
+          AppendFuncName = "AppendPtr";
+        } else {
+          AppendFuncName = "Append";
+        }
+
+        for (CXXRecordDecl::method_iterator Method = SClassDecl->method_begin(),
+             MethodEnd = SClassDecl->method_end();
+             Method != MethodEnd; ++Method) {
+          if((*Method)->getNameInfo().getAsString() == AppendFuncName) {
+            CXXScopeSpec CSS;
+            LookupResult MemberLookup(*this, Field->getDeclName(), Loc,
+                                      LookupMemberName);
+            MemberLookup.addDecl(*Field);
+            MemberLookup.resolveKind();
+            ExprResult FRef = BuildMemberReferenceExpr(This, This->getType(),
+                                                       Loc, /*IsArrow=*/true,
+                                                       CSS, SourceLocation(), 0,
+                                                       MemberLookup, 
+                                                       /*TemplateArgs=*/nullptr,/*S*/nullptr);
+
+            // Construct the parameter "const void *" for Append.
+            ExprResult T = CreateBuiltinUnaryOp(Loc, UO_AddrOf, FRef.getAs<Expr>());
+
+            // Construct the parameter "size_t" for Append.
+            QualType SizeType = Context.getSizeType();
+            llvm::APInt Size(Context.getTypeSize(SizeType),
+                          Context.getTypeSizeInChars(Field->getType()).getQuantity());
+
+            LookupResult Methodlookup(*this,
+                                      (*Method)->getNameInfo(),LookupOrdinaryName);
+            LookupQualifiedName(Methodlookup, SClassDecl, false);
+            CXXScopeSpec SS;
+            const Type *CanonicalT = Context.getCanonicalType(SRefType.getTypePtr());
+            SS.MakeTrivial(Context,
+                           NestedNameSpecifier::Create(Context, 0,
+                                                       false, CanonicalT),
+                           Loc);
+            ExprResult AppendFunctionRef
+              = BuildMemberReferenceExpr(SRef, SRefType, Loc,
+                                         /*isArrow=*/false, SS,
+                                         /*TemplateKWLoc=*/SourceLocation(),
+                                         /*FirstQualifierInScope=*/0,
+                                         Methodlookup,
+                                         /*TemplateArgs=*/nullptr,/*S*/nullptr);
+            Expr* CallArgs[2]
+              = {IntegerLiteral::Create(Context, Size, SizeType, Loc),T.getAs<Expr>()};
+            MultiExprArg MEArg(CallArgs, 2);
+            ExprResult Call = BuildCallToMemberFunction(/*Scope=*/0,
+                                                        AppendFunctionRef.getAs<Expr>(),
+                                                        Loc, /*CallArgs*/MEArg, /*2,*/ Loc);
+            Statements.push_back(Call/*.getAs<Stmt>()*/.getAs<Expr>());
+          }
+        }
+      }
+
+    } // HSA extension check
+
+  }
+  StmtResult Body;
+  {
+    CompoundScopeRAII CompoundScope(*this);
+    Body = ActOnCompoundStmt(Loc, Loc, (Statements),
+                             /*isStmtExpr=*/false);
+    assert(!Body.isInvalid() && "Compound statement creation cannot fail");
+  }
+  Serialization->setBody(Body.getAs<Stmt>());
+
+  if (ASTMutationListener *L = getASTMutationListener()) {
+    L->CompletedImplicitDefinition(Serialization);
+  }
+}
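+
+// For reference, the serializer body synthesized above has roughly this shape (a
+// hypothetical sketch; 'Serializer', 'pod_field', 'obj_field', 'ref_field' are invented):
+//   void Cls::__cxxamp_serialize(Serializer &s) {
+//     s.Append(sizeof(this->pod_field), &this->pod_field);    // plain data members
+//     this->obj_field.__cxxamp_serialize(s);                  // serializable members
+//     s.AppendPtr(sizeof(this->ref_field), &this->ref_field); // reference members (HSA)
+//   }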
+
+Sema::ImplicitExceptionSpecification
+Sema::ComputeDefaultedMoveAssignmentExceptionSpec(CXXMethodDecl *MD) {
+  CXXRecordDecl *ClassDecl = MD->getParent();
+
+  ImplicitExceptionSpecification ExceptSpec(*this);
+  if (ClassDecl->isInvalidDecl())
+    return ExceptSpec;
+
+  // C++0x [except.spec]p14:
+  //   An implicitly declared special member function (Clause 12) shall have an 
+  //   exception-specification. [...]
+
+  // It is unspecified whether an implicit move assignment operator attempts
+  // to deduplicate calls to assignment operators of virtual bases. As such,
+  // this exception specification is effectively unspecified.
+  // Based on a similar decision made for constness in C++0x, we're erring on
+  // the side of assuming such calls to be made regardless of whether they
+  // actually happen.
+  // Note that a move constructor is not implicitly declared when there are
+  // virtual bases, but it can still be user-declared and explicitly defaulted.
+  for (const auto &Base : ClassDecl->bases()) {
+    if (Base.isVirtual())
+      continue;
+
+    CXXRecordDecl *BaseClassDecl
+      = cast<CXXRecordDecl>(Base.getType()->getAs<RecordType>()->getDecl());
+    if (CXXMethodDecl *MoveAssign = LookupMovingAssignment(BaseClassDecl,
+                                                           0, false, 0))
+      ExceptSpec.CalledDecl(Base.getBeginLoc(), MoveAssign);
+  }
+
+  for (const auto &Base : ClassDecl->vbases()) {
+    CXXRecordDecl *BaseClassDecl
+      = cast<CXXRecordDecl>(Base.getType()->getAs<RecordType>()->getDecl());
+    if (CXXMethodDecl *MoveAssign = LookupMovingAssignment(BaseClassDecl,
+                                                           0, false, 0))
+      ExceptSpec.CalledDecl(Base.getBeginLoc(), MoveAssign);
+  }
+
+  for (const auto *Field : ClassDecl->fields()) {
+    QualType FieldType = Context.getBaseElementType(Field->getType());
+    if (CXXRecordDecl *FieldClassDecl = FieldType->getAsCXXRecordDecl()) {
+      if (CXXMethodDecl *MoveAssign =
+              LookupMovingAssignment(FieldClassDecl,
+                                     FieldType.getCVRQualifiers(),
+                                     false, 0))
+        ExceptSpec.CalledDecl(Field->getLocation(), MoveAssign);
+    }
+  }
+
+  return ExceptSpec;
+}
+
 CXXMethodDecl *Sema::DeclareImplicitMoveAssignment(CXXRecordDecl *ClassDecl) {
   assert(ClassDecl->needsImplicitMoveAssignment());
 
diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp
index d8869ffe94..e9c148e460 100644
--- a/lib/Sema/SemaExpr.cpp
+++ b/lib/Sema/SemaExpr.cpp
@@ -3312,6 +3312,61 @@ ExprResult Sema::ActOnIntegerConstant(SourceLocation Loc, uint64_t Val) {
                                 Context.IntTy, Loc);
 }
 
+void DiagnoseCXXAMPFloatingLiteral(Sema &S, NumericLiteralParser &Literal,
+                                  QualType Ty, SourceLocation Loc) {
+  if(!S.IsInAMPRestricted() || !Literal.isFloatingLiteral())
+    return;
+
+  const llvm::fltSemantics &Format = S.Context.getFloatTypeSemantics(Ty);
+
+  using llvm::APFloat;
+  APFloat Val(Format);
+
+  APFloat::opStatus result = Literal.GetFloatValue(Val);
+
+  // Diagnose when the value rounds to the largest finite number with an inexact result
+  if((result == APFloat::opInexact) && Val.isNormal()) {
+    bool BecomeLMData = Val.compare(APFloat::getLargest(Format)) == APFloat::cmpEqual ||
+      Val.compare(APFloat::getLargest(Format, true)) == APFloat::cmpEqual;
+
+    // Though Val compares 'equal' to the positive/negative largest value, check whether
+    // it is actually less than either of them, since the result is inexact, e.g.
+    //
+    //  "double d100 = 1.7976931348623158e+308;"  // OK, near but less than the positive largest
+    //  "double d113 = -1.7976931348623158e+308;" // OK, near but less than the negative largest
+    //  "float f100 = 3.402823466e+38f;"          // OK, near but less than the positive largest
+    //  "float f113 = -3.402823466e+38f;"         // OK, near but less than the negative largest
+    //  "float f = 3.402823467e+38f;"             // Error, near but greater than the positive largest
+    if(BecomeLMData) {
+      // TODO: Check the fractional coefficient, since the value is already in normalized form
+      #if 0
+      S.Diag(Loc, diag::err_amp_constant_too_big);
+      #endif
+    }
+  }
+  // Diagnose overflow, e.g.
+  //  "double d = 1.7976931348623159e+308;" // Error, near but greater than the positive largest
+  if (result & APFloat::opOverflow) {
+    SmallString<20> buffer;
+    APFloat::getLargest(Format).toString(buffer);
+
+    S.Diag(Loc, diag::err_amp_float_overflow)
+    << Ty
+    << StringRef(buffer.data(), buffer.size());
+  }
+}
+
+void DiagnoseCXXAMPIntergerLiteral(Sema &S, NumericLiteralParser &Literal,
+                                  QualType Ty, SourceLocation Loc, unsigned MaxWidth) {
+  // Integer literals only; floating literals are handled by DiagnoseCXXAMPFloatingLiteral.
+  if(!S.IsInAMPRestricted() || Literal.isFloatingLiteral())
+    return;
+
+  llvm::APInt ResultVal(MaxWidth, 0);
+  if (Literal.GetIntegerValue(ResultVal))
+    S.Diag(Loc, diag::err_amp_constant_too_big);
+
+}
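+
+// Hypothetical illustration: in amp-restricted code an integer literal that does not
+// fit in MaxWidth bits (the target's long long width at the call sites) is reported
+// via err_amp_constant_too_big, e.g.
+//   void f() restrict(amp) {
+//     unsigned long long v = 0xFFFFFFFFFFFFFFFFF;  // 17 hex digits = 68 bits: too big
+//   }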
+
 static Expr *BuildFloatingLiteral(Sema &S, NumericLiteralParser &Literal,
                                   QualType Ty, SourceLocation Loc) {
   const llvm::fltSemantics &Format = S.Context.getFloatTypeSemantics(Ty);
@@ -3321,6 +3376,12 @@ static Expr *BuildFloatingLiteral(Sema &S, NumericLiteralParser &Literal,
 
   APFloat::opStatus result = Literal.GetFloatValue(Val);
 
+  // C++AMP
+  // Note that we suppress the normal diagnostic
+  if(S.getLangOpts().CPlusPlusAMP && result != APFloat::opOK) {
+    DiagnoseCXXAMPFloatingLiteral(S, Literal, Ty, Loc);
+  }
+
   // Overflow is always an error, but underflow is only an error if
   // we underflowed to zero (APFloat reports denormals as underflow).
   if ((result & APFloat::opOverflow) ||
@@ -3335,7 +3396,8 @@ static Expr *BuildFloatingLiteral(Sema &S, NumericLiteralParser &Literal,
       APFloat::getSmallest(Format).toString(buffer);
     }
 
-    S.Diag(Loc, diagnostic)
+    if(!S.getLangOpts().CPlusPlusAMP)
+      S.Diag(Loc, diagnostic)
       << Ty
       << StringRef(buffer.data(), buffer.size());
   }
@@ -3446,9 +3508,16 @@ ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
         Lit = BuildFloatingLiteral(*this, Literal, CookedTy, Tok.getLocation());
       } else {
         llvm::APInt ResultVal(Context.getTargetInfo().getLongLongWidth(), 0);
-        if (Literal.GetIntegerValue(ResultVal))
-          Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
-              << /* Unsigned */ 1;
+        if (Literal.GetIntegerValue(ResultVal)) {
+          // C++AMP
+          // Note that this suppresses the normal diagnostic
+          if(getLangOpts().CPlusPlusAMP)
+            DiagnoseCXXAMPIntergerLiteral(*this, Literal, CookedTy, Tok.getLocation(),
+              Context.getTargetInfo().getLongLongWidth());
+          else
+            Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
+                << /* Unsigned */ 1;
+        }
         Lit = IntegerLiteral::Create(Context, ResultVal, CookedTy,
                                      Tok.getLocation());
       }
@@ -3593,8 +3662,12 @@ ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
 
     if (Literal.GetIntegerValue(ResultVal)) {
       // If this value didn't fit into uintmax_t, error and force to ull.
-      Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
-          << /* Unsigned */ 1;
+      // C++AMP
+      if(getLangOpts().CPlusPlusAMP)
+        DiagnoseCXXAMPIntergerLiteral(*this, Literal, Ty, Tok.getLocation(), MaxWidth);
+      else
+        Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
+            << /* Unsigned */ 1;
       Ty = Context.UnsignedLongLongTy;
       assert(Context.getTypeSize(Ty) == ResultVal.getBitWidth() &&
              "long long is not intmax_t?");
@@ -5736,6 +5809,66 @@ ExprResult Sema::ActOnConvertVectorExpr(Expr *E, ParsedType ParsedDestTy,
   return SemaConvertVectorExpr(E, TInfo, BuiltinLoc, RParenLoc);
 }
 
+void Sema::DiagnoseCXXAMPMethodCallExpr(SourceLocation LParenLoc,
+                                  CXXMethodDecl *Callee) {
+  if(!Callee || Callee->isConstexpr() || Callee->getBuiltinID() != 0u)
+    return;
+
+  FunctionDecl* Caller = this->getCurFunctionDecl();
+  LambdaScopeInfo* LambdaInfo = this->getCurLambda();
+  bool CallerAMP = (LambdaInfo && LambdaInfo->CallOperator)?
+    LambdaInfo->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>():
+    (Caller?Caller->hasAttr<CXXAMPRestrictAMPAttr>():false);
+  bool CallerCPU= (LambdaInfo && LambdaInfo->CallOperator)?
+    LambdaInfo->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>():
+    (Caller?Caller->hasAttr<CXXAMPRestrictCPUAttr>():false);
+  bool CalleeAMP = Callee->hasAttr<CXXAMPRestrictAMPAttr>();
+  bool CalleeCPU = Callee->hasAttr<CXXAMPRestrictCPUAttr>();
+
+  // Logic for auto-compile-for-accelerator:
+  // In device path, if auto-compile-for-accelerator flag is on,
+  // and caller method has GPU attribute (CXXAMPRestrictAMPAttr),
+  // and callee method doesn't have GPU attribute (CXXAMPRestrictAMPAttr),
+  // then annotate it with one, and recalculate related boolean flags
+  if (getLangOpts().DevicePath && getLangOpts().AutoCompileForAccelerator) {
+    if (CallerAMP && !CalleeAMP) {
+      //llvm::errs() << "add [[hc]] to callee: " << Callee->getName() << "\n";
+      Callee->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Callee->getLocation(), Context, 0));
+      CalleeAMP = Callee->hasAttr<CXXAMPRestrictAMPAttr>();
+    }
+  }
+
+  // Case by case
+  if((LambdaInfo && LambdaInfo->CallOperator) && !getLangOpts().AMPCPU) {
+    // caller: __GPU, lambda; callee: non __GPU, lambda
+    // int i = 0;
+    // auto l2 = []() { i = 1; };
+    // auto l = []() __GPU {
+    //    l2();    // Error
+    //  };
+    if(getLangOpts().DevicePath && Callee->getParent() && Callee->getParent()->isLambda() &&
+      (CallerAMP && CallerCPU) && (!CalleeAMP && !CalleeCPU) )
+      // FIXME: Need a mangled lambda name as '<lambda_xxxxID> operator()'
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString()
+        << LambdaInfo->CallOperator->getQualifiedNameAsString();
+  } else if(Caller && ! (LambdaInfo && LambdaInfo->CallOperator) && !getLangOpts().AMPCPU) {
+    // caller: __GPU, global; callee: non __GPU, class static
+    //    class C1 {
+    //      public:
+    //        static void foo(int &flag) {flag = 1;}
+    //    };
+    //    bool test() __GPU {
+    //      int flag = 0;
+    //      C1::foo(flag);    // Error
+    //    }
+    if(getLangOpts().DevicePath && Callee->isStatic() && (CallerAMP && CallerCPU) &&
+      (!CalleeAMP && !CalleeCPU) )
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString() << Caller->getNameAsString();
+  }
+}
+
 /// BuildResolvedCallExpr - Build a call to a resolved expression,
 /// i.e. an expression not of \p OverloadTy.  The expression should
 /// unary-convert to an expression of function-pointer or
@@ -5958,6 +6091,14 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
     }
   }
 
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP) {
+    if (CXXMethodDecl *Method = dyn_cast_or_null<CXXMethodDecl>(FDecl))
+      DiagnoseCXXAMPMethodCallExpr(LParenLoc, Method);
+    else
+      DiagnoseCXXAMPOverloadedCallExpr(LParenLoc, FDecl);
+  }
+
   if (CXXMethodDecl *Method = dyn_cast_or_null<CXXMethodDecl>(FDecl))
     if (!Method->isStatic())
       return ExprError(Diag(LParenLoc, diag::err_member_call_without_object)
@@ -10145,6 +10286,72 @@ static ValueDecl *getCompareDecl(Expr *E) {
   return nullptr;
 }
 
+// The following are not allowed in amp-restricted code:
+//     Recursion.
+//     Variables declared with the volatile keyword.
+//     Virtual functions.
+//     Pointers to functions.
+//     Pointers to member functions.
+//     Pointers in structures.
+//     Pointers to pointers.
+//     goto statements.
+//     Labeled statements.
+//     try, catch, or throw statements.
+//     Global variables.
+//     Static variables. Use the tile_static keyword instead.
+//>>>>>>>>>>>>>>>>
+//        Referring to [2.4.3.2] Primary Expressions (C++11 5.1):
+//          An identifier or qualified identifier that refers to an object shall refer only to:
+//          (1) a parameter to the function, or
+//          (2) a local variable declared at a block scope within the function, or
+//          (3) a non-static member of the class of which this function is a member, or
+//          (4) a static const type that can be reduced to an integer literal and is only used as an rvalue, or
+//          (5) a global const type that can be reduced to an integer literal and is only used as an rvalue, or
+//          (6) a captured variable in a lambda expression.
+//<<<<<<<<<<<<<<<<
+//     dynamic_cast casts.
+//     The typeid operator.
+//     asm declarations.
+//     Varargs.
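+//
+// Hypothetical illustration of rules (4)/(5) versus a plain global (the names are
+// invented; the check below issues err_amp_using_static_or_global_variables):
+//   int g;                    // global, non-const
+//   static const int k = 42;  // reducible to an integer literal
+//   void f() restrict(amp) {
+//     int a = k + 1;          // OK: k is used only as an rvalue (rule 4)
+//     int b = g;              // Error: global variable in amp-restricted code
+//   }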
+void Sema::DiagnoseCXXAMPExpr(Expr* Stripped, ExprResult &HS, bool DiagnoseWhenStatic) {
+  if(IsInAMPRestricted()) {
+    if (DeclRefExpr* DRL = dyn_cast<DeclRefExpr>(Stripped))
+      if (VarDecl *var = dyn_cast<VarDecl>(DRL->getDecl())) {
+        QualType Type = var->getType();
+        if(!var->hasLocalStorage() || var->isStaticDataMember()) {
+          if (var->hasAttr<HCCTileStaticAttr>()) {
+             // Skip tile_static
+          } else if(Type.isConstQualified() /*&& LHS.get()->isRValue()*/) {
+            // Skip a static const type and global const type that is rvalue
+            if((var->getStorageClass() == SC_Static &&
+              isa<UnaryOperator>(HS.get()->IgnoreParens()) &&
+              cast<UnaryOperator>(HS.get()->IgnoreParens())->getOpcode()== UO_AddrOf) ||
+              DiagnoseWhenStatic) {
+              // Still diagnose a pointer to a static and/or member, e.g.
+              //        static const int flagxxx = 2;
+              //        void foo(bool set) __GPU
+              //       {
+              //          int n = flagxxx + 3;
+              //          const int  *p = &flagxxx;        // error
+              //       }
+              // Or, when HS is not a UnaryOperator, the manually-set flag
+              // 'DiagnoseWhenStatic' decides.
+              //
+              Diag(HS.get()->getBeginLoc(), diag::err_amp_using_static_or_global_variables)
+                << var->getName();
+            }
+          } else
+            if (getLangOpts().HSAExtension || getLangOpts().AMPCPU) {
+              ; // HSA extension
+            } else {
+              Diag(HS.get()->getBeginLoc(), diag::err_amp_using_static_or_global_variables)
+                << var->getName();
+            }
+        }
+      }
+    }
+}
+
 /// Diagnose some forms of syntactically-obvious tautological comparison.
 static void diagnoseTautologicalComparison(Sema &S, SourceLocation Loc,
                                            Expr *LHS, Expr *RHS,
@@ -10444,6 +10651,7 @@ static QualType checkArithmeticOrEnumeralCompare(Sema &S, ExprResult &LHS,
 
   // The result of comparisons is 'bool' in C++, 'int' in C.
   return S.Context.getLogicalOperationType();
+
 }
 
 // C99 6.5.8, C++ [expr.rel]
@@ -10494,6 +10702,17 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS,
       (RHSType->isArithmeticType() || RHSType->isEnumeralType()))
     return checkArithmeticOrEnumeralCompare(*this, LHS, RHS, Loc, Opc);
 
+  // C++AMP
+  if (getLangOpts().CPlusPlusAMP ) {
+    Expr *LHSStripped = LHS.get()->IgnoreParenImpCasts();
+    Expr *RHSStripped = RHS.get()->IgnoreParenImpCasts();
+    DiagnoseCXXAMPExpr(LHSStripped, LHS);
+    DiagnoseCXXAMPExpr(RHSStripped, RHS);
+  }
+
+  // The result of comparisons is 'bool' in C++, 'int' in C.
+  QualType ResultTy = Context.getLogicalOperationType();
+
   const Expr::NullPointerConstantKind LHSNullKind =
       LHS.get()->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull);
   const Expr::NullPointerConstantKind RHSNullKind =
@@ -11651,6 +11870,15 @@ QualType Sema::CheckAssignmentOperands(Expr *LHSExpr, ExprResult &RHS,
 
   CheckForNullPointerDereference(*this, LHSExpr);
 
+  // C++AMP
+  //    Primary Expression: " const int *p = &flag;
+  //    where flag is a 'static int'
+  if(getLangOpts().CPlusPlusAMP) {
+    ExprResult ER = LHSExpr;
+    DiagnoseCXXAMPExpr(LHSExpr->IgnoreParenImpCasts(), ER);
+    DiagnoseCXXAMPExpr(RHS.get()->IgnoreParenImpCasts(), RHS);
+  }
+
   // C99 6.5.16p3: The type of an assignment expression is the type of the
   // left operand unless the left operand has qualified type, in which case
   // it is the unqualified version of the type of the left operand.
@@ -11839,6 +12067,18 @@ static QualType CheckIncrementDecrementOperand(Sema &S, Expr *Op,
   // Now make sure the operand is a modifiable lvalue.
   if (CheckForModifiableLvalue(Op, OpLoc, S))
     return QualType();
+
+  // C++AMP [2.4.3.7]
+  if (S.getLangOpts().CPlusPlusAMP && S.IsInAMPRestricted() && !S.getLangOpts().HSAExtension) {
+    if(ResType->isPointerType() && ResType->getPointeeType()->isBooleanType()) {
+      // Prefix or postfix does not matter; we only care about the opcode string
+      StringRef OpcString = (IsInc)?UnaryOperator::getOpcodeStr(UnaryOperatorKind(UO_PreInc))
+             :UnaryOperator::getOpcodeStr(UnaryOperatorKind(UO_PreDec));
+      S.Diag(Op->getExprLoc(), diag::err_amp_arithmetic_operation_on_pointer_to_bool)
+         << OpcString;
+    }
+  }
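+  //
+  // Hypothetical illustration of the check above (the function below is invented):
+  //   void f(bool *p) restrict(amp) {
+  //     ++p;  // error: arithmetic operation on a pointer to bool
+  //   }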
+
   // In C++, a prefix increment is the same type as the operand. Otherwise
   // (in C or with postfix), the increment is the unqualified type of the
   // operand.
@@ -12094,6 +12334,11 @@ QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
           !getLangOpts().CPlusPlus) {
         AddressOfError = AO_Register_Variable;
       }
+
+      // C++AMP
+      if(getLangOpts().CPlusPlusAMP) {
+        DiagnoseCXXAMPExpr(op, OrigOp, true);
+      }
     } else if (isa<MSPropertyDecl>(dcl)) {
       AddressOfError = AO_Property_Expansion;
     } else if (isa<FunctionTemplateDecl>(dcl)) {
@@ -12677,6 +12922,19 @@ ExprResult Sema::CreateBuiltinBinOp(SourceLocation OpLoc,
   CheckArrayAccess(LHS.get());
   CheckArrayAccess(RHS.get());
 
+  // C++AMP [2.4.3.7]
+  // FIXME: Should check whether this pointer arithmetic is actually allowed
+  if (getLangOpts().CPlusPlusAMP && IsInAMPRestricted() && !getLangOpts().HSAExtension) {
+    QualType L = LHS.get()->getType();
+    QualType R = RHS.get()->getType();
+    if(L->isPointerType() && L->getPointeeType()->isBooleanType())
+      Diag(LHS.get()->getExprLoc(), diag::err_amp_arithmetic_operation_on_pointer_to_bool)
+         << BinaryOperator::getOpcodeStr(Opc);
+    if(R->isPointerType() && R->getPointeeType()->isBooleanType())
+      Diag(RHS.get()->getExprLoc(), diag::err_amp_arithmetic_operation_on_pointer_to_bool)
+         << BinaryOperator::getOpcodeStr(Opc);
+  }
+
   if (const ObjCIsaExpr *OISA = dyn_cast<ObjCIsaExpr>(LHS.get()->IgnoreParenCasts())) {
     NamedDecl *ObjectSetClass = LookupSingleName(TUScope,
                                                  &Context.Idents.get("object_setClass"),
@@ -15008,6 +15266,40 @@ static bool isImplicitlyDefinableConstexprFunction(FunctionDecl *Func) {
          (Func->isImplicitlyInstantiable() || (MD && !MD->isUserProvided()));
 }
 
+namespace
+{   // TODO: potentially temporary.
+  inline
+  bool isHIPFunctor(const CXXMethodDecl* f)
+  {
+    static constexpr const char prefix[] = "HIP_kernel_functor_name_begin";
+
+    return f->getOverloadedOperator() == OO_Call &&
+      f->getParent()->getName().find(prefix) != StringRef::npos;
+  }
+
+  inline
+  Stmt* findCall(Stmt* x)
+  {
+    if (!x || isa<CallExpr>(x)) return x;
+
+    for (auto&& y : x->children()) {
+      auto r = findCall(y);
+      if (r) return r;
+    }
+
+    return nullptr;
+  }
+
+  inline
+  void addCalleeAttributesToFunctor(
+    const FunctionDecl* callee, FunctionDecl* functorCallOperator)
+  {
+    functorCallOperator->dropAttrs();
+    functorCallOperator->setAttrs(callee->getAttrs());
+    functorCallOperator->dropAttr<AnnotateAttr>();
+  }
+}
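+
+// Rough sketch of the shape isHIPFunctor() matches (names purely illustrative;
+// only the "HIP_kernel_functor_name_begin" infix in the class name is significant):
+//   struct HIP_kernel_functor_name_begin_saxpy_HIP_kernel_functor_name_end {
+//     void operator()(/*...*/) const { kernel_body(/*...*/); }
+//   };
+// When such a functor's operator() is marked referenced below, the attributes of the
+// callee found by findCall() are copied onto the call operator.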
+
 /// Mark a function referenced, and check whether it is odr-used
 /// (C++ [basic.def.odr]p2, C99 6.9p3)
 void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
@@ -15107,7 +15399,22 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
         }
       } else if (Constructor->getInheritedConstructor()) {
         DefineInheritingConstructor(Loc, Constructor);
-      }
+      } else if (LangOpts.CPlusPlusAMP) {
+        if (Constructor->hasAttr<CXXAMPRestrictAMPAttr>() &&
+            Constructor->hasAttr<AnnotateAttr>() &&
+            Constructor->getAttr<AnnotateAttr>()->getAnnotation() ==
+              "auto_deserialize") {
+#if 0
+          DeclarationNameInfo AmpFunInfo = Func->getNameInfo();
+          std::string MethodFunName = AmpFunInfo.getAsString();
+          llvm::errs() << "Defining Function = " << MethodFunName << "\n";
+#endif
+
+          // do not generate the deserializer in case there are previous errors
+          if (!this->getDiagnostics().hasErrorOccurred())
+            DefineAmpGpuDeSerializeFunction(Loc, Constructor);
+        }
+      } // end of if CPlusPlusAMP
     } else if (CXXDestructorDecl *Destructor =
                    dyn_cast<CXXDestructorDecl>(Func)) {
       Destructor = cast<CXXDestructorDecl>(Destructor->getFirstDecl());
@@ -15119,7 +15426,21 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
       if (Destructor->isVirtual() && getLangOpts().AppleKext)
         MarkVTableUsed(Loc, Destructor->getParent());
     } else if (CXXMethodDecl *MethodDecl = dyn_cast<CXXMethodDecl>(Func)) {
-      if (MethodDecl->isOverloadedOperator() &&
+      // C++AMP
+      DeclarationNameInfo AmpFunInfo = Func->getNameInfo();
+      std::string MethodFunName = AmpFunInfo.getAsString();
+      std::string AmpFunName = "__cxxamp_serialize";
+      if (AmpFunName == MethodFunName) {
+        DefineAmpCpuSerializeFunction(Loc, MethodDecl);
+      } else if (MethodFunName == "__cxxamp_trampoline" ||
+                 MethodFunName == "__cxxamp_trampoline_name") {
+        DefineAMPTrampoline(Loc, MethodDecl);
+      } else if (MethodDecl->isOverloadedOperator() &&
+                 isHIPFunctor(MethodDecl)) {
+        auto t = findCall(MethodDecl->getBody());
+        if (t)
+          addCalleeAttributesToFunctor(cast<CallExpr>(t)->getDirectCallee(), MethodDecl);
+      } else if (MethodDecl->isOverloadedOperator() &&
           MethodDecl->getOverloadedOperator() == OO_Equal) {
         MethodDecl = cast<CXXMethodDecl>(MethodDecl->getFirstDecl());
         if (MethodDecl->isDefaulted() && !MethodDecl->isDeleted()) {
diff --git a/lib/Sema/SemaExprCXX.cpp b/lib/Sema/SemaExprCXX.cpp
index c1c08969c7..c91560637f 100644
--- a/lib/Sema/SemaExprCXX.cpp
+++ b/lib/Sema/SemaExprCXX.cpp
@@ -5746,6 +5746,12 @@ QualType Sema::CXXCheckConditionalOperands(ExprResult &Cond, ExprResult &LHS,
   VK = VK_RValue;
   OK = OK_Ordinary;
 
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP) {
+    DiagnoseCXXAMPExpr(LHS.get()->IgnoreParenImpCasts(), LHS);
+    DiagnoseCXXAMPExpr(RHS.get()->IgnoreParenImpCasts(), RHS);
+  }
+
   // Either of the arguments dependent?
   if (LHS.get()->isTypeDependent() || RHS.get()->isTypeDependent())
     return Context.DependentTy;
diff --git a/lib/Sema/SemaExprMember.cpp b/lib/Sema/SemaExprMember.cpp
index c856e37e99..36ca0a9e14 100644
--- a/lib/Sema/SemaExprMember.cpp
+++ b/lib/Sema/SemaExprMember.cpp
@@ -1136,6 +1136,79 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
   }
 
   if (CXXMethodDecl *MemberFn = dyn_cast<CXXMethodDecl>(MemberDecl)) {
+    // C++AMP
+    if(getLangOpts().CPlusPlusAMP) {
+      bool ParentCPU = false;
+      bool ParentAMP = false;
+      std::string name;
+      if(getCurFunctionDecl()) {
+        ParentCPU = getCurFunctionDecl()->hasAttr<CXXAMPRestrictCPUAttr>();
+        ParentAMP = getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>();
+        name = getCurFunctionDecl()->getNameAsString();
+      }
+      if(ParentCPU || ParentAMP) {
+        // A lambda, if any, will inherit AMP restrictions from its parent RECURSIVELY
+      } else if(getCurLambda() && getCurLambda()->CallOperator) {
+        // Suppress the restrictions
+        ParentCPU = getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>();
+        ParentAMP = getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>();
+        name = getCurLambda()->CallOperator->getNameAsString();
+      }
+      // Check local
+      // FIXME: the class should inherit AMP restrictions from its parent RECURSIVELY,
+      //        especially for local classes. Missing that can cause unexpected errors.
+      bool MemberAMP = MemberFn->hasAttr<CXXAMPRestrictAMPAttr>();
+      bool MemberCPU = MemberFn->hasAttr<CXXAMPRestrictCPUAttr>() || !MemberAMP;
+      const CXXRecordDecl * RDecl = MemberFn->getParent();
+
+      // Logic for auto-compile-for-accelerator:
+      // In both CPU and device path, if auto-compile-for-accelerator flag is on,
+      // and caller has GPU attribute (CXXAMPRestrictAMPAttr),
+      // and callee method doesn't have GPU attribute (CXXAMPRestrictAMPAttr),
+      // and callee is not within C++ or HCC default library,
+      // then annotate callee with one, and recalculate related boolean flags
+
+      if (getLangOpts().AutoCompileForAccelerator) {
+        if (ParentAMP && !MemberAMP) {
+          std::string QualifiedName = MemberDecl->getQualifiedNameAsString();
+          // Skip the library's own implementations and other unwanted names
+          if(QualifiedName.find("Kalmar::")!=std::string::npos ||
+             QualifiedName.find("hc::")!=std::string::npos ||
+             QualifiedName.find("Concurrency::")!=std::string::npos ||
+             QualifiedName.find("std::")!=std::string::npos ||
+             QualifiedName.find("__cxxamp_serialize")!=std::string::npos ||
+             QualifiedName.find("__cxxamp_trampoline_name")!=std::string::npos) {
+          } else {
+            //llvm::errs() << "add [[hc]] to member: " << MemberFn->getName() << "\n";
+            MemberFn->addAttr(::new (Context) CXXAMPRestrictAMPAttr(MemberFn->getLocation(), Context, 0));
+            MemberAMP = MemberFn->hasAttr<CXXAMPRestrictAMPAttr>();
+            MemberCPU = MemberFn->hasAttr<CXXAMPRestrictCPUAttr>() || !MemberAMP;
+          }
+        }
+      }
+
+      if(RDecl && RDecl->isLocalClass()) {
+        // Do nothing
+      } else if(ParentCPU== MemberCPU && ParentAMP== MemberAMP) {
+        // The function is not overloaded
+      } else if((!MemberCPU &&!MemberAMP && (ParentCPU ||ParentAMP)) ||
+        (!ParentCPU&&!ParentAMP && (!MemberCPU && MemberAMP)) ||
+        (ParentCPU== (!ParentAMP) && MemberCPU == (!MemberAMP) && ParentCPU!=MemberCPU)) {
+        std::string QualifiedName = MemberDecl->getQualifiedNameAsString();
+        // Skip the library's own implementations and other unwanted names
+        if(QualifiedName.find("::accelerator_view")!=std::string::npos ||
+          QualifiedName.find("array_view<")!=std::string::npos ||
+          QualifiedName.find("::accelerator")!=std::string::npos ||
+          QualifiedName.find("Concurrency::")!=std::string::npos ||
+          QualifiedName.find("std::")!=std::string::npos ||
+          QualifiedName.find("__cxxamp_serialize")!=std::string::npos) {
+        } else {
+          Diag(MemberLoc, diag::err_amp_overloaded_member_function)
+               << MemberDecl->getQualifiedNameAsString() << name;
+        }
+      }
+    }
+
     ExprValueKind valueKind;
     QualType type;
     if (MemberFn->isInstance()) {
diff --git a/lib/Sema/SemaInit.cpp b/lib/Sema/SemaInit.cpp
index 60f34775c6..0fc9f79c6e 100644
--- a/lib/Sema/SemaInit.cpp
+++ b/lib/Sema/SemaInit.cpp
@@ -20,6 +20,7 @@
 #include "clang/Sema/Designator.h"
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallString.h"
@@ -3842,6 +3843,709 @@ ResolveConstructorOverload(Sema &S, SourceLocation DeclLoc,
   return CandidateSet.BestViableFunction(S, DeclLoc, Best);
 }
 
+// Specially diagnose dtor case: "no dtor possible thus no default ctor possible"
+//    struct A3_base_1 {
+//        int i;
+//        ~A3_base_1() restrict(cpu) {}
+//    };
+//    class A3_member_1 {
+//    public:
+//        ~A3_member_1() restrict(amp) {}
+//    };
+//    struct A3 : A3_base_1 {
+//        A3_member_1 m1;
+//        // no dtor possible thus no default ctor possible
+//    };
+static void CheckCXXAMPHasPossibleSMF(Sema &S, CXXRecordDecl* RDecl,
+                           bool OrgCPU, bool OrgAMP,
+                           bool NewCPU, bool NewAMP, unsigned DiagID) {
+  if((OrgCPU && OrgAMP) || (NewCPU && NewAMP))
+    return;
+
+  if(OrgCPU || OrgAMP) {
+    if((OrgCPU !=NewCPU) &&(OrgAMP !=NewAMP))
+      S.Diag(RDecl->getInnerLocStart(), DiagID)
+        << RDecl->getName();
+  }
+}
+
+#define INTERSECT_ATTR(OrgCPU, OrgAMP, NewCPU, NewAMP) \
+  if(!(OrgCPU && OrgAMP) && (OrgCPU!= OrgAMP)) { \
+    if(NewCPU && NewAMP) \
+      ; \
+    else if((OrgCPU!=NewCPU) && (OrgAMP!=NewAMP)) { \
+     /* Mutually exclusive restrictions; would diagnose "wrong restrictions" here */\
+     ; \
+    } \
+  } else if (OrgCPU && OrgAMP) { \
+     if(!(NewCPU && NewAMP)) { \
+       OrgCPU = NewCPU; \
+       OrgAMP = NewAMP; \
+    } \
+  } else { \
+     OrgCPU |= NewCPU; \
+     OrgAMP |= NewAMP; \
+  }
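+
+// Hypothetical truth table for INTERSECT_ATTR ('Org' is updated in place; the sets
+// list which of the cpu/amp flags are true):
+//   Org = {cpu,amp}, New = {cpu}     => Org becomes {cpu}  (narrowed by New)
+//   Org = {},        New = {amp}     => Org becomes {amp}  (widened from empty)
+//   Org = {cpu},     New = {cpu,amp} => Org stays {cpu}    (a universal New is a no-op)
+//   Org = {cpu},     New = {amp}     => Org unchanged      (the mutually-exclusive case above)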
+
+// Using locally
+#define SMF_DefaultConstructor 0x1
+#define SMF_CopyConstructor 0x2
+#define SMF_MoveConstructor 0x4
+#define SMF_CopyAssignment 0x8
+#define SMF_MoveAssignment 0x10
+#define SMF_Destructor 0x20
+#define SMF_All 0x3f
+
+void Sema::InheritSMFDtorIntersections(CXXRecordDecl* RDecl,
+                           bool& CPUAttr, bool& AMPAttr,
+                           bool& ParentCPUAttr, bool& ParentAMPAttr) {
+  // Step1
+  // The compiler sets the restrictions of compiler-generated destructors to the
+  // intersection of the restrictions on all of the destructors of the data members
+  // [able to destroy all data members] and all of the base classes' destructors
+  // [able to call all base classes' destructors]. If there are no such destructors,
+  // then all possible restrictions are used [able to destroy in any context].
+  // However, any restriction that would result in an error is not set.
+
+  if(RDecl->getQualifiedNameAsString().find("std::")!=std::string::npos)
+    return;
+
+  // Check the base classes if any
+  if(RDecl->getDefinition()) {
+    RDecl = RDecl->getDefinition();
+    for(CXXRecordDecl::base_class_const_iterator BaseIt = RDecl->bases_begin();
+           BaseIt!=RDecl->bases_end(); BaseIt++) {
+      CXXRecordDecl *BaseRDecl =
+          cast<CXXRecordDecl>(BaseIt->getType()->getAs<RecordType>()->getDecl());
+      if(!BaseRDecl) continue;
+      if(BaseRDecl->getQualifiedNameAsString().find("std::")!=std::string::npos)
+        continue;
+      if(CXXDestructorDecl * BaseDtor = BaseRDecl ->getDestructor()) {
+        if(!BaseDtor->isUserProvided())
+          InheritSMFDtorIntersections(BaseRDecl, CPUAttr, AMPAttr, ParentCPUAttr, ParentAMPAttr);
+        else{
+          CheckCXXAMPHasPossibleSMF(*this, RDecl, CPUAttr, AMPAttr,
+                                          BaseDtor->hasAttr<CXXAMPRestrictCPUAttr>(),
+                                          BaseDtor->hasAttr<CXXAMPRestrictAMPAttr>(),
+                                          diag::err_amp_has_no_default_ctor);
+          INTERSECT_ATTR(CPUAttr, AMPAttr, BaseDtor->hasAttr<CXXAMPRestrictCPUAttr>(),
+                                          BaseDtor->hasAttr<CXXAMPRestrictAMPAttr>());
+        }
+      } else {
+        InheritSMFDtorIntersections(BaseRDecl, CPUAttr, AMPAttr, ParentCPUAttr, ParentAMPAttr);
+      }
+    }
+  }
+
+  // Check the fields if any
+  for (CXXRecordDecl::field_iterator It = RDecl->field_begin(),
+      ItE = RDecl->field_end(); It != ItE; ++It) {
+    const FieldDecl *FD = *It;
+    const RecordType *RT = Context.getBaseElementType(FD->getType())->getAs<RecordType>();
+    if (!RT) {
+      if(ParentAMPAttr) {
+        const Type* Ty = FD->getType().getTypePtrOrNull();
+        // FIXME: The following code might not work, since incompatible scalar types
+        // in a struct/union/class are diagnosed earlier, e.g. in ActOnDeclarator.
+        if(Ty && (Ty->isCharType() ||
+         Ty->isWideCharType() ||
+         Ty->isSpecificBuiltinType(BuiltinType::Short) ||
+         Ty->isSpecificBuiltinType(BuiltinType::LongLong) ||
+         Ty->isSpecificBuiltinType(BuiltinType::LongDouble)))
+          CPUAttr = true;
+      }
+      continue;
+    }
+    CXXRecordDecl *MemberDecl = cast<CXXRecordDecl>(RT->getDecl());
+    InheritSMFDtorIntersections(MemberDecl, CPUAttr, AMPAttr, ParentCPUAttr, ParentAMPAttr);
+  }
+
+  // Check the member methods if any
+  for ( CXXRecordDecl::method_iterator MethodIt = RDecl->method_begin(),
+          MethodItE = RDecl->method_end(); MethodIt != MethodItE; ++MethodIt) {
+    if(dyn_cast<CXXDestructorDecl>(*MethodIt)) {
+      CheckCXXAMPHasPossibleSMF(*this, RDecl, CPUAttr, AMPAttr,
+                                      MethodIt->hasAttr<CXXAMPRestrictCPUAttr>(),
+                                      MethodIt->hasAttr<CXXAMPRestrictAMPAttr>(),
+                                      diag::err_amp_has_no_default_ctor);
+      INTERSECT_ATTR(CPUAttr, AMPAttr, MethodIt->hasAttr<CXXAMPRestrictCPUAttr>(),
+                                        MethodIt->hasAttr<CXXAMPRestrictAMPAttr>());
+    }
+  }
+
+}
+
+void Sema::InheritSMFCtorIntersections(CXXRecordDecl* RDecl,
+                           bool& CPUAttr, bool& AMPAttr,
+                           bool& ParentCPUAttr, bool& ParentAMPAttr, int flag, bool ConstParam) {
+  // Step2
+  // The compiler sets the restrictions of compiler-generated default constructors
+  // to the intersection of the restrictions on all of the default constructors of the member
+  // fields [able to construct all member fields], all of the base classes' default constructors
+  // [able to call all base classes' default constructors], and the destructor of the class
+  // [able to destroy in any context constructed]. However, any restriction that would result
+  // in an error is not set.
+  if(RDecl->getQualifiedNameAsString().find("std::")!=std::string::npos)
+    return;
+
+  // Check the base classes if any
+  if(RDecl->getDefinition()) {
+    RDecl = RDecl->getDefinition();
+    for(CXXRecordDecl::base_class_const_iterator BaseIt = RDecl->bases_begin();
+           BaseIt != RDecl->bases_end(); BaseIt++) {
+      CXXRecordDecl *BaseRDecl =
+          cast<CXXRecordDecl>(BaseIt->getType()->getAs<RecordType>()->getDecl());
+      if(!BaseRDecl) continue;
+      if(BaseRDecl->getQualifiedNameAsString().find("std::")!=std::string::npos)
+        continue;
+      if(BaseRDecl->ctor_end() == BaseRDecl->ctor_begin())
+        InheritSMFCtorIntersections(BaseRDecl, CPUAttr, AMPAttr,
+                       ParentCPUAttr, ParentAMPAttr, flag, ConstParam);
+      else {
+        bool BaseMergedCPU = false;
+        bool BaseMergedAMP = false;
+        bool DoIt = false;
+        unsigned DiagID = 0;  // avoid reading an uninitialized DiagID below
+        for(CXXRecordDecl::ctor_iterator CtorIt = BaseRDecl->ctor_begin();
+          CtorIt!= BaseRDecl->ctor_end(); CtorIt++) {
+          CXXConstructorDecl* CD = (*CtorIt);
+          if(flag == SMF_DefaultConstructor) {
+            if(!CD->isDefaultConstructor())
+              continue;
+            DiagID = diag::err_amp_has_no_default_ctor;
+            if(CXXDestructorDecl * CDD = BaseRDecl->getDestructor()) {
+              if(CDD->isUserProvided()) {
+                BaseMergedCPU |= CDD->hasAttr<CXXAMPRestrictCPUAttr>();
+                BaseMergedAMP |= CDD->hasAttr<CXXAMPRestrictAMPAttr>();
+                DoIt = true;
+              }
+            }
+          } else if(flag == SMF_CopyConstructor) {
+            if(CD->isCopyConstructor()) {
+              const ReferenceType *ParamTy =
+                CD->getParamDecl(0)->getType()->getAs<ReferenceType>();
+              if (!ParamTy || ConstParam != ParamTy->getPointeeType().isConstQualified())
+                continue;
+              DiagID = diag::err_amp_has_no_copy_constructor;
+              if(CXXDestructorDecl * CDD = BaseRDecl->getDestructor()) {
+                if(CDD->isUserProvided()) {
+                  BaseMergedCPU |= CDD->hasAttr<CXXAMPRestrictCPUAttr>();
+                  BaseMergedAMP |= CDD->hasAttr<CXXAMPRestrictAMPAttr>();
+                  DoIt = true;
+                }
+              }
+            } else
+            continue;
+          } else if(flag == SMF_MoveConstructor) {
+            if(!CD->isMoveConstructor() || !CD->isDefaulted())
+              continue;
+          } else if (flag != SMF_All) {
+            continue;
+          }
+          if(!CD->isUserProvided())
+            InheritSMFCtorIntersections(BaseRDecl, CPUAttr, AMPAttr,
+                           ParentCPUAttr, ParentAMPAttr, flag, ConstParam);
+          else {
+            // At this point, multiple restrictions might not be merged yet, e.g.
+            //   struct A8_base {
+            //        A8_base() restrict(cpu,amp) {}
+            //        A8_base(A8_base&) restrict(cpu) {}    // #1 CopyCtor
+            //        A8_base(A8_base&) restrict(amp) {}   // #2 CopyCtor
+            //    };
+            BaseMergedCPU |= CD->hasAttr<CXXAMPRestrictCPUAttr>();
+            BaseMergedAMP |= CD->hasAttr<CXXAMPRestrictAMPAttr>();
+            DoIt = true;
+          }
+        }
+        if(DoIt) {
+          CheckCXXAMPHasPossibleSMF(*this, RDecl, CPUAttr, AMPAttr,
+            BaseMergedCPU, BaseMergedAMP, DiagID);
+          INTERSECT_ATTR(CPUAttr, AMPAttr, BaseMergedCPU, BaseMergedAMP);
+        }
+
+      }
+    }
+  }
+
+  // Check the fields if any
+  for (CXXRecordDecl::field_iterator It = RDecl->field_begin(),
+      ItE = RDecl->field_end(); It != ItE; ++It) {
+    const FieldDecl *FD = *It;
+    const RecordType *RT = Context.getBaseElementType(FD->getType())->getAs<RecordType>();
+    if (!RT)
+      continue;
+    CXXRecordDecl *MemberDecl = cast<CXXRecordDecl>(RT->getDecl());
+    InheritSMFCtorIntersections(MemberDecl, CPUAttr, AMPAttr,
+                         ParentCPUAttr, ParentAMPAttr, flag, ConstParam);
+  }
+
+  // Empty class with user-defined dtor
+  //   class A1 {
+  //     public:
+  //     ~A1() restrict(amp) {}
+  //      // defaulted: A1() restrict(amp)
+  //   };
+  // Check own ctors
+  {
+    bool BaseMergedCPU = false;
+    bool BaseMergedAMP = false;
+    bool DoIt = false;
+    unsigned DiagID = 0;  // avoid reading an uninitialized DiagID below
+    for(CXXRecordDecl::ctor_iterator CtorIt = RDecl->ctor_begin();
+      CtorIt!= RDecl->ctor_end(); CtorIt++) {
+      CXXConstructorDecl* CD = (*CtorIt);
+      if(flag == SMF_DefaultConstructor) {
+        if(!CD->isDefaultConstructor())
+          continue;
+        DiagID = diag::err_amp_has_no_default_ctor;
+        if(CXXDestructorDecl * CDD = RDecl->getDestructor()) {
+          if(CDD->isUserProvided()) {
+            BaseMergedCPU |= CDD->hasAttr<CXXAMPRestrictCPUAttr>();
+            BaseMergedAMP |= CDD->hasAttr<CXXAMPRestrictAMPAttr>();
+            DoIt = true;
+          }
+        }
+      } else if(flag == SMF_CopyConstructor) {
+        if(CD->isCopyConstructor()) {
+          const ReferenceType *ParamTy =
+            CD->getParamDecl(0)->getType()->getAs<ReferenceType>();
+          if (!ParamTy || ConstParam != ParamTy->getPointeeType().isConstQualified())
+            continue;
+          DiagID = diag::err_amp_has_no_copy_constructor;
+         if(CXXDestructorDecl * CDD = RDecl->getDestructor()) {
+            if(CDD->isUserProvided()) {
+              BaseMergedCPU |= CDD->hasAttr<CXXAMPRestrictCPUAttr>();
+              BaseMergedAMP |= CDD->hasAttr<CXXAMPRestrictAMPAttr>();
+              DoIt = true;
+            }
+          }
+
+        } else
+        continue;
+      } else if(flag == SMF_MoveConstructor) {
+        if(!CD->isMoveConstructor() ||!CD->isDefaulted())
+          continue;
+      } else if (flag != SMF_All) {
+        continue;
+      }
+      if(CD->isUserProvided()) {
+        // At this point, multiple restrictions might not be merged yet, e.g.
+        //   struct A8_base {
+        //        A8_base() restrict(cpu,amp) {}
+        //        A8_base(A8_base&) restrict(cpu) {}    // #1 CopyCtor
+        //        A8_base(A8_base&) restrict(amp) {}   // #2 CopyCtor
+        //    };
+        BaseMergedCPU |= CD->hasAttr<CXXAMPRestrictCPUAttr>();
+        BaseMergedAMP |= CD->hasAttr<CXXAMPRestrictAMPAttr>();
+        DoIt = true;
+      }
+    }
+    if(DoIt) {
+      CheckCXXAMPHasPossibleSMF(*this, RDecl, CPUAttr, AMPAttr,
+          BaseMergedCPU, BaseMergedAMP, DiagID);
+      INTERSECT_ATTR(CPUAttr, AMPAttr, BaseMergedCPU, BaseMergedAMP);
+    }
+  }
+
+  // Check the member methods if any
+  {
+    bool BaseMergedCPU = false;
+    bool BaseMergedAMP = false;
+    bool DoIt = false;
+    for ( CXXRecordDecl::method_iterator MethodIt = RDecl->method_begin(),
+            MethodItE = RDecl->method_end(); MethodIt != MethodItE; ++MethodIt) {
+      // CopyAssign, MoveAssign
+      if(flag == SMF_CopyAssignment) {
+        if(!MethodIt ->isCopyAssignmentOperator())
+            continue;
+      } else  if(flag == SMF_MoveAssignment) {
+        if(!MethodIt ->isMoveAssignmentOperator())
+            continue;
+      } else if (flag != SMF_All) {
+        continue;
+      }
+
+      if(MethodIt->isUserProvided()) {
+        BaseMergedCPU |= MethodIt->hasAttr<CXXAMPRestrictCPUAttr>();
+        BaseMergedAMP |= MethodIt->hasAttr<CXXAMPRestrictAMPAttr>();
+        DoIt = true;
+      }
+    }
+    if(DoIt) {
+      // FIXME: Commented out, since it is unclear which DiagID it should use
+      #if 0
+      CheckCXXAMPHasPossibleSMF(*this, RDecl, CPUAttr, AMPAttr, BaseMergedCPU, BaseMergedAMP);
+      #endif
+      INTERSECT_ATTR(CPUAttr, AMPAttr, BaseMergedCPU, BaseMergedAMP);
+    }
+  }
+
+}
+
+static void CheckCXXAMPHasPossibleSMFMethod(Sema &S,
+                           CXXRecordDecl* RDecl, CXXMethodDecl* Method,
+                           bool OrgCPU, bool OrgAMP,
+                           bool NewCPU, bool NewAMP, unsigned DiagID) {
+  if((OrgCPU && OrgAMP) || (NewCPU && NewAMP))
+    return;
+
+  if(OrgCPU || OrgAMP) {
+    if((OrgCPU !=NewCPU) &&(OrgAMP !=NewAMP))
+      S.Diag(Method->getInnerLocStart(), DiagID)
+        << Method->getNameAsString()
+        << RDecl->getName();
+  }
+}
+
+// Defaulted CopyAssign and MoveAssign
+void Sema::InheritSMFMethodIntersections(CXXRecordDecl* RDecl,
+                           bool& CPUAttr, bool& AMPAttr,
+                           bool& ParentCPUAttr, bool& ParentAMPAttr, int flag, bool ConstParam) {
+  // Step3
+  // The compiler sets the restrictions of compiler-generated copy constructors to
+  // the intersection of the restrictions on all of the copy constructors of the member fields
+  // [able to construct all member fields], all of the base classes' copy constructors
+  // [able to call all base classes' copy constructors], and the destructor of the class
+  // [able to destroy in any context constructed]. However, any restriction that would result
+  // in an error is not set.
+  //
+  // Step4
+  // The compiler sets the restrictions of compiler-generated assignment operators to
+  // the intersection of the restrictions on all of the assignment operators of the member
+  // fields [able to assign all member fields] and all of the base classes' assignment
+  // operators [able to call all base classes' assignment operators]. However, any restriction
+  // that would result in an error is not set.
+
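+  // Hypothetical illustration of Step4 (types invented for exposition):
+  //   struct B { B& operator=(const B&) restrict(cpu) { return *this; } };
+  //   struct D : B {};  // the generated D::operator= intersects down to restrict(cpu)
+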
+  // Check the base classes if any
+  if(RDecl->getDefinition()) {
+    RDecl = RDecl->getDefinition();
+    for(CXXRecordDecl::base_class_const_iterator BaseIt = RDecl->bases_begin();
+           BaseIt != RDecl->bases_end(); BaseIt++) {
+      CXXRecordDecl *BaseRDecl =
+          cast<CXXRecordDecl>(BaseIt->getType()->getAs<RecordType>()->getDecl());
+      if(!BaseRDecl) continue;
+      if(BaseRDecl->ctor_end() == BaseRDecl->ctor_begin())
+        InheritSMFMethodIntersections(BaseRDecl, CPUAttr, AMPAttr,
+                         ParentCPUAttr, ParentAMPAttr, flag);
+      else {
+        bool BaseMergedCPU = false;
+        bool BaseMergedAMP = false;
+        bool DoIt = false;
+        unsigned DiagID = 0;  // avoid reading an uninitialized DiagID below
+        CXXMethodDecl* Method = NULL;
+        for ( CXXRecordDecl::method_iterator MethodIt = RDecl->method_begin(),
+              MethodItE = RDecl->method_end(); MethodIt != MethodItE; ++MethodIt) {
+          if(flag == SMF_CopyAssignment) {
+            if(!MethodIt->isCopyAssignmentOperator())
+              continue;
+            const ReferenceType *ParamTy =
+              MethodIt->getParamDecl(0)->getType()->getAs<ReferenceType>();
+            if (!ParamTy || ConstParam != ParamTy->getPointeeType().isConstQualified())
+              continue;
+            DiagID = diag::err_amp_has_no_copy_assign_or_move_assign;
+            Method = (*MethodIt);
+          } else  if(flag == SMF_MoveAssignment) {
+            if(!MethodIt->isMoveAssignmentOperator())
+              continue;
+            // No test cases
+          } else if (flag != SMF_All) {
+            continue;
+          }
+          if (!MethodIt->isUserProvided()) {
+            InheritSMFMethodIntersections(BaseRDecl, CPUAttr, AMPAttr,
+                            ParentCPUAttr, ParentAMPAttr, flag, ConstParam);
+          } else {
+            BaseMergedCPU |= MethodIt->hasAttr<CXXAMPRestrictCPUAttr>();
+            BaseMergedAMP |= MethodIt->hasAttr<CXXAMPRestrictAMPAttr>();
+            DoIt = true;
+          }
+        }
+        if(DoIt) {
+          CheckCXXAMPHasPossibleSMFMethod(*this, RDecl, Method, CPUAttr, AMPAttr,
+            BaseMergedCPU, BaseMergedAMP, DiagID);
+          INTERSECT_ATTR(CPUAttr, AMPAttr, BaseMergedCPU, BaseMergedAMP);
+        }
+      }
+    }
+  }
+
+    // Check the fields if any
+    for (CXXRecordDecl::field_iterator It = RDecl->field_begin(),
+        ItE = RDecl->field_end(); It != ItE; ++It) {
+      const FieldDecl *FD = *It;
+      const RecordType *RT = Context.getBaseElementType(FD->getType())->getAs<RecordType>();
+      if (!RT)
+        continue;
+      CXXRecordDecl *MemberDecl = cast<CXXRecordDecl>(RT->getDecl());
+      InheritSMFMethodIntersections(MemberDecl, CPUAttr, AMPAttr,
+                           ParentCPUAttr, ParentAMPAttr, flag, ConstParam);
+    }
+
+    // Check the member methods if any
+    {
+      bool BaseMergedCPU = false;
+      bool BaseMergedAMP = false;
+      bool DoIt = false;
+      unsigned DiagID = 0;  // avoid reading an uninitialized DiagID below
+      CXXMethodDecl* Method = NULL;
+      for ( CXXRecordDecl::method_iterator MethodIt = RDecl->method_begin(),
+              MethodItE = RDecl->method_end(); MethodIt != MethodItE; ++MethodIt) {
+        // CopyAssign, MoveAssign
+        if(flag == SMF_CopyAssignment) {
+          if(!MethodIt->isCopyAssignmentOperator())
+              continue;
+          const ReferenceType *ParamTy =
+            MethodIt->getParamDecl(0)->getType()->getAs<ReferenceType>();
+          if (!ParamTy || ConstParam != ParamTy->getPointeeType().isConstQualified())
+            continue;
+          DiagID = diag::err_amp_has_no_copy_assign_or_move_assign;
+          Method = (*MethodIt);
+        } else  if(flag == SMF_MoveAssignment) {
+          if(!MethodIt->isMoveAssignmentOperator())
+              continue;
+        } else if (flag != SMF_All) {
+          continue;
+        }
+
+        if(MethodIt->isUserProvided()) {
+          BaseMergedCPU |= MethodIt->hasAttr<CXXAMPRestrictCPUAttr>();
+          BaseMergedAMP |= MethodIt->hasAttr<CXXAMPRestrictAMPAttr>();
+          DoIt = true;
+        }
+      }
+      if(DoIt) {
+        CheckCXXAMPHasPossibleSMFMethod(*this, RDecl, Method, CPUAttr, AMPAttr,
+          BaseMergedCPU, BaseMergedAMP, DiagID);
+        INTERSECT_ATTR(CPUAttr, AMPAttr, BaseMergedCPU, BaseMergedAMP);
+      }
+    }
+
+}
+
+static void CheckCXXAMPSMFDestructor(Sema &S, CXXRecordDecl* RDecl,
+                           bool& ParentCPUAttr, bool& ParentAMPAttr) {
+  if(!RDecl || S.getLangOpts().HSAExtension)
+    return;
+
+  ASTContext &Context = S.Context;
+  SourceLocation Loc = RDecl->getLocation();
+  CXXDestructorDecl *DD = RDecl->getDestructor();
+  if(DD && !DD->isUserProvided() && !DD->hasAttr<CXXAMPRestrictCPUAttr>() &&
+       !DD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+    bool CPUAttr = false;
+    bool AMPAttr = false;
+    S.InheritSMFDtorIntersections(RDecl, CPUAttr, AMPAttr, ParentCPUAttr, ParentAMPAttr);
+    if(!DD->hasAttr<CXXAMPRestrictCPUAttr>() &&
+       !DD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      if(CPUAttr && !DD->hasAttr<CXXAMPRestrictCPUAttr>())
+        DD->addAttr(::new (Context) CXXAMPRestrictCPUAttr(Loc, Context, 0));
+
+       if(AMPAttr && !DD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+         DD->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Loc, Context, 0));
+      }
+    } else {
+        // FIXME: this path should be unreachable; fail loudly rather than miscompile.
+        llvm::errs() << "THIS SHOULD NOT HAPPEN!\n";
+        exit(1);
+    }
+
+  }
+  if(DD && (DD->hasAttr<CXXAMPRestrictCPUAttr>() ||
+       DD->hasAttr<CXXAMPRestrictAMPAttr>()))
+    S.DiagnoseCXXAMPOverloadedCallExpr(DD->getInnerLocStart(), DD);
+}
+
+static void CheckCXXAMPSMFConstructor(Sema &S, CXXRecordDecl* RDecl,
+                           bool& ParentCPUAttr, bool& ParentAMPAttr) {
+  if (!RDecl || S.getLangOpts().HSAExtension)
+    return;
+
+  ASTContext &Context = S.Context;
+  SourceLocation Loc = RDecl->getLocation();
+
+  if(RDecl->hasDefaultConstructor() || RDecl->needsImplicitDefaultConstructor()) {
+    bool CPUAttr = false;
+    bool AMPAttr = false;
+    CXXConstructorDecl* DefaultCtor = NULL;
+    if(!RDecl->needsImplicitDefaultConstructor()) {
+      for(CXXRecordDecl::ctor_iterator CtorIt = RDecl->ctor_begin();
+        CtorIt!=RDecl->ctor_end(); CtorIt++)
+        if((*CtorIt)->isDefaultConstructor() && !(*CtorIt)->isUserProvided() &&
+          (*CtorIt)->isDefaulted()) {
+          DefaultCtor = (*CtorIt);
+        }
+    } else {
+      DefaultCtor = S.DeclareImplicitDefaultConstructor(RDecl);
+    }
+    if(DefaultCtor && !DefaultCtor->hasAttr<CXXAMPRestrictCPUAttr>() &&
+      !DefaultCtor->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        S.InheritSMFCtorIntersections(RDecl, CPUAttr, AMPAttr,
+          ParentCPUAttr, ParentAMPAttr, SMF_DefaultConstructor);
+      if(DefaultCtor && CPUAttr && !DefaultCtor->hasAttr<CXXAMPRestrictCPUAttr>())
+        DefaultCtor->addAttr(::new (Context) CXXAMPRestrictCPUAttr(Loc, Context, 0));
+
+      if(DefaultCtor && AMPAttr && !DefaultCtor->hasAttr<CXXAMPRestrictAMPAttr>())
+        DefaultCtor->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Loc, Context, 0));
+    }
+    if(DefaultCtor && (DefaultCtor->hasAttr<CXXAMPRestrictCPUAttr>() ||
+      DefaultCtor->hasAttr<CXXAMPRestrictAMPAttr>()))
+      S.DiagnoseCXXAMPOverloadedCallExpr(DefaultCtor->getInnerLocStart(), DefaultCtor);
+
+  }
+
+  if(!RDecl->hasUserDeclaredCopyConstructor()) {
+    bool CPUAttr = false;
+    bool AMPAttr = false;
+    CXXConstructorDecl* CopyCtor = NULL;
+    bool ConstParam = RDecl->implicitCopyConstructorHasConstParam();
+    if(!RDecl->needsImplicitCopyConstructor()) {
+      for(CXXRecordDecl::ctor_iterator CtorIt = RDecl->ctor_begin();
+        CtorIt!=RDecl->ctor_end(); CtorIt++)
+        if((*CtorIt)->isCopyConstructor() && !(*CtorIt)->isUserProvided() &&
+          (*CtorIt)->isDefaulted()){
+          CopyCtor = (*CtorIt);
+          ConstParam = RDecl->hasCopyConstructorWithConstParam();
+        }
+    } else {
+      CopyCtor = S.DeclareImplicitCopyConstructor(RDecl);
+    }
+
+    if(CopyCtor && !CopyCtor->hasAttr<CXXAMPRestrictCPUAttr>() &&
+      !CopyCtor->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      S.InheritSMFCtorIntersections(RDecl, CPUAttr, AMPAttr,
+        ParentCPUAttr, ParentAMPAttr, SMF_CopyConstructor, ConstParam);
+      if(CopyCtor && CPUAttr && !CopyCtor->hasAttr<CXXAMPRestrictCPUAttr>())
+        CopyCtor->addAttr(::new (Context) CXXAMPRestrictCPUAttr(Loc, Context, 0));
+
+      if(CopyCtor && AMPAttr && !CopyCtor->hasAttr<CXXAMPRestrictAMPAttr>())
+        CopyCtor->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Loc, Context, 0));
+
+      if(CopyCtor && (CopyCtor->hasAttr<CXXAMPRestrictCPUAttr>() ||
+        CopyCtor->hasAttr<CXXAMPRestrictAMPAttr>()))
+      S.DiagnoseCXXAMPOverloadedCallExpr(CopyCtor->getInnerLocStart(), CopyCtor);
+    }
+  }
+
+  if(!RDecl->hasUserDeclaredMoveConstructor()) {
+    bool CPUAttr = false;
+    bool AMPAttr = false;
+    CXXConstructorDecl* MoveCtor = NULL;
+    if(!RDecl->needsImplicitMoveConstructor()) {
+      for(CXXRecordDecl::ctor_iterator CtorIt = RDecl->ctor_begin();
+        CtorIt!=RDecl->ctor_end(); CtorIt++)
+        if((*CtorIt)->isMoveConstructor() && !(*CtorIt)->isUserProvided() &&
+          (*CtorIt)->isDefaulted()){
+          MoveCtor = (*CtorIt);
+        }
+    } else {
+      MoveCtor = S.DeclareImplicitMoveConstructor(RDecl);
+    }
+
+    if(MoveCtor && !MoveCtor->hasAttr<CXXAMPRestrictCPUAttr>() &&
+      !MoveCtor->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      S.InheritSMFCtorIntersections(RDecl, CPUAttr, AMPAttr,
+        ParentCPUAttr, ParentAMPAttr, SMF_MoveConstructor);
+      if(MoveCtor && CPUAttr && !MoveCtor->hasAttr<CXXAMPRestrictCPUAttr>())
+        MoveCtor->addAttr(::new (Context) CXXAMPRestrictCPUAttr(Loc, Context, 0));
+
+      if(MoveCtor && AMPAttr && !MoveCtor->hasAttr<CXXAMPRestrictAMPAttr>())
+        MoveCtor->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Loc, Context, 0));
+
+      if(MoveCtor && (MoveCtor->hasAttr<CXXAMPRestrictCPUAttr>() ||
+        MoveCtor->hasAttr<CXXAMPRestrictAMPAttr>()))
+      S.DiagnoseCXXAMPOverloadedCallExpr(MoveCtor->getInnerLocStart(), MoveCtor);
+    }
+  }
+
+}
+
+static void CheckCXXAMPSMFMethod(Sema& S, CXXRecordDecl* RDecl,
+                                 bool& ParentCPUAttr, bool& ParentAMPAttr) {
+  if (!RDecl || S.getLangOpts().HSAExtension)
+    return;
+
+  ASTContext &Context = S.Context;
+  SourceLocation Loc = RDecl->getLocation();
+  if(!RDecl->hasUserDeclaredCopyAssignment()) {
+    bool CPUAttr = false;
+    bool AMPAttr = false;
+    CXXMethodDecl* CopyAssign = NULL;
+    bool ConstParam = RDecl->implicitCopyAssignmentHasConstParam();
+    if(!RDecl->needsImplicitCopyAssignment()) {
+      for(CXXRecordDecl::method_iterator MethodIt = RDecl->method_begin();
+        MethodIt!=RDecl->method_end(); MethodIt++)
+        if((*MethodIt)->isCopyAssignmentOperator() && !(*MethodIt)->isUserProvided() &&
+          (*MethodIt)->isDefaulted()){
+          CopyAssign = (*MethodIt);
+          ConstParam = RDecl->hasCopyAssignmentWithConstParam();
+        }
+    } else {
+      CopyAssign = S.DeclareImplicitCopyAssignment(RDecl);
+    }
+
+    if(CopyAssign && !CopyAssign->hasAttr<CXXAMPRestrictCPUAttr>() &&
+      !CopyAssign->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      S.InheritSMFMethodIntersections(RDecl, CPUAttr, AMPAttr,
+        ParentCPUAttr, ParentAMPAttr, SMF_CopyAssignment, ConstParam);
+      if(CopyAssign && CPUAttr && !CopyAssign->hasAttr<CXXAMPRestrictCPUAttr>())
+        CopyAssign->addAttr(::new (Context) CXXAMPRestrictCPUAttr(Loc, Context, 0));
+
+      if(CopyAssign && AMPAttr && !CopyAssign->hasAttr<CXXAMPRestrictAMPAttr>())
+        CopyAssign->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Loc, Context, 0));
+
+      if(CopyAssign && (CopyAssign->hasAttr<CXXAMPRestrictCPUAttr>() ||
+        CopyAssign->hasAttr<CXXAMPRestrictAMPAttr>()))
+        S.DiagnoseCXXAMPOverloadedCallExpr(CopyAssign->getInnerLocStart(), CopyAssign);
+    }
+  }
+  // No test cases for move assign
+
+}
+
+static void CheckCXXAMPSMF(Sema& S, CXXRecordDecl *RDecl, bool& Checking) {
+  if (!RDecl || S.getLangOpts().HSAExtension)
+    return;
+
+  assert(RDecl);
+
+  bool ParentCPUAttr = false;
+  bool ParentAMPAttr = false;
+  if(S.getCurFunctionDecl() && (S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>() ||
+          S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictCPUAttr>())) {
+    ParentCPUAttr = S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictCPUAttr>();
+    ParentAMPAttr = S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>();
+  } else if(S.getCurLambda() && S.getCurLambda()->CallOperator &&
+         (S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>() ||
+         S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>())) {
+    ParentCPUAttr = S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>();
+    ParentAMPAttr = S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>();
+  } else if(RDecl->isLocalClass()) {
+    // FIXME: Need to initialize ParentCPU and ParentAMP here
+    ParentCPUAttr = true;
+    ParentAMPAttr = true;
+  }
+
+  if(!ParentCPUAttr && !ParentAMPAttr) {
+    Checking = true;
+    return;
+  }
+
+  Checking = false;
+
+  // Virtual bases are not allowed
+  if(!S.getLangOpts().HSAExtension && RDecl->isClass() && RDecl->getNumVBases())
+    S.Diag(RDecl->getBeginLoc(), diag::err_amp_virtual_base_class_unsupported)
+    << RDecl->getDeclName().getAsString();
+
+  CheckCXXAMPSMFDestructor(S, RDecl, ParentCPUAttr, ParentAMPAttr);
+  CheckCXXAMPSMFConstructor(S, RDecl, ParentCPUAttr, ParentAMPAttr);
+  CheckCXXAMPSMFMethod(S, RDecl, ParentCPUAttr, ParentAMPAttr);
+}
+
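+// Illustrative example (assumed source, not from this patch's test suite) of
+// the intersection logic above:
+//
+//   struct Base { Base() restrict(cpu,amp) {} };
+//   struct Derived : Base { int x; };  // implicit Derived() is defaulted
+//
+// The implicit Derived::Derived() receives the intersection of the
+// restrictions of its bases and members, here restrict(cpu,amp), so it stays
+// callable from both cpu- and amp-restricted contexts.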
 /// Attempt initialization by constructor (C++ [dcl.init]), which
 /// enumerates the constructors of the initialized entity and performs overload
 /// resolution to select the best.
@@ -3964,6 +4668,12 @@ static void TryConstructorInitialization(Sema &S,
     return;
   }
 
+  // C++AMP Open Spec [2.3.2 Function Overloading]
+  bool NotCXXAMPSpec = true;
+  if(DestRecordDecl && S.getLangOpts().CPlusPlusAMP) {
+    CheckCXXAMPSMF(S, DestRecordDecl, NotCXXAMPSpec);
+  }
+
   bool HadMultipleCandidates = (CandidateSet.size() > 1);
 
   // In C++17, ResolveConstructorOverload can select a conversion function
@@ -3994,7 +4704,7 @@ static void TryConstructorInitialization(Sema &S,
   if (Kind.getKind() == InitializationKind::IK_Default &&
       Entity.getType().isConstQualified()) {
     if (!CtorDecl->getParent()->allowConstDefaultInit()) {
-      if (!maybeRecoverWithZeroInitialization(S, Sequence, Entity))
+      if (!maybeRecoverWithZeroInitialization(S, Sequence, Entity) && NotCXXAMPSpec)
         Sequence.SetFailed(InitializationSequence::FK_DefaultInitOfConst);
       return;
     }
@@ -4577,6 +5287,39 @@ static void TryReferenceInitializationCore(Sema &S,
     = S.CompareReferenceRelationship(DeclLoc, cv1T1, cv2T2, DerivedToBase,
                                      ObjCConversion, ObjCLifetimeConversion);
 
+  // C++AMP
+  if(S.getLangOpts().CPlusPlusAMP && isLValueRef && InitCategory.isLValue() &&
+    T2->isFunctionType()) {
+    // Case by case
+    //      int glorp(int x) __GPU_ONLY {
+    //          return 668 + x;
+    //      }
+    //
+    //      int main() {
+    //         typedef int FT(int);
+    //         FT& p = glorp;   // Error: initializing a function reference with a
+    //                          // function that has an incompatible restriction specifier
+    //         printf("%d\n", p(-2));
+    //         return 1;
+    //      }
+    const DeclRefExpr* Decl = dyn_cast<DeclRefExpr>(Initializer);
+    // FIXME: better to check the source & target signatures and emit a function
+    // conversion error once amp restrictions are folded into the signature
+    if(Decl && Decl->getDecl()) {
+      bool EntityCPU = false;
+      bool EntityAMP = false;
+      if(Entity.getDecl()) {
+        EntityCPU = Entity.getDecl()->hasAttr<CXXAMPRestrictCPUAttr>();
+        EntityAMP = Entity.getDecl()->hasAttr<CXXAMPRestrictAMPAttr>();
+      }
+      if(!(EntityCPU == Decl->getDecl()->hasAttr<CXXAMPRestrictCPUAttr>() &&
+        EntityAMP == Decl->getDecl()->hasAttr<CXXAMPRestrictAMPAttr>())) {
+        if(!(EntityAMP && EntityCPU) && (EntityAMP || EntityCPU))
+          S.Diag(DeclLoc, diag::err_amp_function_conversion);
+      }
+    }
+  }
+
   // C++0x [dcl.init.ref]p5:
   //   A reference to type "cv1 T1" is initialized by an expression of type
   //   "cv2 T2" as follows:
@@ -5541,6 +6284,13 @@ void InitializationSequence::InitializeFrom(Sema &S,
 
   // Handle default initialization.
   if (Kind.getKind() == InitializationKind::IK_Default) {
+    // C++AMP specific:
+    // Prevent tile_static variables from being initialized
+    if (S.getLangOpts().CPlusPlusAMP &&
+        Entity.getDecl() &&
+        Entity.getDecl()->hasAttr<HCCTileStaticAttr>())
+        return;
+
     TryDefaultInitialization(S, Entity, Kind, *this);
     return;
   }
@@ -8705,11 +9455,23 @@ bool InitializationSequence::Diagnose(Sema &S,
         // If this is a defaulted or implicitly-declared function, then
         // it was implicitly deleted. Make it clear that the deletion was
         // implicit.
-        if (S.isImplicitlyDeleted(Best->Function))
-          S.Diag(Kind.getLocation(), diag::err_ovl_deleted_special_init)
+        if (S.isImplicitlyDeleted(Best->Function)) {
+          // C++AMP
+          bool check = true;
+          if(S.getLangOpts().CPlusPlusAMP) {
+            FunctionDecl* F = S.getCurFunctionDecl();
+            // FIXME: Best->Function loses its C++AMP restrictions after becoming a candidate
+            if(F && (F->hasAttr<CXXAMPRestrictAMPAttr>() ||
+              F->hasAttr<CXXAMPRestrictCPUAttr>())) {
+              check = false;
+            }
+          }
+          if(check) {
+            S.Diag(Kind.getLocation(), diag::err_ovl_deleted_special_init)
             << S.getSpecialMember(cast<CXXMethodDecl>(Best->Function))
             << DestType << ArgsRange;
-        else
+          }
+        } else
           S.Diag(Kind.getLocation(), diag::err_ovl_deleted_init)
               << DestType << ArgsRange;
 
diff --git a/lib/Sema/SemaLambda.cpp b/lib/Sema/SemaLambda.cpp
index 986524e6d5..66051224f0 100644
--- a/lib/Sema/SemaLambda.cpp
+++ b/lib/Sema/SemaLambda.cpp
@@ -1736,6 +1736,85 @@ ExprResult Sema::BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc,
       CaptureInits.push_back(Init.get());
     }
 
+    // C++AMP
+    std::vector<std::pair<Capture, unsigned> > FoundVec;
+    if (getLangOpts().CPlusPlusAMP && CallOperator->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      for (unsigned K = 0, N = LSI->Captures.size(); K != N; ++K) {
+        Capture From = LSI->Captures[K];
+        assert(!From.isBlockCapture() && "Cannot capture __block variables");
+        if (From.isThisCapture()) continue;
+        // Handle [Var]
+        if(From.getCaptureType()->isPointerType()) {
+          if (getLangOpts().HSAExtension) {
+            // relax this rule in HSA to allow capturing raw pointers
+          } else {
+            FoundVec.push_back(std::make_pair(From, (unsigned)diag::err_amp_captured_variable_type));
+          }
+        }
+
+        if(From.getCaptureType()->isClassType() && From.isCopyCapture()) {
+          // hc::array and Concurrency::array can't be captured by copy
+          if (From.getCaptureType()->isGPUArrayType()) {
+            FoundVec.push_back(std::make_pair(From, (unsigned)diag::err_amp_captured_variable_type));
+          }
+        }
+        // Handle [This], [&]
+        if(From.isReferenceCapture() || From.isThisCapture()) {
+          if(const ReferenceType* RT = dyn_cast<ReferenceType>(From.getCaptureType())) {
+            const PrintingPolicy PrintPolicy = Context.getPrintingPolicy();
+            std::string Info = QualType::getAsString(From.getCaptureType()->getPointeeType().split(), PrintPolicy);
+            if (!getLangOpts().HSAExtension) {
+              if(RT->getPointeeType()->isPointerType()) {
+                #if 0
+                // Add the skipped type here
+                if(!Info.empty() && (Info.find("array<")!=std::string::npos ||
+                  Info.find("array_view<")!=std::string::npos))
+                #endif
+                FoundVec.push_back(std::make_pair(From,
+                                                 (unsigned)diag::err_amp_captured_by_reference_for_variables));
+              } else {
+                // Boolean type is allowed in captures
+                // FIXME: Ugly code. Need a reliable way to skip amp-compatible types
+                if(Info.find("array<") != std::string::npos || Info.find("array_view<") != std::string::npos ||
+                  Info.find("texture<") != std::string::npos ||
+                  RT->getPointeeType()->isBooleanType()) {
+                  // amp-compatible types
+                } else
+                  FoundVec.push_back(std::make_pair(From,
+                                             (unsigned)diag::err_amp_captured_by_reference_for_variables));
+              }
+            } // HSA extension check
+          }
+        }
+      }
+    }
+    // Diagnose capturing a restrict(amp) function pointer by value in a restrict(cpu) lambda
+    if (getLangOpts().CPlusPlusAMP &&
+          ((!CallOperator->hasAttr<CXXAMPRestrictAMPAttr>() &&
+            CallOperator->hasAttr<CXXAMPRestrictCPUAttr>()) ||
+           (!CallOperator->hasAttr<CXXAMPRestrictAMPAttr>() &&
+            !CallOperator->hasAttr<CXXAMPRestrictCPUAttr>()))) {
+      for (unsigned K = 0, N = LSI->Captures.size(); K != N; ++K) {
+        Capture From = LSI->Captures[K];
+        assert(!From.isBlockCapture() && "Cannot capture __block variables");
+        if (From.isThisCapture()) continue;
+        QualType CaptureType = From.getCaptureType();
+        if(!CaptureType.isNull() && CaptureType->isFunctionPointerType()) {
+          if( From.getVariable() && From.getVariable()->hasAttr<CXXAMPRestrictAMPAttr>())
+            FoundVec.push_back(std::make_pair(From, (unsigned)diag::err_amp_captured_variable_type));
+        }
+      }
+    }
+    if(FoundVec.size()) {
+      for( std::vector<std::pair<Capture, unsigned> >::iterator iter = FoundVec.begin();
+        iter!=FoundVec.end(); iter++)
+        if(iter->first.getVariable())
+          Diag(iter->first.getLocation(), iter->second) << iter->first.getVariable()->getName();
+        else
+          Diag(iter->first.getLocation(), iter->second);
+      return ExprError();
+    }
+
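+    // Illustrative examples (assumed source) of captures rejected above:
+    //
+    //   int *p = ...;
+    //   hc::array<int, 1> a(n);
+    //   auto k = [p, a]() restrict(amp) { ... };
+    //
+    // Both the raw pointer 'p' and the by-copy capture of the hc::array 'a'
+    // are diagnosed, unless LangOpts.HSAExtension relaxes the raw-pointer rule.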
     // C++11 [expr.prim.lambda]p6:
     //   The closure type for a lambda-expression with no lambda-capture
     //   has a public non-virtual non-explicit const conversion function
@@ -1763,6 +1842,11 @@ ExprResult Sema::BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc,
 
   Cleanup.mergeFrom(LambdaCleanup);
 
+  // C++AMP
+  if (getLangOpts().CPlusPlusAMP && NeedAMPDeserializer(Class)) {
+    DeclareAMPDeserializer(Class, NULL);
+  }
+
   LambdaExpr *Lambda = LambdaExpr::Create(Context, Class, IntroducerRange,
                                           CaptureDefault, CaptureDefaultLoc,
                                           Captures,
diff --git a/lib/Sema/SemaLookup.cpp b/lib/Sema/SemaLookup.cpp
index 8a24dd884a..dfb58dfd92 100644
--- a/lib/Sema/SemaLookup.cpp
+++ b/lib/Sema/SemaLookup.cpp
@@ -906,6 +906,24 @@ static void DeclareImplicitMemberFunctionsWithName(Sema &S,
     }
     break;
 
+  case DeclarationName::Identifier:
+    if (S.getLangOpts().CPlusPlusAMP) {
+      if (const CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(DC)) {
+        CXXRecordDecl *Class = const_cast<CXXRecordDecl *>(Record);
+        if (!Class->getDefinition() || !CanDeclareSpecialMemberFunction(Record)) {
+          break;
+        }
+        if (Name.getAsString() == "__cxxamp_trampoline") {
+          S.DeclareAMPTrampoline(Class, Name);
+        } else if (Name.getAsString() == "__cxxamp_trampoline_name") {
+          S.DeclareAMPTrampolineName(Class, Name);
+        } else if (Name.getAsString() == "__cxxamp_serialize") {
+          S.DeclareAMPSerializer(Class, Name);
+        }
+      }
+    }
+    break;
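+    // Note: lookups of the reserved names above lazily declare the HCC
+    // trampoline/serializer members for the class, mirroring how implicit
+    // special member functions are declared on demand.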
+
   case DeclarationName::CXXDeductionGuideName:
     S.DeclareImplicitDeductionGuides(Name.getCXXDeductionGuideTemplate(), Loc);
     break;
@@ -3218,10 +3236,97 @@ DeclContext::lookup_result Sema::LookupConstructors(CXXRecordDecl *Class) {
     if (getLangOpts().CPlusPlus11 && Class->needsImplicitMoveConstructor())
       DeclareImplicitMoveConstructor(Class);
   }
+  // C++AMP
+  if (getLangOpts().CPlusPlusAMP && NeedAMPDeserializer(Class)) {
+    DeclareAMPDeserializer(Class, NULL);
+  }
 
   CanQualType T = Context.getCanonicalType(Context.getTypeDeclType(Class));
   DeclarationName Name = Context.DeclarationNames.getCXXConstructorName(T);
-  return Class->lookup(Name);
+  DeclContext::lookup_result result = Class->lookup(Name);
+
+  if (!getLangOpts().CPlusPlusAMP) {
+    return result;
+  } else {
+    // C++AMP-specific logic
+    // We need to trim the result for constructors found
+    bool isAMP = false;
+    bool isCPU = false;
+    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(CurContext)) {
+      isAMP = FD->hasAttr<CXXAMPRestrictAMPAttr>();
+      isCPU = FD->hasAttr<CXXAMPRestrictCPUAttr>();
+      // In case the current context is restrict(amp, cpu), we simply
+      // return the result
+      if (isAMP && isCPU)
+        return result;
+    }
+
+    // walk through the result and see if there is anything to be trimmed
+    bool to_trim_result = false;
+    for (DeclContext::lookup_iterator I = result.begin(), E = result.end();
+         I != E; ++I) {
+      if (FunctionDecl *MD = dyn_cast<FunctionDecl>(*I)) {
+        if (!isAMP) {
+          // for host code (!isAMP)
+          // strip compiler-injected restrict(amp) constructors such as
+          // deserialize functions
+          if (!MD->hasAttr<CXXAMPRestrictCPUAttr>() &&
+              MD->hasAttr<AnnotateAttr>() &&
+              MD->getAttr<AnnotateAttr>()->getAnnotation()
+                .find("auto_deserialize") != StringRef::npos) {
+            to_trim_result = true;
+            break;
+          }
+        } else {
+          // for kernel code (!isCPU)
+          // strip constructors which don't have restrict(amp)
+          if (!isCPU &&
+              !MD->hasAttr<CXXAMPRestrictAMPAttr>() &&
+              !MD->isImplicit()) {
+            to_trim_result = true;
+            break;
+          }
+        }
+      }
+    }
+
+    // directly return the result if there is nothing to trim
+    if (!to_trim_result) {
+      return result;
+    }
+
+    // FIXME: TrimmedLookupResult is allocated on the heap but never deleted
+    SmallVector<NamedDecl*, 8> *TrimmedLookupResult = new SmallVector<NamedDecl*, 8>;
+    for (DeclContext::lookup_iterator I = result.begin(), E = result.end();
+         I != E; ++I) {
+      bool delete_this = false;
+      if (FunctionDecl *MD = dyn_cast<FunctionDecl>(*I)) {
+        if (!isAMP) {
+          // for host code (!isAMP)
+          // strip compiler-injected restrict(amp) constructors such as
+          // deserialize functions
+          if (!MD->hasAttr<CXXAMPRestrictCPUAttr>() &&
+              MD->hasAttr<AnnotateAttr>() &&
+              MD->getAttr<AnnotateAttr>()->getAnnotation()
+                .find("auto_deserialize") != StringRef::npos) {
+            delete_this = true;
+          }
+        } else {
+          // for kernel code (!isCPU)
+          // strip constructors which don't have restrict(amp)
+          if (!isCPU &&
+              !MD->hasAttr<CXXAMPRestrictAMPAttr>() &&
+              !MD->isImplicit()) {
+            delete_this = true;
+          }
+        }
+      }
+      if (!delete_this) {
+        TrimmedLookupResult->push_back(*I);
+      }
+    }
+    return DeclContext::lookup_result(*TrimmedLookupResult);
+  }
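+  // Net effect of the trimming above: host code never sees the
+  // compiler-injected restrict(amp) constructors annotated "auto_deserialize",
+  // and kernel-only code (isAMP && !isCPU) never sees explicit constructors
+  // that lack restrict(amp).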
 }
 
 /// Look up the copying assignment operator for the given class.
diff --git a/lib/Sema/SemaOverload.cpp b/lib/Sema/SemaOverload.cpp
index f632a4d3bd..0ed2845606 100644
--- a/lib/Sema/SemaOverload.cpp
+++ b/lib/Sema/SemaOverload.cpp
@@ -25,6 +25,7 @@
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/SemaInternal.h"
+#include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/Template.h"
 #include "clang/Sema/TemplateDeduction.h"
 #include "llvm/ADT/DenseSet.h"
@@ -928,6 +929,49 @@ static bool checkArgPlaceholdersForOverload(Sema &S,
   return false;
 }
 
+void Sema::DiagnoseCXXAMPDtorOverload(FunctionDecl *New,
+                    const LookupResult &Old) {
+  CXXMethodDecl *NewMethod = dyn_cast<CXXMethodDecl>(New);
+  if(!NewMethod || !isa<CXXDestructorDecl>(NewMethod))
+    return;
+
+  // class A
+  // {
+  //   public:
+  //     ~A() restrict(cpu) {}
+  //     ~A() restrict(amp) {} // Error
+  // };
+  if(!Old.empty()) {
+    std::vector<std::pair<SourceLocation, diag::kind> > FoundVec;
+    for(LookupResult::iterator PreDecl = Old.begin(); PreDecl != Old.end(); PreDecl++) {
+      diag::kind PrevDiag;
+      FunctionDecl *OldFD = 0;
+      if (FunctionTemplateDecl *OldFunctionTemplate = dyn_cast<FunctionTemplateDecl>(*PreDecl))
+        OldFD = OldFunctionTemplate->getTemplatedDecl();
+      else
+        OldFD = dyn_cast<FunctionDecl>((*PreDecl));
+      if (OldFD && OldFD->isThisDeclarationADefinition())
+        PrevDiag = diag::note_previous_definition;
+      else if (OldFD && OldFD->isImplicit())
+        PrevDiag = diag::note_previous_implicit_declaration;
+      else
+        PrevDiag = diag::note_previous_declaration;
+      // FIXME: Since we don't fold restrictions into the function's prototype, if the
+      // prototypes are the same, the only way the declarations can be overloaded is by
+      // having different restrictions
+      if(OldFD && OldFD->getType() == New->getType())
+        // Make sure they differ only in their restrictions
+        if(New->hasAttr<CXXAMPRestrictAMPAttr>() != (*PreDecl)->hasAttr<CXXAMPRestrictAMPAttr>() ||
+          New->hasAttr<CXXAMPRestrictCPUAttr>() != (*PreDecl)->hasAttr<CXXAMPRestrictCPUAttr>())
+          FoundVec.push_back(std::make_pair((*PreDecl)->getLocation(), PrevDiag));
+    }
+    if(FoundVec.size()) {
+      Diag(New->getLocation(), diag::err_destructor_redeclared);
+      for(unsigned i = 0; i < FoundVec.size(); i++)
+        Diag(FoundVec[i].first, FoundVec[i].second);
+    }
+  }
+}
+
 /// Determine whether the given New declaration is an overload of the
 /// declarations in Old. This routine returns Ovl_Match or Ovl_NonFunction if
 /// New and Old cannot be overloaded, e.g., if New has the same signature as
@@ -1040,6 +1084,10 @@ Sema::CheckOverload(Scope *S, FunctionDecl *New, const LookupResult &Old,
     }
   }
 
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP && dyn_cast<CXXMethodDecl>(New))
+    DiagnoseCXXAMPDtorOverload(New, Old);
+
   // C++ [temp.friend]p1:
   //   For a friend function declaration that is not a template declaration:
   //    -- if the name of the friend is a qualified or unqualified template-id,
@@ -1093,6 +1141,26 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old,
   if ((OldTemplate == nullptr) != (NewTemplate == nullptr))
     return true;
 
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP) {
+    // allow this case:
+    // void fun(...) restrict(amp)
+    // void fun(...) restrict(cpu)
+    bool OldisAMP = Old->hasAttr<CXXAMPRestrictAMPAttr>();
+    bool OldisCPU = Old->hasAttr<CXXAMPRestrictCPUAttr>();
+    bool NewisAMP = New->hasAttr<CXXAMPRestrictAMPAttr>();
+    bool NewisCPU = New->hasAttr<CXXAMPRestrictCPUAttr>();
+    // Support restriction-based overloading
+    if (NewisAMP && !NewisCPU && !OldisAMP && OldisCPU)
+      return true;
+    if (!NewisAMP && NewisCPU && OldisAMP && !OldisCPU)
+      return true;
+    if (!NewisAMP && !NewisCPU && (OldisAMP ^ OldisCPU))
+      return true;
+    if ((NewisAMP ^ NewisCPU) && !OldisAMP && !OldisCPU)
+      return true;
+  }
+
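+  // Illustrative overload pair (assumed source) accepted by the rules above:
+  //
+  //   void fun(int) restrict(amp);
+  //   void fun(int) restrict(cpu);  // overload, not a redeclaration
+  //
+  // Destructors differing only in restrictions are instead rejected by
+  // DiagnoseCXXAMPDtorOverload.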
   // Is the function New an overload of the function Old?
   QualType OldQType = Context.getCanonicalType(Old->getType());
   QualType NewQType = Context.getCanonicalType(New->getType());
@@ -9028,6 +9096,163 @@ Sema::AddArgumentDependentLookupCandidates(DeclarationName Name,
   }
 }
 
+// FIXME: should consider decltype trailing return type
+bool Sema::IsInAMPRestricted() {
+  return ((getCurFunctionDecl() && getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>()) ||
+      (getCurLambda() && getCurLambda()->CallOperator &&
+      getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>()));
+}
+
+// Determine if we are in CPU- and/or AMP-restricted code
+// FIXME: should consider decltype trailing return type
+bool Sema::IsInAnyExplicitRestricted() {
+  return ((getCurFunctionDecl() && (getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>() ||
+    getCurFunctionDecl()->hasAttr<CXXAMPRestrictCPUAttr>())) ||
+    (getCurLambda() && getCurLambda()->CallOperator &&
+    (getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>() ||
+    getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>())));
+}
+
+static bool IsInAMPFunction(Scope *scope) {
+  while (scope) {
+    if (scope->getFlags() & Scope::FnScope) {
+      FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(static_cast<DeclContext*>(scope->getEntity()));
+      if (FD && FD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        return true;
+      }
+    }
+    scope = scope->getParent();
+  }
+  return false;
+}
+
+static bool IsInExplicitCPUFunction(Scope *scope) {
+  while (scope) {
+    if (scope->getFlags() & Scope::FnScope) {
+      FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(static_cast<DeclContext*>(scope->getEntity()));
+      if (FD && FD->hasAttr<CXXAMPRestrictCPUAttr>()) {
+        return true;
+      }
+    }
+    scope = scope->getParent();
+  }
+  return false;
+}
+
+// FIXME: is this a reliable approach?
+void Sema::GetCXXAMPParentRestriction(Scope* SC,
+                          bool& ParentCPU, bool& ParentAMP, bool& ParentAUTO) {
+  if(getCurLambda() && getCurLambda()->CallOperator) {
+    ParentCPU = getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>();
+    ParentAMP = getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>();
+    // This will be deduced during 'auto' restriction inference; for now keep overload resolution happy
+    if(getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAUTOAttr>())
+      ParentAUTO = true;
+  }
+  if(!ParentCPU && !ParentAMP) {
+    if(getCurFunctionDecl()) {
+      ParentCPU = getCurFunctionDecl()->hasAttr<CXXAMPRestrictCPUAttr>();
+      ParentAMP = getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>();
+      // This will be deduced during 'auto' restriction inference; for now keep overload resolution happy
+      if(getCurFunctionDecl()->hasAttr<CXXAMPRestrictAUTOAttr>())
+        ParentAUTO = true;
+    }
+  }
+
+  // Determine whether a class member is inside an AMP-restricted context
+  //
+  //  void wrap_test_mem_2() restrict(amp) {
+  //
+  //    struct test_mem_2 {
+  //      decltype(f()) member; // expect: amp_t member
+  //    };
+  //
+  //  }
+  if(!getCurFunctionDecl() && !getCurLambda() && SC) {
+    ParentAMP = SC->isAMPScope();
+    ParentCPU = SC->isCPUScope();
+    ParentAUTO = SC->isAUTOScope();
+    if(IsInAMPFunction(SC))
+      ParentAMP = true;
+    if(IsInExplicitCPUFunction(SC))
+      ParentCPU = true;
+
+    // Reached the end? Then return
+    if(dyn_cast<TranslationUnitDecl>(getFunctionLevelDeclContext()))
+      return;
+  }
+}
+
+static int getCXXAMPPrio(FunctionDecl *Func, bool isDevice,
+  bool ParentCPU, bool ParentAMP, bool ParentAUTO)
+{
+  bool isAMP = Func->hasAttr<CXXAMPRestrictAMPAttr>();
+  bool isCPU = Func->hasAttr<CXXAMPRestrictCPUAttr>();
+  // The callee's 'auto' restriction must already have been inferred here; otherwise
+  // there is no way to resolve its overload without explicit restrictions on it
+  if(Func->hasAttr<CXXAMPRestrictAUTOAttr>()) {
+    llvm::errs() << "The function's restrictions should have been inferred at this point!\n";
+    exit(1);
+  }
+  // Deduce to normal case
+  if(ParentCPU && ParentAMP)
+    ParentAUTO = false;
+
+  int NonAutoSpec = 0;
+  if(ParentAUTO) {
+    NonAutoSpec = clang::CPPAMP_AMP | clang::CPPAMP_CPU;
+    if(ParentCPU)
+      NonAutoSpec &= ~clang::CPPAMP_CPU;
+    if(ParentAMP)
+      NonAutoSpec &= ~clang::CPPAMP_AMP;
+  }
+
+  if (!isAMP) isCPU = true;
+  int Prio = 0;
+  // Specially handle auto restricted caller
+  if(NonAutoSpec) {
+    if (isAMP && isCPU)
+      Prio = 2;
+    else if (isDevice && isAMP) {
+      Prio = 2; // If the caller is CPU only, this callee will be diagnosed later
+    } else if (!isDevice && isCPU) {
+      Prio = 2;
+    }
+    // unreachable
+    else if (!isAMP && !isCPU)
+      Prio = 2;
+
+    return Prio;
+
+  } else {
+    if (isAMP && isCPU)
+      Prio = 2;
+    else if (isDevice && isAMP)
+      Prio = 2; // If the caller is CPU only, this callee will be diagnosed later
+    else if (!isDevice && isCPU) {
+      // FIXME: proposition: use the amp context in the CPU fallback.
+      // If that proposition holds, we should not allow anything explicitly
+      // cpu-restricted in an amp context even on the CPU path, i.e. !isDevice
+      if(ParentAMP && !ParentCPU)
+        return Prio;
+      Prio = 2;
+    } else if (!isDevice && isAMP) {
+      // FIXME: We can still select an amp-restricted function on the CPU path
+      // since we don't emit it during the code generation phase
+      if(ParentAMP && !ParentCPU)
+        return 2;
+    }
+    // unreachable
+    else if (!isAMP && !isCPU)
+      Prio = 2;
+  }
+
+  // Can't resolve
+  //  (1) isDevice && !isAMP
+  //  (2) !isDevice && isAMP
+  return Prio;
+}
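+// Summary of the priorities above: restrict(cpu,amp) candidates always score 2;
+// on the device path only amp-restricted candidates score 2; on the host path
+// cpu-restricted (or unrestricted) candidates score 2. Candidates left at 0
+// lose to higher-priority ones in isBetterOverloadCandidate.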
+
 namespace {
 enum class Comparison { Equal, Better, Worse };
 }
@@ -9131,7 +9356,9 @@ static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1,
 /// candidate is a better candidate than the second (C++ 13.3.3p1).
 bool clang::isBetterOverloadCandidate(
     Sema &S, const OverloadCandidate &Cand1, const OverloadCandidate &Cand2,
-    SourceLocation Loc, OverloadCandidateSet::CandidateSetKind Kind) {
+    SourceLocation Loc, OverloadCandidateSet::CandidateSetKind Kind,
+    bool UserDefinedConversion,
+    Scope* SC) {
   // Define viable functions to be better candidates than non-viable
   // functions.
   if (!Cand2.Viable)
@@ -9139,6 +9366,49 @@ bool clang::isBetterOverloadCandidate(
   else if (!Cand1.Viable)
     return false;
 
+  // C++AMP
+  if (S.getLangOpts().CPlusPlusAMP && Cand1.Function && Cand2.Function) {
+    bool ParentCPUAttr = false;
+    bool ParentAMPAttr = false;
+    bool ParentAUTOAttr = false;
+    S.GetCXXAMPParentRestriction(SC, ParentCPUAttr, ParentAMPAttr, ParentAUTOAttr);
+
+    FunctionDecl *First = Cand1.Function;
+    FunctionDecl *Second = Cand2.Function;
+    if (!First->isImplicit() && !Second->isImplicit()) {
+      int CurPrio = getCXXAMPPrio(First, S.getLangOpts().DevicePath,
+        ParentCPUAttr, ParentAMPAttr, ParentAUTOAttr);
+      int FunPrio = getCXXAMPPrio(Second, S.getLangOpts().DevicePath,
+        ParentCPUAttr, ParentAMPAttr, ParentAUTOAttr);
+      if (CurPrio > FunPrio)
+        return true;
+      if (CurPrio < FunPrio)
+        return false;
+    } else if (!First->isImplicit()) {
+      if(ParentAMPAttr) {
+         // GPU path
+        if (First->hasAttr<CXXAMPRestrictAMPAttr>())
+            return true;
+      } else {
+        // CPU path
+        if (First->hasAttr<CXXAMPRestrictCPUAttr>() || !First->hasAttr<CXXAMPRestrictAMPAttr>())
+          return true;
+      }
+
+      if(!S.getCurFunctionDecl() && !S.getCurLambda()) {
+        if (S.getLangOpts().DevicePath) {
+          // GPU path
+          if (First->hasAttr<CXXAMPRestrictAMPAttr>())
+            return true;
+        } else {
+          // CPU path
+          if (First->hasAttr<CXXAMPRestrictCPUAttr>() || !First->hasAttr<CXXAMPRestrictAMPAttr>())
+            return true;
+        }
+      }
+    }
+  }
+
   // C++ [over.match.best]p1:
   //
   //   -- if F is a static member function, ICS1(F) is defined such
@@ -9409,7 +9679,9 @@ void Sema::diagnoseEquivalentInternalLinkageDeclarations(
 /// \returns The result of overload resolution.
 OverloadingResult
 OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
-                                         iterator &Best) {
+                                         iterator &Best,
+                                         bool UserDefinedConversion,
+                                         Scope* SC) {
   llvm::SmallVector<OverloadCandidate *, 16> Candidates;
   std::transform(begin(), end(), std::back_inserter(Candidates),
                  [](OverloadCandidate &Cand) { return &Cand; });
@@ -9444,7 +9716,8 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
   for (auto *Cand : Candidates)
     if (Cand->Viable)
       if (Best == end() ||
-          isBetterOverloadCandidate(S, *Cand, *Best, Loc, Kind))
+          isBetterOverloadCandidate(S, *Cand, *Best, Loc, Kind,
+                                    UserDefinedConversion, SC))
         Best = Cand;
 
   // If we didn't find any viable functions, abort.
@@ -9457,7 +9730,8 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
   // function. If not, we have an ambiguity.
   for (auto *Cand : Candidates) {
     if (Cand->Viable && Cand != Best &&
-        !isBetterOverloadCandidate(S, *Best, *Cand, Loc, Kind)) {
+        !isBetterOverloadCandidate(S, *Best, *Cand, Loc, Kind,
+                                   UserDefinedConversion, SC)) {
       if (S.isEquivalentInternalLinkageDeclaration(Best->Function,
                                                    Cand->Function)) {
         EquivalentCands.push_back(Cand->Function);
@@ -9477,6 +9751,72 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
     S.diagnoseEquivalentInternalLinkageDeclarations(Loc, Best->Function,
                                                     EquivalentCands);
 
+  // C++ AMP-specific
+  if (S.getLangOpts().CPlusPlusAMP) {
+    // Diagnose err_amp_call_from_both_amp_and_cpu_to_disctint
+    // TODO: Consider implementation-dependent functions, e.g. opencl_fabs.
+#if 0
+    bool ParentHasBoth = false;
+    if(S.getCurLambda() && S.getCurLambda()->CallOperator)
+      ParentHasBoth = S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>() &&
+                                 S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>();
+    else if(S.getCurFunctionDecl())
+      ParentHasBoth = S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>() &&
+                                 S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictCPUAttr>();
+
+    bool BestFoundHasDistinct = false;
+    if(Best->Function)
+      BestFoundHasDistinct = !Best->Function->hasAttr<CXXAMPRestrictAMPAttr>() ||
+                                 !Best->Function->hasAttr<CXXAMPRestrictCPUAttr>();
+    if(ParentHasBoth && BestFoundHasDistinct) {
+      // There are a lot of overloaded, e.g. with different AMP restrictions
+      if(end()- begin() > 1) {
+        for (iterator Cand = begin(); Cand != end(); ++Cand)
+          Cand->Function->dump();
+        S.Diag(Loc, diag::err_amp_call_from_both_amp_and_cpu_to_disctint);
+        }
+      }
+#endif
+    // Implementation dependent
+    if (S.getLangOpts().DevicePath && !S.getLangOpts().AMPCPU) {
+      // in GPU path, check if calling from AMP to CPU
+      bool ParentAMP = false;
+      if(S.getCurFunctionDecl() && S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>())
+        ParentAMP = true;
+      // Suppress the restrictions
+      if(S.getCurLambda() && S.getCurLambda()->CallOperator) {
+        if(S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>())
+          ParentAMP = true;
+        else
+          ParentAMP = false;
+      }
+      if (ParentAMP && Best->Function && Best->Function->hasAttr<CXXAMPRestrictCPUAttr>() &&
+        !Best->Function->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        S.Diag(Loc, diag::err_amp_call_from_amp_to_cpu);
+      }
+    }
+
+    // in CPU path, check if calling from CPU to AMP
+    // SMF's restriction intersections might take place after selecting best function.
+    // Disable the following semantic checking
+    if (0 && !S.getLangOpts().DevicePath) {
+      bool ParentCPU = false;
+      if(S.getCurFunctionDecl() && !S.getCurFunctionDecl()->hasAttr<CXXAMPRestrictAMPAttr>())
+        ParentCPU = true;
+      // Suppress the restrictions
+      if(S.getCurLambda() && S.getCurLambda()->CallOperator) {
+        if(!S.getCurLambda()->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>())
+          ParentCPU = true;
+        else
+          ParentCPU = false;
+      }
+      if (ParentCPU && Best->Function && !Best->Function->hasAttr<CXXAMPRestrictCPUAttr>() &&
+        Best->Function->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        S.Diag(Loc, diag::err_amp_call_from_cpu_to_amp);
+      }
+    }
+  }
+
   return OR_Success;
 }
 
@@ -11238,8 +11578,38 @@ private:
     if (!S.checkAddressOfFunctionIsAvailable(Specialization))
       return false;
 
-    Matches.push_back(std::make_pair(CurAccessFunPair, Specialization));
-    return true;
+    // C++AMP
+    if (S.getLangOpts().CPlusPlusAMP) {
+      FunctionDecl *Current = S.getCurFunctionDecl();
+
+      // fall back to normal non C++ AMP logic in case we are not in a FunctionDecl
+      if (!Current) {
+        Matches.push_back(std::make_pair(CurAccessFunPair, Specialization));
+        return true;
+      }
+
+      bool hasAMP = Current->hasAttr<CXXAMPRestrictAMPAttr>();
+      bool hasCPU = Current->hasAttr<CXXAMPRestrictCPUAttr>();
+
+      if (!hasAMP) {
+        hasCPU = true;
+      }
+
+      if (hasAMP && FunctionTemplate->hasAttr<CXXAMPRestrictAMPAttr>()) {
+        Matches.push_back(std::make_pair(CurAccessFunPair, Specialization));
+        return true;
+      }
+
+      if (hasCPU && (FunctionTemplate->hasAttr<CXXAMPRestrictCPUAttr>() || !FunctionTemplate->hasAttr<CXXAMPRestrictAMPAttr>())) {
+        Matches.push_back(std::make_pair(CurAccessFunPair, Specialization));
+        return true;
+      }
+
+      return false;
+    } else { // non C++ AMP
+      Matches.push_back(std::make_pair(CurAccessFunPair, Specialization));
+      return true;
+    }
   }
 
   bool AddMatchingNonTemplateFunction(NamedDecl* Fn,
@@ -11278,10 +11648,43 @@ private:
       // If we're in C, we need to support types that aren't exactly identical.
       if (!S.getLangOpts().CPlusPlus ||
           candidateHasExactlyCorrectType(FunDecl)) {
-        Matches.push_back(std::make_pair(
+        // C++AMP
+        if (S.getLangOpts().CPlusPlusAMP) {
+          FunctionDecl *Current = S.getCurFunctionDecl();
+
+          // fall back to normal non C++ AMP logic in case we are not in a FunctionDecl
+          if (!Current) {
+            Matches.push_back(std::make_pair(CurAccessFunPair,
+              cast<FunctionDecl>(FunDecl->getCanonicalDecl())));
+            FoundNonTemplateFunction = true;
+            return true;
+          }
+
+          bool hasAMP = Current->hasAttr<CXXAMPRestrictAMPAttr>();
+          bool hasCPU = Current->hasAttr<CXXAMPRestrictCPUAttr>();
+          if (!hasAMP)
+            hasCPU = true;
+
+          if (hasAMP && FunDecl->hasAttr<CXXAMPRestrictAMPAttr>()) {
+            Matches.push_back(std::make_pair(CurAccessFunPair,
+              cast<FunctionDecl>(FunDecl->getCanonicalDecl())));
+            FoundNonTemplateFunction = true;
+            return true;
+          }
+
+          if (hasCPU && (FunDecl->hasAttr<CXXAMPRestrictCPUAttr>() || !FunDecl->hasAttr<CXXAMPRestrictAMPAttr>())) {
+            Matches.push_back(std::make_pair(CurAccessFunPair,
+              cast<FunctionDecl>(FunDecl->getCanonicalDecl())));
+            FoundNonTemplateFunction = true;
+            return true;
+          }
+          return false;
+        } else { // non C++ AMP
+          Matches.push_back(std::make_pair(
             CurAccessFunPair, cast<FunctionDecl>(FunDecl->getCanonicalDecl())));
-        FoundNonTemplateFunction = true;
-        return true;
+          FoundNonTemplateFunction = true;
+          return true;
+        }
       }
     }
 
@@ -12071,6 +12474,90 @@ BuildRecoveryCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
                                RParenLoc);
 }
 
+static FunctionDecl *getBestCandidateForHIP(Sema &S,
+                                            UnresolvedLookupExpr *ULE,
+                                            MultiExprArg Args) {
+  OverloadCandidateSet CandidateSet{ULE->getBeginLoc(),
+                                    OverloadCandidateSet::CSK_Normal};
+  S.AddOverloadedCallCandidates(ULE, Args, CandidateSet);
+
+  if (CandidateSet.empty()) return nullptr;
+
+  auto It = CandidateSet.end();
+  CandidateSet.BestViableFunction(S, ULE->getBeginLoc(), It);
+
+  if (It != CandidateSet.end()) return It->Function;
+
+  It = std::min_element(CandidateSet.begin(), CandidateSet.end(),
+                        [&](const OverloadCandidate &C0,
+                            const OverloadCandidate &C1) {
+    unsigned int Cnt0 = 0;
+    unsigned int Cnt1 = 0;
+
+    for (decltype(Args.size()) I = 0; I != Args.size(); ++I) {
+      Cnt0 += C0.Function->parameters()[I]->getType() != Args[I]->getType();
+      Cnt1 += C1.Function->parameters()[I]->getType() != Args[I]->getType();
+    }
+
+    return Cnt0 < Cnt1;
+  });
+
+  return It->Function;
+}
+
+static void maybeCastArgsForHIPGlobalFunction(Sema &S,
+                                              UnresolvedLookupExpr *ULE,
+                                              MultiExprArg Args) {
+  static constexpr const char HIPLaunch[]{"hipLaunchKernelGGL"};
+
+  if (ULE->getName().getAsString().find(HIPLaunch) == std::string::npos) {
+    return;
+  }
+
+  auto F = Args.front();
+  while (!isa<UnresolvedLookupExpr>(F)) {
+    ParenExpr *PE = dyn_cast<ParenExpr>(F);
+    if (!PE)
+      return;
+    F = PE->getSubExpr();
+  }
+
+  static constexpr unsigned int IgnoreCnt{5u}; // Skip launch configuration.
+
+  FunctionDecl *FD =
+    getBestCandidateForHIP(S, cast<UnresolvedLookupExpr>(F),
+                           MultiExprArg{Args.begin() + IgnoreCnt, Args.end()});
+
+  if (!FD) return;
+
+  std::transform(FD->param_begin(), FD->param_end(), Args.begin() + IgnoreCnt,
+                 Args.begin() + IgnoreCnt,
+                 [&](const ParmVarDecl *Formal, Expr *Actual) {
+    QualType FormalT = Formal->getType();
+    QualType ActualT = Actual->getType();
+
+    if (FormalT == ActualT) return Actual;
+    if (FormalT->isReferenceType()) return Actual;
+
+    CastKind CK = CK_NoOp; // covers the pointer case and any type pair not handled below
+    if (FormalT->isIntegerType()) {
+      if (ActualT->isIntegerType()) CK = CK_IntegralCast;
+      if (ActualT->isFloatingType()) CK = CK_FloatingToIntegral;
+    }
+    if (FormalT->isFloatingType()) {
+      if (ActualT->isIntegerType()) CK = CK_IntegralToFloating;
+      if (ActualT->isFloatingType()) CK = CK_FloatingCast;
+    }
+    // TODO: this does not handle UDTs convertible via a ctor or via a
+    //       user-defined conversion operator, since it is unclear whether
+    //       that is a valid case for a __global__ function.
+
+    return cast<Expr>(ImplicitCastExpr::Create(S.Context, FormalT, CK, Actual,
+                                               nullptr, VK_XValue));
+  });
+}
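+// Illustrative call shape (names assumed) this transform targets: for a kernel
+//
+//   __global__ void k(size_t n, float x);
+//
+// hipLaunchKernelGGL(k, blocks, threads, 0, 0, 1, 2.0) gets its trailing
+// arguments implicitly cast to size_t and float to match k's parameters; the
+// leading five arguments (the kernel and its launch configuration) are skipped.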
+
 /// Constructs and populates an OverloadedCandidateSet from
 /// the given function.
 /// \returns true when an the ExprResult output parameter has been set.
@@ -12099,6 +12586,10 @@ bool Sema::buildOverloadedCallSet(Scope *S, Expr *Fn,
   }
 #endif
 
+  if (getLangOpts().CPlusPlusAMP) {
+    maybeCastArgsForHIPGlobalFunction(*this, ULE, Args);
+  }
+
   UnbridgedCastsSet UnbridgedCasts;
   if (checkArgPlaceholdersForOverload(*this, Args, UnbridgedCasts)) {
     *Result = ExprError();
@@ -12138,6 +12629,130 @@ bool Sema::buildOverloadedCallSet(Scope *S, Expr *Fn,
   return false;
 }
 
+void Sema::DiagnoseCXXAMPOverloadedCallExpr(SourceLocation LParenLoc,
+                                            FunctionDecl* Callee) {
+  if(!Callee || Callee->isConstexpr() || Callee->getBuiltinID() != 0u)
+    return;
+
+  if(Callee->getQualifiedNameAsString().find("std::")!=std::string::npos)
+    return;
+
+  FunctionDecl* Caller = this->getCurFunctionDecl();
+  LambdaScopeInfo* LambdaInfo = this->getCurLambda();
+  bool CallerAMP = (LambdaInfo && LambdaInfo->CallOperator) ?
+    LambdaInfo->CallOperator->hasAttr<CXXAMPRestrictAMPAttr>() :
+    (Caller ? Caller->hasAttr<CXXAMPRestrictAMPAttr>() : false);
+  bool CallerCPU = (LambdaInfo && LambdaInfo->CallOperator) ?
+    LambdaInfo->CallOperator->hasAttr<CXXAMPRestrictCPUAttr>() :
+    (Caller ? Caller->hasAttr<CXXAMPRestrictCPUAttr>() : false);
+  bool CalleeAMP = Callee->hasAttr<CXXAMPRestrictAMPAttr>();
+  bool CalleeCPU = Callee->hasAttr<CXXAMPRestrictCPUAttr>();
+
+  // Logic for auto-compile-for-accelerator:
+  // In device path, if auto-compile-for-accelerator flag is on,
+  // and caller has GPU attribute (CXXAMPRestrictAMPAttr),
+  // and callee function doesn't have GPU attribute (CXXAMPRestrictAMPAttr),
+  // and callee function is a global function, or a static function,
+  // then annotate it with one, and recalculate related boolean flags
+  if (getLangOpts().DevicePath && getLangOpts().AutoCompileForAccelerator) {
+    if ((CallerAMP && !CalleeAMP) &&
+        (Callee->isGlobal() || Callee->getStorageClass() == SC_Static)) {
+      //llvm::errs() << "add [[hc]] to callee: " << Callee->getName() << "\n";
+      Callee->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Callee->getLocation(), Context, 0));
+      CalleeAMP = Callee->hasAttr<CXXAMPRestrictAMPAttr>();
+    }
+  }
+
+  // Case by case
+  if (LambdaInfo && LambdaInfo->CallOperator && !getLangOpts().AMPCPU) {
+    // caller: __GPU, lambda; callee: non __GPU, global
+    //    void f(int &flag) { flag = 1; }
+    //    auto l = [](int &flag) __GPU {
+    //      f(flag);  // Error
+    //    };
+    if(getLangOpts().DevicePath && Callee->isGlobal() && (CallerAMP && CallerCPU) &&
+      (!CalleeAMP && !CalleeCPU))
+      // FIXME: Need a mangled lambda name as '<lambda_xxxxxID> operator()'
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString()
+        <<  LambdaInfo->CallOperator->getQualifiedNameAsString();
+
+    if(getLangOpts().DevicePath && CallerAMP && !CalleeAMP)
+      // FIXME: Need a mangled lambda name as '<lambda_xxxxxID> operator()'
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString()
+        <<  LambdaInfo->CallOperator->getQualifiedNameAsString();
+
+    // caller: CPU_Only; callee: has GPU
+    if(!getLangOpts().DevicePath && (!CallerAMP && CalleeAMP && !CalleeCPU))
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString()
+        <<  LambdaInfo->CallOperator->getQualifiedNameAsString();
+
+  }
+  else if(Caller && !(LambdaInfo && LambdaInfo->CallOperator) && !getLangOpts().AMPCPU) {
+    // caller: __GPU, global; callee: non __GPU, global
+    //    void fooxxx(int &flag) { flag = 1; }
+    //    bool test() __GPU {
+    //      int flag = 0;
+    //      fooxxx(flag);  // Error
+    //    }
+    if(getLangOpts().DevicePath && Caller->isGlobal() && Callee->isGlobal() &&
+      (CallerAMP && CallerCPU) && (!CalleeAMP && !CalleeCPU) )
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString() << Caller->getNameAsString();
+
+    // caller: __GPU, global; callee: non __GPU, file static
+    //    static void fooxxx(int &flag) { flag = 1; }
+    //    bool test() __GPU {
+    //      int flag = 0;
+    //      fooxxx(flag);  // Error
+    //    }
+    if(getLangOpts().DevicePath && Caller->isGlobal() && Callee->getStorageClass() == SC_Static &&
+      (CallerAMP && CallerCPU) && (!CalleeAMP && !CalleeCPU))
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString() << Caller->getNameAsString();
+
+    // caller: __GPU, member; callee: non __GPU, global
+    //    void foo(int &flag) { flag = 1; }
+    //    void foo(int &flag) __GPU {
+    //       ::foo(flag);    // Error
+    //    }
+    if(getLangOpts().DevicePath && Callee->isGlobal() && dyn_cast<CXXMethodDecl>(Caller) &&
+      (CallerAMP && CallerCPU) && (!CalleeAMP && !CalleeCPU))
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString() << Caller->getNameAsString();
+
+    // Handle SMF case by case
+    // Empty class with base class having user-defined default ctor
+    //    struct A2_base {
+    //      A2_base() restrict(cpu) {}
+    //    };
+    //     class A2 : public A2_base {
+    //        // defaulted: A2() restrict(cpu)
+    //     }
+    //
+    //  void test() restrict(amp) {
+    //        A2 a2;     // Error test() is amp restricted, while A2() is cpu restricted
+    //  }
+    CXXMethodDecl* CM = dyn_cast<CXXMethodDecl>(Callee);
+    if((dyn_cast<CXXConstructorDecl>(Callee) || dyn_cast<CXXDestructorDecl>(Callee) ||
+      (CM && CM->isCopyAssignmentOperator())) &&
+      (((CallerAMP && !CallerCPU) && (CalleeCPU && !CalleeAMP)) ||
+      ((!CallerAMP && CallerCPU) && (!CalleeCPU && CalleeAMP))))
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString() << Caller->getQualifiedNameAsString();
+
+    // caller: CPU_Only or non __GPU; callee: GPU_Only
+    // Note that the GPU path is already checked during overload resolution; we only
+    // check the CPU path here, right after that.
+    if(!getLangOpts().DevicePath && (!CallerAMP) && (CalleeAMP && !CalleeCPU))
+      Diag(LParenLoc, diag::err_amp_overloaded_member_function)
+        << Callee->getQualifiedNameAsString() << Caller->getNameAsString();
+  }
+
+}
+
 /// FinishOverloadedCallExpr - given an OverloadCandidateSet, builds and returns
 /// the completed call expression. If overload resolution fails, emits
 /// diagnostics and returns ExprError()
@@ -12162,6 +12777,10 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
     SemaRef.CheckUnresolvedLookupAccess(ULE, (*Best)->FoundDecl);
     if (SemaRef.DiagnoseUseOfDecl(FDecl, ULE->getNameLoc()))
       return ExprError();
+    // C++AMP
+    if(SemaRef.getLangOpts().CPlusPlusAMP)
+      SemaRef.DiagnoseCXXAMPOverloadedCallExpr(LParenLoc, FDecl);
+
     Fn = SemaRef.FixOverloadedFunctionReference(Fn, (*Best)->FoundDecl, FDecl);
     return SemaRef.BuildResolvedCallExpr(Fn, FDecl, LParenLoc, Args, RParenLoc,
                                          ExecConfig, /*IsExecConfig=*/false,
@@ -12271,7 +12890,7 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn,
 
   OverloadCandidateSet::iterator Best;
   OverloadingResult OverloadResult =
-      CandidateSet.BestViableFunction(*this, Fn->getBeginLoc(), Best);
+      CandidateSet.BestViableFunction(*this, Fn->getBeginLoc(), Best, false, S);
 
   return FinishOverloadedCallExpr(*this, S, Fn, ULE, LParenLoc, Args, RParenLoc,
                                   ExecConfig, &CandidateSet, &Best,
@@ -13425,6 +14044,10 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj,
 
   // Build the full argument list for the method call (the implicit object
   // parameter is placed at the beginning of the list).
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP && Method && Method->getParent()->isLambda())
+    DiagnoseCXXAMPMethodCallExpr(LParenLoc, Method);
+
   SmallVector<Expr *, 8> MethodArgs(NumArgsSlots);
 
   bool IsError = false;
diff --git a/lib/Sema/SemaStmtAsm.cpp b/lib/Sema/SemaStmtAsm.cpp
index 9b051e02d1..84b0330f32 100644
--- a/lib/Sema/SemaStmtAsm.cpp
+++ b/lib/Sema/SemaStmtAsm.cpp
@@ -255,6 +255,24 @@ StmtResult Sema::ActOnGCCAsmStmt(SourceLocation AsmLoc, bool IsSimple,
   // The parser verifies that there is a string literal here.
   assert(AsmString->isAscii());
 
+  // If we're compiling a CUDA file and the function's attributes indicate that it's
+  // not for this side of the compilation, skip all the checks.
+  if (!DeclAttrsMatchCUDAMode(getLangOpts(), getCurFunctionDecl())) {
+    GCCAsmStmt *NS = new (Context) GCCAsmStmt(
+        Context, AsmLoc, IsSimple, IsVolatile, NumOutputs, NumInputs, Names,
+        Constraints, Exprs.data(), AsmString, NumClobbers, Clobbers, NumLabels, RParenLoc);
+    return NS;
+  }
+
+  // If we're compiling an HCC file and the function's attributes indicate that it's
+  // not for this side of the compilation, skip all the checks.
+  if (!DeclAttrsMatchHCCMode(getLangOpts(), getCurFunctionDecl())) {
+    GCCAsmStmt *NS = new (Context) GCCAsmStmt(
+        Context, AsmLoc, IsSimple, IsVolatile, NumOutputs, NumInputs, Names,
+        Constraints, Exprs.data(), AsmString, NumClobbers, Clobbers, NumLabels, RParenLoc);
+    return NS;
+  }
+
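+  // Net effect (illustrative): an asm statement in a function attributed for
+  // the other side of the compilation (e.g. a host-only function while
+  // compiling the device path) is wrapped into a GCCAsmStmt verbatim; its
+  // constraints are only validated on the side that actually emits it.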
   for (unsigned i = 0; i != NumOutputs; i++) {
     StringLiteral *Literal = Constraints[i];
     assert(Literal->isAscii());
diff --git a/lib/Sema/SemaTemplate.cpp b/lib/Sema/SemaTemplate.cpp
index 135ca2b25c..27036855fa 100644
--- a/lib/Sema/SemaTemplate.cpp
+++ b/lib/Sema/SemaTemplate.cpp
@@ -4830,6 +4830,37 @@ Sema::SubstDefaultTemplateArgumentIfAvailable(TemplateDecl *Template,
                 TempTempParm->getDefaultArgument().getTemplateNameLoc());
 }
 
+void Sema::DiagnoseCXXAMPTemplateArgument(NamedDecl *Param,
+                                          const TemplateArgumentLoc &AL,
+                                          NamedDecl *Template,
+                                          SourceLocation TemplateLoc) {
+  if(!Param || !Template)
+    return;
+
+  // Check array's template type parameters.
+  IdentifierInfo* Info = Template->getIdentifier();
+  if(Info && Info->isStr("array") &&
+    Template->getQualifiedNameAsString().find("Concurrency::array")!=std::string::npos) {
+    // For a declaration:
+    //         template<typename T, int N> class array;
+    // And a usage,
+    //         array<Axxxx, 1>
+    //
+    // Param is related to 'T'
+    // Arg is related to 'Axxxx'
+    // Template is related to 'array'
+
+    if (dyn_cast<TemplateTypeParmDecl>(Param)) {
+      const TemplateArgument &Arg = AL.getArgument();
+      QualType ArgType = Context.getCanonicalType(Arg.getAsType());
+      const Type* Ty = ArgType.getTypePtrOrNull();
+      if(IsIncompatibleType(Ty, true))
+        Diag(AL.getLocation(), diag::err_amp_type_unsupported)
+            << ArgType.getAsString();
+    }
+  }
+}
+
 /// Convert a template-argument that we parsed as a type into a template, if
 /// possible. C++ permits injected-class-names to perform dual service as
 /// template template arguments and as template type arguments.
@@ -4897,6 +4928,11 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param,
                                  unsigned ArgumentPackIndex,
                             SmallVectorImpl<TemplateArgument> &Converted,
                                  CheckTemplateArgumentKind CTAK) {
+  // C++AMP
+  if(getLangOpts().CPlusPlusAMP && Template) {
+    DiagnoseCXXAMPTemplateArgument(Param, Arg, Template, TemplateLoc);
+  }
+
   // Check template type parameters.
   if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(Param))
     return CheckTemplateTypeArgument(TTP, Arg, Converted);
@@ -8451,6 +8487,34 @@ bool Sema::CheckFunctionTemplateSpecialization(
         continue;
       }
 
+      // C++ AMP
+      // Check if the specialization has the same or more restriction specifiers.
+      // Truth table (row: restriction specifier of the input; column: restriction specifier of the candidate):
+      // +---------+------+-----+-----+---------+
+      // |         | none | cpu | amp | cpu/amp |
+      // +---------+------+-----+-----+---------+
+      // | none    |  OK  |  OK |  NG |  OK     |
+      // +---------+------+-----+-----+---------+
+      // | cpu     |  OK  |  OK |  NG |  OK     |
+      // +---------+------+-----+-----+---------+
+      // | amp     |  NG  |  NG |  OK |  OK     |
+      // +---------+------+-----+-----+---------+
+      // | cpu/amp |  NG  |  NG |  NG |  OK     |
+      // +---------+------+-----+-----+---------+
+      if (getLangOpts().CPlusPlusAMP) {
+        if (FD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+          if (!Specialization->hasAttr<CXXAMPRestrictAMPAttr>()) {
+            continue;
+          } else if (FD->hasAttr<CXXAMPRestrictCPUAttr>() && !Specialization->hasAttr<CXXAMPRestrictCPUAttr>()) {
+            continue;
+          }
+        } else {
+          if (Specialization->hasAttr<CXXAMPRestrictAMPAttr>() && !Specialization->hasAttr<CXXAMPRestrictCPUAttr>()) {
+            continue;
+          }
+        }
+      }
+
       // Record this candidate.
       if (ExplicitTemplateArgs)
         ConvertedTemplateArgs[Specialization] = std::move(Args);
@@ -8563,6 +8627,22 @@ bool Sema::CheckFunctionTemplateSpecialization(
     MarkUnusedFileScopedDecl(Specialization);
   }
 
+  // C++ AMP
+  if (getLangOpts().CPlusPlusAMP) {
+    SourceLocation Loc = FD->getLocation();
+    if (FD->hasAttr<CXXAMPRestrictAMPAttr>()) {
+      if (!Specialization->hasAttr<CXXAMPRestrictAMPAttr>())
+        Specialization->addAttr(::new (Context) CXXAMPRestrictAMPAttr(Loc, Context, 0));
+    } else
+      Specialization->dropAttr<CXXAMPRestrictAMPAttr>();
+
+    if (FD->hasAttr<CXXAMPRestrictCPUAttr>()) {
+      if (!Specialization->hasAttr<CXXAMPRestrictCPUAttr>())
+        Specialization->addAttr(::new (Context) CXXAMPRestrictCPUAttr(Loc, Context, 0));
+    } else
+      Specialization->dropAttr<CXXAMPRestrictCPUAttr>();
+  }
+
   // Turn the given function declaration into a function template
   // specialization, with the template arguments from the previous
   // specialization.
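
Read together, the two hunks above implement the truth table twice over: first as candidate filtering during specialization matching, then as mirroring of the amp/cpu attributes from the declared specialization onto the chosen candidate. A hedged illustration in C++AMP restrict syntax (function names are made up):

```cpp
template <typename T> void f(T) restrict(cpu, amp) {}

// OK per the table (row cpu/amp, column cpu/amp): the specialization carries
// the same restriction set as the primary template, so the candidate is kept
// and the attributes are propagated onto it.
template <> void f<int>(int) restrict(cpu, amp) {}

template <typename T> void g(T) restrict(cpu) {}

// NG per the table (row amp, column cpu): an amp-only specialization cannot
// specialize a cpu-only primary template, so the declaration below would be
// filtered out of the candidate set.
// template <> void g<int>(int) restrict(amp) {}
```
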
diff --git a/lib/Sema/SemaTemplateInstantiateDecl.cpp b/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 67343d11d3..903e2054bc 100644
--- a/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1690,6 +1690,31 @@ static QualType adjustFunctionTypeForInstantiation(ASTContext &Context,
                                  NewFunc->getParamTypes(), NewEPI);
 }
 
+static void MarkByValueRecordsPassedToHIPGlobalFN(FunctionDecl *FDecl)
+{ // TODO: this is a temporary kludge; a preferable solution shall be provided
+  //       in the future, which shall eschew FE involvement.
+  static constexpr const char HIPLaunch[]{"hipLaunchKernelGGL"};
+
+  if (!FDecl) return;
+  if (FDecl->getDeclName().isIdentifier() &&
+    FDecl->getNameAsString().find(HIPLaunch) == std::string::npos) return;
+
+  for (auto &&Parameter : FDecl->parameters()) {
+    if (Parameter->getOriginalType()->isPointerType()) continue;
+    if (Parameter->getOriginalType()->isReferenceType()) continue;
+    if (!Parameter->getOriginalType()->isRecordType()) continue;
+
+    if (auto RD = Parameter->getOriginalType()->getAsCXXRecordDecl()) {
+      if (RD->hasAttr<PackedAttr>()) continue; // Spurious for lambdas.
+      if (!RD->isLambda()) continue;
+
+      static constexpr const char HIPKernargRecord[]{"__HIP_KERNARG_RECORD__"};
+      RD->addAttr(
+        AnnotateAttr::CreateImplicit(RD->getASTContext(), HIPKernargRecord));
+    }
+  }
+}
+
 /// Normal class members are of more specific types and therefore
 /// don't make it here.  This function serves three purposes:
 ///   1) instantiating function templates
@@ -1991,6 +2016,13 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl(FunctionDecl *D,
     PrincipalDecl->setNonMemberOperator();
 
   assert(!D->isDefaulted() && "only methods should be defaulted");
+
+
+  if (SemaRef.getLangOpts().CPlusPlusAMP) {
+    // TODO: kludge warning, to be removed.
+    MarkByValueRecordsPassedToHIPGlobalFN(Function);
+  }
+
   return Function;
 }
 
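
For context, MarkByValueRecordsPassedToHIPGlobalFN fires when a function template whose name contains hipLaunchKernelGGL is instantiated: every non-packed lambda closure passed by value gets an implicit annotate("__HIP_KERNARG_RECORD__"), so later stages can recognize it as a kernel-argument record. A sketch of the call shape it targets; the hipLaunchKernelGGL declaration below is an illustrative stand-in, not the real HIP header:

```cpp
// Illustrative stand-in for the HIP launch helper.
template <typename F, typename... Args>
void hipLaunchKernelGGL(F kernel, unsigned blocks, unsigned threads,
                        unsigned sharedMem, void *stream, Args... args);

void launch(float *out) {
  // The closure type of this lambda becomes a by-value record parameter of
  // the instantiated hipLaunchKernelGGL, so its CXXRecordDecl receives the
  // __HIP_KERNARG_RECORD__ annotation during instantiation.
  auto k = [](float *p) { *p = 42.0f; };
  hipLaunchKernelGGL(k, 1u, 64u, 0u, nullptr, out);
}
```
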
diff --git a/lib/Sema/SemaType.cpp b/lib/Sema/SemaType.cpp
index 2b9d06814d..57908817e7 100644
--- a/lib/Sema/SemaType.cpp
+++ b/lib/Sema/SemaType.cpp
@@ -7572,6 +7572,11 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type,
           << attr.getName();
       break;
 
+    case ParsedAttr::AT_HC_CPU:
+    case ParsedAttr::AT_HC_HC:
+    case ParsedAttr::AT_AMDGPUWavesPerEU:
+    case ParsedAttr::AT_AMDGPUFlatWorkGroupSize:
+    case ParsedAttr::AT_AMDGPUMaxWorkGroupDim:
     case ParsedAttr::IgnoredAttribute:
       break;
 
diff --git a/lib/Sema/StmtResInfer.cpp b/lib/Sema/StmtResInfer.cpp
new file mode 100755
index 0000000000..5a30b12566
--- /dev/null
+++ b/lib/Sema/StmtResInfer.cpp
@@ -0,0 +1,1052 @@
+//===--- StmtResInfer.cpp - Restriction inference for auto-restricted decls ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Sema::TryCXXAMPRestrictionInferring method, which
+// tries to automatically infer the eligible non-auto restrictions for a
+// C++AMP auto-restricted FunctionDecl.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Sema/SemaInternal.h"
+#include "TypeLocBuilder.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/CXXInheritance.h"
+#include "clang/AST/CharUnits.h"
+#include "clang/AST/CommentDiagnostic.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/EvaluatedExprVisitor.h"
+#include "clang/AST/ExprCXX.h"
+#include "clang/AST/StmtCXX.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Sema/Scope.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/CommentVisitor.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclVisitor.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Module.h"
+#include "clang/AST/DeclVisitor.h"
+#include "clang/AST/StmtVisitor.h"
+using namespace clang::comments;
+
+using namespace clang;
+using namespace sema;
+
+
+//===----------------------------------------------------------------------===//
+// StmtResInfer Visitor
+//===----------------------------------------------------------------------===//
+
+namespace {
+  class StmtResInfer
+      : public ConstDeclVisitor<StmtResInfer>, public ConstStmtVisitor<StmtResInfer> {
+    const SourceManager *SM;
+    unsigned CppAMPSpec;
+    Sema& TheSema;
+
+  private:
+    inline void ClearResAMP() { CppAMPSpec &= ~CPPAMP_AMP; }
+    inline void ClearResCPU() { CppAMPSpec &= ~CPPAMP_CPU; }
+
+  public:
+    StmtResInfer(Sema& S, unsigned& NonAutoSpec, const SourceManager *SM) 
+      : SM(SM), CppAMPSpec(NonAutoSpec), TheSema(S) { }
+
+    ~StmtResInfer() {
+    }
+    unsigned Infer(const Stmt* Node);
+
+//private:
+    void dumpDecl(const Decl *D);
+    void dumpStmt(const Stmt *S);
+    void dumpFullComment(const FullComment *C);
+
+    // Formatting
+    void indent();
+    void unindent();
+    void lastChild();
+    bool hasMoreChildren();
+    void setMoreChildren(bool Value);
+
+    // Utilities
+    void dumpPointer(const void *Ptr);
+    void dumpSourceRange(SourceRange R);
+    void dumpLocation(SourceLocation Loc);
+    void dumpBareType(QualType T);
+    void dumpType(QualType T);
+    void dumpBareDeclRef(const Decl *Node);
+    void dumpDeclRef(const Decl *Node, const char *Label = 0);
+    void dumpName(const NamedDecl *D);
+    bool hasNodes(const DeclContext *DC);
+    void dumpDeclContext(const DeclContext *DC);
+    void dumpAttr(const Attr *A);
+
+    // C++ Utilities
+    void dumpAccessSpecifier(AccessSpecifier AS);
+    void dumpCXXCtorInitializer(const CXXCtorInitializer *Init);
+    void dumpTemplateParameters(const TemplateParameterList *TPL);
+    void dumpTemplateArgumentListInfo(const TemplateArgumentListInfo &TALI);
+    void dumpTemplateArgumentLoc(const TemplateArgumentLoc &A);
+    void dumpTemplateArgumentList(const TemplateArgumentList &TAL);
+    void dumpTemplateArgument(const TemplateArgument &A,
+                              SourceRange R = SourceRange());
+
+    // Decls
+    void VisitLabelDecl(const LabelDecl *D);
+    void VisitTypedefDecl(const TypedefDecl *D);
+    void VisitEnumDecl(const EnumDecl *D);
+    void VisitRecordDecl(const RecordDecl *D);
+    void VisitEnumConstantDecl(const EnumConstantDecl *D);
+    void VisitIndirectFieldDecl(const IndirectFieldDecl *D);
+    void VisitFunctionDecl(const FunctionDecl *D);
+    void VisitFieldDecl(const FieldDecl *D);
+    void VisitVarDecl(const VarDecl *D);
+    void VisitFileScopeAsmDecl(const FileScopeAsmDecl *D);
+    void VisitImportDecl(const ImportDecl *D);
+
+    // C++ Decls
+    void VisitNamespaceDecl(const NamespaceDecl *D);
+    void VisitUsingDirectiveDecl(const UsingDirectiveDecl *D);
+    void VisitNamespaceAliasDecl(const NamespaceAliasDecl *D);
+    void VisitTypeAliasDecl(const TypeAliasDecl *D);
+    void VisitTypeAliasTemplateDecl(const TypeAliasTemplateDecl *D);
+    void VisitCXXRecordDecl(const CXXRecordDecl *D);
+    void VisitStaticAssertDecl(const StaticAssertDecl *D);
+    void VisitFunctionTemplateDecl(const FunctionTemplateDecl *D);
+    void VisitClassTemplateDecl(const ClassTemplateDecl *D);
+    void VisitClassTemplateSpecializationDecl(
+        const ClassTemplateSpecializationDecl *D);
+    void VisitClassTemplatePartialSpecializationDecl(
+        const ClassTemplatePartialSpecializationDecl *D);
+    void VisitClassScopeFunctionSpecializationDecl(
+        const ClassScopeFunctionSpecializationDecl *D);
+    void VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *D);
+    void VisitNonTypeTemplateParmDecl(const NonTypeTemplateParmDecl *D);
+    void VisitTemplateTemplateParmDecl(const TemplateTemplateParmDecl *D);
+    void VisitUsingDecl(const UsingDecl *D);
+    void VisitUnresolvedUsingTypenameDecl(const UnresolvedUsingTypenameDecl *D);
+    void VisitUnresolvedUsingValueDecl(const UnresolvedUsingValueDecl *D);
+    void VisitUsingShadowDecl(const UsingShadowDecl *D);
+    void VisitLinkageSpecDecl(const LinkageSpecDecl *D);
+    void VisitAccessSpecDecl(const AccessSpecDecl *D);
+    void VisitFriendDecl(const FriendDecl *D);
+
+    // ObjC Decls
+    void VisitObjCIvarDecl(const ObjCIvarDecl *D);
+    void VisitObjCMethodDecl(const ObjCMethodDecl *D);
+    void VisitObjCCategoryDecl(const ObjCCategoryDecl *D);
+    void VisitObjCCategoryImplDecl(const ObjCCategoryImplDecl *D);
+    void VisitObjCProtocolDecl(const ObjCProtocolDecl *D);
+    void VisitObjCInterfaceDecl(const ObjCInterfaceDecl *D);
+    void VisitObjCImplementationDecl(const ObjCImplementationDecl *D);
+    void VisitObjCCompatibleAliasDecl(const ObjCCompatibleAliasDecl *D);
+    void VisitObjCPropertyDecl(const ObjCPropertyDecl *D);
+    void VisitObjCPropertyImplDecl(const ObjCPropertyImplDecl *D);
+    void VisitBlockDecl(const BlockDecl *D);
+
+    // Stmts.
+    void VisitStmt(const Stmt *Node);
+    void VisitDeclStmt(const DeclStmt *Node);
+    void VisitAttributedStmt(const AttributedStmt *Node);
+    void VisitLabelStmt(const LabelStmt *Node);
+    void VisitGotoStmt(const GotoStmt *Node);
+    void VisitCXXTryStmt(const CXXTryStmt* Node);
+    
+    // Exprs
+    void VisitExpr(const Expr *Node);
+    void VisitCastExpr(const CastExpr *Node);
+    void VisitDeclRefExpr(const DeclRefExpr *Node);
+    void VisitPredefinedExpr(const PredefinedExpr *Node);
+    void VisitCharacterLiteral(const CharacterLiteral *Node);
+    void VisitIntegerLiteral(const IntegerLiteral *Node);
+    void VisitFloatingLiteral(const FloatingLiteral *Node);
+    void VisitStringLiteral(const StringLiteral *Str);
+    void VisitUnaryOperator(const UnaryOperator *Node);
+    void VisitUnaryExprOrTypeTraitExpr(const UnaryExprOrTypeTraitExpr *Node);
+    void VisitMemberExpr(const MemberExpr *Node);
+    void VisitExtVectorElementExpr(const ExtVectorElementExpr *Node);
+    void VisitBinaryOperator(const BinaryOperator *Node);
+    void VisitCompoundAssignOperator(const CompoundAssignOperator *Node);
+    void VisitAddrLabelExpr(const AddrLabelExpr *Node);
+    void VisitBlockExpr(const BlockExpr *Node);
+    void VisitOpaqueValueExpr(const OpaqueValueExpr *Node);
+
+    // C++
+    void VisitCXXNamedCastExpr(const CXXNamedCastExpr *Node);
+    void VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *Node);
+    void VisitCXXThisExpr(const CXXThisExpr *Node);
+    void VisitCXXThrowExpr(const CXXThrowExpr *Node);
+    void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *Node);
+    void VisitCXXConstructExpr(const CXXConstructExpr *Node);
+    void VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *Node);
+    void VisitExprWithCleanups(const ExprWithCleanups *Node);
+    void VisitUnresolvedLookupExpr(const UnresolvedLookupExpr *Node);
+    void dumpCXXTemporary(const CXXTemporary *Temporary);
+    void VisitCXXTypeidExpr(const CXXTypeidExpr* Node);
+    void VisitCXXDynamicCastExpr(const CXXDynamicCastExpr* Node);
+
+    // ObjC
+    void VisitObjCAtCatchStmt(const ObjCAtCatchStmt *Node);
+    void VisitObjCEncodeExpr(const ObjCEncodeExpr *Node);
+    void VisitObjCMessageExpr(const ObjCMessageExpr *Node);
+    void VisitObjCBoxedExpr(const ObjCBoxedExpr *Node);
+    void VisitObjCSelectorExpr(const ObjCSelectorExpr *Node);
+    void VisitObjCProtocolExpr(const ObjCProtocolExpr *Node);
+    void VisitObjCPropertyRefExpr(const ObjCPropertyRefExpr *Node);
+    void VisitObjCSubscriptRefExpr(const ObjCSubscriptRefExpr *Node);
+    void VisitObjCIvarRefExpr(const ObjCIvarRefExpr *Node);
+    void VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *Node);
+
+    // Comments.
+    const char *getCommandName(unsigned CommandID);
+    void dumpComment(const Comment *C);
+
+    // Inline comments.
+    void visitTextComment(const TextComment *C);
+    void visitInlineCommandComment(const InlineCommandComment *C);
+    void visitHTMLStartTagComment(const HTMLStartTagComment *C);
+    void visitHTMLEndTagComment(const HTMLEndTagComment *C);
+
+    // Block comments.
+    void visitBlockCommandComment(const BlockCommandComment *C);
+    void visitParamCommandComment(const ParamCommandComment *C);
+    void visitTParamCommandComment(const TParamCommandComment *C);
+    void visitVerbatimBlockComment(const VerbatimBlockComment *C);
+    void visitVerbatimBlockLineComment(const VerbatimBlockLineComment *C);
+    void visitVerbatimLineComment(const VerbatimLineComment *C);
+  };
+}
+void StmtResInfer::dumpPointer(const void *Ptr) {
+}
+
+void StmtResInfer::dumpLocation(SourceLocation Loc) {
+}
+
+void StmtResInfer::dumpSourceRange(SourceRange R) {
+}
+
+void StmtResInfer::dumpBareType(QualType T) {
+}
+
+void StmtResInfer::dumpType(QualType T) {
+  dumpBareType(T);
+}
+
+void StmtResInfer::dumpBareDeclRef(const Decl *D) {
+  {
+    // C++AMP
+    if(D->getKind() == Decl::Function){
+      if(TheSema.getLangOpts().DevicePath && (CppAMPSpec & CPPAMP_AMP) &&
+        !D->hasAttr<CXXAMPRestrictAMPAttr>())
+        ClearResAMP();
+      if(!TheSema.getLangOpts().DevicePath && (CppAMPSpec & CPPAMP_CPU) &&
+        D->hasAttr<CXXAMPRestrictAMPAttr>() && !D->hasAttr<CXXAMPRestrictCPUAttr>())
+        ClearResCPU();
+    }
+  }
+
+  if (const ValueDecl *VD = dyn_cast<ValueDecl>(D))
+    dumpType(VD->getType());
+}
+
+void StmtResInfer::dumpDeclRef(const Decl *D, const char *Label) {
+  if (!D)
+    return;
+
+  dumpBareDeclRef(D);
+}
+
+void StmtResInfer::dumpName(const NamedDecl *ND) {
+}
+
+bool StmtResInfer::hasNodes(const DeclContext *DC) {
+  if (!DC)
+    return false;
+
+  return DC->decls_begin() != DC->decls_end();
+}
+
+void StmtResInfer::dumpDeclContext(const DeclContext *DC) {
+  if (!DC)
+    return;
+  for (DeclContext::decl_iterator I = DC->decls_begin(), E = DC->decls_end();
+       I != E; ++I) {
+    dumpDecl(*I);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Decl dumping methods.
+//===----------------------------------------------------------------------===//
+
+void StmtResInfer::dumpDecl(const Decl *D) {
+  if (!D) {
+    return;
+  }
+
+  // Decls within functions are visited by the body
+  bool HasDeclContext = !isa<FunctionDecl>(*D) && !isa<ObjCMethodDecl>(*D) &&
+                         hasNodes(dyn_cast<DeclContext>(D));
+
+  ConstDeclVisitor<StmtResInfer>::Visit(D);
+  if (HasDeclContext)
+    dumpDeclContext(cast<DeclContext>(D));
+}
+
+void StmtResInfer::VisitLabelDecl(const LabelDecl *D) {
+}
+
+void StmtResInfer::VisitTypedefDecl(const TypedefDecl *D) {
+  dumpType(D->getUnderlyingType());
+}
+
+void StmtResInfer::VisitEnumDecl(const EnumDecl *D) {
+  if (D->isFixed())
+    dumpType(D->getIntegerType());
+}
+
+void StmtResInfer::VisitRecordDecl(const RecordDecl *D) {
+}
+
+void StmtResInfer::VisitEnumConstantDecl(const EnumConstantDecl *D) {
+  dumpType(D->getType());
+  if (const Expr *Init = D->getInitExpr()) {
+    dumpStmt(Init);
+  }
+}
+
+void StmtResInfer::VisitIndirectFieldDecl(const IndirectFieldDecl *D) {
+  dumpType(D->getType());
+  for (IndirectFieldDecl::chain_iterator I = D->chain_begin(),
+                                         E = D->chain_end();
+       I != E; ++I) {
+    if (I + 1 == E)
+      dumpDeclRef(*I);
+  }
+}
+
+void StmtResInfer::VisitFunctionDecl(const FunctionDecl *D) {
+}
+
+void StmtResInfer::VisitFieldDecl(const FieldDecl *D) {
+  dumpName(D);
+  dumpType(D->getType());
+  // Reused in BuildMemInitializer for err_amp_unsupported_reference_or_pointer
+  const Type* Ty  = D->getType().getTypePtrOrNull();
+  QualType TheType = D->getType();
+
+  if(Ty) {
+    // Case by case
+    if(Ty->isPointerType())
+      TheType = Ty->getPointeeType();
+    if(Ty->isArrayType())
+      TheType = dyn_cast<ArrayType>(Ty)->getElementType();
+    if(!TheType.isNull() && TheType->isRecordType()) {
+      CXXRecordDecl* RDecl = TheType->getAsCXXRecordDecl();
+      if (RDecl && RDecl->getName() == "array")
+        ClearResAMP();
+    }
+  }
+  // Check if it is an array_view reference or pointer
+  if(Ty && (Ty->isPointerType() ||Ty->isReferenceType())) {
+    const Type* TargetTy = Ty->getPointeeType().getTypePtrOrNull();
+    if(const TemplateSpecializationType* TST = TargetTy ? TargetTy->getAs<TemplateSpecializationType>() : nullptr) {
+      // Check if it is a TemplateSpecializationType
+      // FIXME: should consider alias Template
+      // Get its underlying template decl*
+      if(ClassTemplateDecl* CTDecl = dyn_cast_or_null<ClassTemplateDecl>(
+        TST->getTemplateName().getAsTemplateDecl())) {
+        if(CXXRecordDecl* RDecl = CTDecl->getTemplatedDecl())
+          if(RDecl->getName() == "array_view")
+            ClearResAMP();
+      }
+    }
+  }
+  
+  bool IsBitField = D->isBitField();
+  Expr *Init = D->getInClassInitializer();
+  bool HasInit = Init;
+
+  if (IsBitField) {
+    dumpStmt(D->getBitWidth());
+  }
+  if (HasInit) {
+    dumpStmt(Init);
+  }
+}
+
+void StmtResInfer::VisitVarDecl(const VarDecl *D) {
+  if(TheSema.IsIncompatibleType(D->getType().getTypePtrOrNull(), false, true)) {
+    ClearResAMP();
+    return;
+  }
+  
+  if(D->getType().isVolatileQualified())
+    ClearResAMP();
+
+  if(D->getType()->isCharType() || D->getType()->isWideCharType() || 
+    D->getType()->isSpecificBuiltinType(BuiltinType::Short) || 
+    D->getType()->isSpecificBuiltinType(BuiltinType::LongLong) || 
+    D->getType()->isSpecificBuiltinType(BuiltinType::LongDouble))
+   ClearResAMP();
+
+  // var's type
+  dumpType(D->getType());
+
+  // TODO: Should infer if it is static
+#if 0
+  StorageClass SC = D->getStorageClass();
+#endif
+
+  if (D->hasInit()) {
+    dumpStmt(D->getInit());
+  }
+}
+
+void StmtResInfer::VisitFileScopeAsmDecl(const FileScopeAsmDecl *D) {
+  dumpStmt(D->getAsmString());
+}
+
+void StmtResInfer::VisitImportDecl(const ImportDecl *D) {
+}
+
+//===----------------------------------------------------------------------===//
+// C++ Declarations
+//===----------------------------------------------------------------------===//
+
+void StmtResInfer::VisitNamespaceDecl(const NamespaceDecl *D) {
+  if (!D->isOriginalNamespace())
+    dumpDeclRef(D->getOriginalNamespace(), "original");
+}
+
+void StmtResInfer::VisitUsingDirectiveDecl(const UsingDirectiveDecl *D) {
+  dumpBareDeclRef(D->getNominatedNamespace());
+}
+
+void StmtResInfer::VisitNamespaceAliasDecl(const NamespaceAliasDecl *D) {
+  dumpDeclRef(D->getAliasedNamespace());
+}
+
+void StmtResInfer::VisitTypeAliasDecl(const TypeAliasDecl *D) {
+  dumpType(D->getUnderlyingType());
+}
+
+void StmtResInfer::VisitTypeAliasTemplateDecl(const TypeAliasTemplateDecl *D) {
+  // TODO
+  #if 0
+  dumpTemplateParameters(D->getTemplateParameters());
+  #endif
+  dumpDecl(D->getTemplatedDecl());
+}
+
+void StmtResInfer::VisitCXXRecordDecl(const CXXRecordDecl *D) {
+  VisitRecordDecl(D);
+  if (!D->isCompleteDefinition())
+    return;
+
+  for (CXXRecordDecl::base_class_const_iterator I = D->bases_begin(),
+                                                E = D->bases_end();
+       I != E; ++I) {
+    dumpType(I->getType());
+  }
+}
+
+void StmtResInfer::VisitStaticAssertDecl(const StaticAssertDecl *D) {
+  dumpStmt(D->getAssertExpr());
+  dumpStmt(D->getMessage());
+}
+
+void StmtResInfer::VisitFunctionTemplateDecl(const FunctionTemplateDecl *D) {
+}
+
+void StmtResInfer::VisitClassTemplateDecl(const ClassTemplateDecl *D) {
+}
+
+void StmtResInfer::VisitClassTemplateSpecializationDecl(
+    const ClassTemplateSpecializationDecl *D) {
+  VisitCXXRecordDecl(D);
+  // TODO
+  #if 0
+  dumpTemplateArgumentList(D->getTemplateArgs());
+  #endif
+}
+
+void StmtResInfer::VisitClassTemplatePartialSpecializationDecl(
+    const ClassTemplatePartialSpecializationDecl *D) {
+  VisitClassTemplateSpecializationDecl(D);
+  // TODO
+  #if 0
+  dumpTemplateParameters(D->getTemplateParameters());
+  #endif
+}
+
+void StmtResInfer::VisitClassScopeFunctionSpecializationDecl(
+    const ClassScopeFunctionSpecializationDecl *D) {
+  dumpDeclRef(D->getSpecialization());
+  // TODO
+  #if 0
+  if (D->hasExplicitTemplateArgs())
+    dumpTemplateArgumentListInfo(D->templateArgs());
+  #endif
+}
+
+void StmtResInfer::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *D) {
+  if (D->hasDefaultArgument())
+    dumpType(D->getDefaultArgument());
+}
+
+void StmtResInfer::VisitNonTypeTemplateParmDecl(const NonTypeTemplateParmDecl *D) {
+  dumpType(D->getType());
+  if (D->hasDefaultArgument())
+    dumpStmt(D->getDefaultArgument());
+}
+
+void StmtResInfer::VisitTemplateTemplateParmDecl(
+    const TemplateTemplateParmDecl *D) {
+  // TODO
+  #if 0
+  dumpTemplateParameters(D->getTemplateParameters());
+  if (D->hasDefaultArgument())
+    dumpTemplateArgumentLoc(D->getDefaultArgument());
+  #endif
+}
+
+void StmtResInfer::VisitUsingDecl(const UsingDecl *D) {
+}
+
+void StmtResInfer::VisitUnresolvedUsingTypenameDecl(
+    const UnresolvedUsingTypenameDecl *D) {
+}
+
+void StmtResInfer::VisitUnresolvedUsingValueDecl(const UnresolvedUsingValueDecl *D) {
+  dumpType(D->getType());
+}
+
+void StmtResInfer::VisitUsingShadowDecl(const UsingShadowDecl *D) {
+  dumpBareDeclRef(D->getTargetDecl());
+}
+
+void StmtResInfer::VisitLinkageSpecDecl(const LinkageSpecDecl *D) {
+}
+
+void StmtResInfer::VisitAccessSpecDecl(const AccessSpecDecl *D) {
+}
+
+void StmtResInfer::VisitFriendDecl(const FriendDecl *D) {
+  if (TypeSourceInfo *T = D->getFriendType())
+    dumpType(T->getType());
+  else
+    dumpDecl(D->getFriendDecl());
+}
+
+//===----------------------------------------------------------------------===//
+// Obj-C Declarations
+//===----------------------------------------------------------------------===//
+
+void StmtResInfer::VisitObjCIvarDecl(const ObjCIvarDecl *D) {
+  dumpType(D->getType());
+}
+
+void StmtResInfer::VisitObjCMethodDecl(const ObjCMethodDecl *D) {
+  dumpType(D->getReturnType());
+
+  bool HasBody = D->hasBody();
+
+  if (D->isThisDeclarationADefinition()) {
+    dumpDeclContext(D);
+  } else {
+    for (ObjCMethodDecl::param_const_iterator I = D->param_begin(),
+                                              E = D->param_end();
+         I != E; ++I) {
+      if (I + 1 == E)
+        dumpDecl(*I);
+    }
+  }
+
+  if (HasBody) {
+    dumpStmt(D->getBody());
+  }
+}
+
+void StmtResInfer::VisitObjCCategoryDecl(const ObjCCategoryDecl *D) {
+  dumpDeclRef(D->getClassInterface());
+  if (D->protocol_begin() == D->protocol_end())
+    dumpDeclRef(D->getImplementation());
+  for (ObjCCategoryDecl::protocol_iterator I = D->protocol_begin(),
+                                           E = D->protocol_end();
+       I != E; ++I) {
+    if (I + 1 == E)
+      dumpDeclRef(*I);
+  }
+}
+
+void StmtResInfer::VisitObjCCategoryImplDecl(const ObjCCategoryImplDecl *D) {
+  dumpDeclRef(D->getClassInterface());
+  dumpDeclRef(D->getCategoryDecl());
+}
+
+void StmtResInfer::VisitObjCProtocolDecl(const ObjCProtocolDecl *D) {
+  for (ObjCProtocolDecl::protocol_iterator I = D->protocol_begin(),
+                                           E = D->protocol_end();
+       I != E; ++I) {
+    if (I + 1 == E)
+      dumpDeclRef(*I);
+  }
+}
+
+void StmtResInfer::VisitObjCInterfaceDecl(const ObjCInterfaceDecl *D) {
+  dumpDeclRef(D->getSuperClass(), "super");
+  if (D->protocol_begin() == D->protocol_end())
+    dumpDeclRef(D->getImplementation());
+  for (ObjCInterfaceDecl::protocol_iterator I = D->protocol_begin(),
+                                            E = D->protocol_end();
+       I != E; ++I) {
+    if (I + 1 == E)
+      dumpDeclRef(*I);
+  }
+}
+
+void StmtResInfer::VisitObjCImplementationDecl(const ObjCImplementationDecl *D) {
+  dumpDeclRef(D->getSuperClass(), "super");
+  dumpDeclRef(D->getClassInterface());
+  for (ObjCImplementationDecl::init_const_iterator I = D->init_begin(),
+                                                   E = D->init_end();
+       I != E; ++I) {
+  }
+}
+
+void StmtResInfer::VisitObjCCompatibleAliasDecl(const ObjCCompatibleAliasDecl *D) {
+  dumpDeclRef(D->getClassInterface());
+}
+
+void StmtResInfer::VisitObjCPropertyDecl(const ObjCPropertyDecl *D) {
+  dumpType(D->getType());
+
+  ObjCPropertyDecl::PropertyAttributeKind Attrs = D->getPropertyAttributes();
+  if (Attrs != ObjCPropertyDecl::OBJC_PR_noattr) {
+    if (Attrs & ObjCPropertyDecl::OBJC_PR_getter) {
+      if (!(Attrs & ObjCPropertyDecl::OBJC_PR_setter))
+        dumpDeclRef(D->getGetterMethodDecl(), "getter");
+    }
+    if (Attrs & ObjCPropertyDecl::OBJC_PR_setter) {
+      dumpDeclRef(D->getSetterMethodDecl(), "setter");
+    }
+  }
+}
+
+void StmtResInfer::VisitObjCPropertyImplDecl(const ObjCPropertyImplDecl *D) {
+  dumpDeclRef(D->getPropertyDecl());
+  dumpDeclRef(D->getPropertyIvarDecl());
+}
+
+void StmtResInfer::VisitBlockDecl(const BlockDecl *D) {
+  for (BlockDecl::param_const_iterator I = D->param_begin(), E = D->param_end();
+       I != E; ++I)
+    dumpDecl(*I);
+
+  if (D->isVariadic()) {
+  }
+
+  if (D->capturesCXXThis()) {
+  }
+  for (BlockDecl::capture_const_iterator I = D->capture_begin(), E = D->capture_end();
+       I != E; ++I) {
+    if (I->getVariable()) {
+      dumpBareDeclRef(I->getVariable());
+    }
+    if (I->hasCopyExpr())
+      dumpStmt(I->getCopyExpr());
+  }
+  dumpStmt(D->getBody());
+}
+
+//===----------------------------------------------------------------------===//
+//  Stmt dumping methods.
+//===----------------------------------------------------------------------===//
+
+void StmtResInfer::dumpStmt(const Stmt *S) {
+  if (!S) {
+    return;
+  }
+
+  if (const DeclStmt *DS = dyn_cast<DeclStmt>(S)) {
+    VisitDeclStmt(DS);
+    return;
+  }
+
+  ConstStmtVisitor<StmtResInfer>::Visit(S);
+  for (Stmt::const_child_iterator CI = S->child_begin(); CI != S->child_end(); ++CI) {
+    dumpStmt(*CI);
+  }
+}
+
+// Perform the inference
+unsigned StmtResInfer::Infer(const Stmt* Node) {
+  dumpStmt(Node);
+  return CppAMPSpec;
+}
+void StmtResInfer::VisitStmt(const Stmt *Node) {
+}
+
+void StmtResInfer::VisitDeclStmt(const DeclStmt *Node) {
+  VisitStmt(Node);
+  for (DeclStmt::const_decl_iterator I = Node->decl_begin(),
+                                     E = Node->decl_end();
+       I != E; ++I) {
+    dumpDecl(*I);
+  }
+}
+
+void StmtResInfer::VisitAttributedStmt(const AttributedStmt *Node) {
+  VisitStmt(Node);
+}
+
+void StmtResInfer::VisitLabelStmt(const LabelStmt *Node) {
+  VisitStmt(Node);
+
+  // A label statement is not valid in C++AMP,
+  // but is valid in HSA extension mode.
+  if(!TheSema.getLangOpts().HSAExtension) {
+    ClearResAMP();
+  }
+}
+
+void StmtResInfer::VisitGotoStmt(const GotoStmt *Node) {
+  VisitStmt(Node);
+
+  // A goto statement is not valid in C++AMP,
+  // but is valid in HSA extension mode.
+  if(!TheSema.getLangOpts().HSAExtension) {
+    ClearResAMP();
+  }
+}
+void StmtResInfer::VisitCXXTryStmt(const CXXTryStmt* Node) {
+  VisitStmt(Node);
+  ClearResAMP();
+}
+void StmtResInfer::VisitCXXTypeidExpr(const CXXTypeidExpr* Node) {
+  VisitStmt(Node);
+  ClearResAMP();
+}
+void StmtResInfer::VisitCXXDynamicCastExpr(const CXXDynamicCastExpr* Node) {
+  VisitStmt(Node);
+  ClearResAMP();
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Expr dumping methods.
+//===----------------------------------------------------------------------===//
+
+void StmtResInfer::VisitExpr(const Expr *Node) {
+  VisitStmt(Node);
+  dumpType(Node->getType());
+}
+
+void StmtResInfer::VisitCastExpr(const CastExpr *Node) {
+  VisitExpr(Node);
+  //TODO: infer if any
+}
+
+void StmtResInfer::VisitDeclRefExpr(const DeclRefExpr *Node) {
+  //Format: DeclRefExpr 0x3eca4e8 <col:10> 'int (void)' lvalue
+  VisitExpr(Node);
+  dumpBareDeclRef(Node->getDecl());
+  if (Node->getDecl() != Node->getFoundDecl()) {
+    dumpBareDeclRef(Node->getFoundDecl());
+  }
+}
+
+void StmtResInfer::VisitUnresolvedLookupExpr(const UnresolvedLookupExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitObjCIvarRefExpr(const ObjCIvarRefExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitPredefinedExpr(const PredefinedExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitCharacterLiteral(const CharacterLiteral *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitIntegerLiteral(const IntegerLiteral *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitFloatingLiteral(const FloatingLiteral *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitStringLiteral(const StringLiteral *Str) {
+  VisitExpr(Str);
+}
+
+void StmtResInfer::VisitUnaryOperator(const UnaryOperator *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitUnaryExprOrTypeTraitExpr(
+    const UnaryExprOrTypeTraitExpr *Node) {
+  VisitExpr(Node);
+  if (Node->isArgumentType())
+    dumpType(Node->getArgumentType());
+}
+
+void StmtResInfer::VisitMemberExpr(const MemberExpr *Node) {
+  VisitExpr(Node);
+  ValueDecl* VD = Node->getMemberDecl();
+  if((CppAMPSpec & CPPAMP_AMP) && !VD->hasAttr<CXXAMPRestrictAMPAttr>())
+    ClearResAMP();
+  if((CppAMPSpec & CPPAMP_CPU) && VD->hasAttr<CXXAMPRestrictAMPAttr>() &&
+    !VD->hasAttr<CXXAMPRestrictCPUAttr>())
+    ClearResCPU();
+}
+
+void StmtResInfer::VisitExtVectorElementExpr(const ExtVectorElementExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitBinaryOperator(const BinaryOperator *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitCompoundAssignOperator(
+    const CompoundAssignOperator *Node) {
+  VisitExpr(Node);
+  dumpBareType(Node->getComputationLHSType());
+  dumpBareType(Node->getComputationResultType());
+}
+
+void StmtResInfer::VisitBlockExpr(const BlockExpr *Node) {
+  VisitExpr(Node);
+  dumpDecl(Node->getBlockDecl());
+}
+
+void StmtResInfer::VisitOpaqueValueExpr(const OpaqueValueExpr *Node) {
+  VisitExpr(Node);
+  if (Expr *Source = Node->getSourceExpr()) {
+    dumpStmt(Source);
+  }
+}
+
+// GNU extensions.
+
+void StmtResInfer::VisitAddrLabelExpr(const AddrLabelExpr *Node) {
+  VisitExpr(Node);
+}
+
+//===----------------------------------------------------------------------===//
+// C++ Expressions
+//===----------------------------------------------------------------------===//
+
+void StmtResInfer::VisitCXXNamedCastExpr(const CXXNamedCastExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitCXXThisExpr(const CXXThisExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitCXXThrowExpr(const CXXThrowExpr *Node) {
+  VisitExpr(Node);
+  ClearResAMP();
+}
+
+void StmtResInfer::VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitCXXConstructExpr(const CXXConstructExpr *Node) {
+  VisitExpr(Node);
+  // TODO: infer if any
+#if 0
+  CXXConstructorDecl *Ctor = Node->getConstructor();
+#endif
+}
+
+void StmtResInfer::VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *Node) {
+  VisitExpr(Node);
+  dumpCXXTemporary(Node->getTemporary());
+}
+
+void StmtResInfer::VisitExprWithCleanups(const ExprWithCleanups *Node) {
+  VisitExpr(Node);
+  for (unsigned i = 0, e = Node->getNumObjects(); i != e; ++i)
+    dumpDeclRef(Node->getObject(i), "cleanup");
+}
+
+void StmtResInfer::dumpCXXTemporary(const CXXTemporary *Temporary) {
+}
+
+//===----------------------------------------------------------------------===//
+// Obj-C Expressions
+//===----------------------------------------------------------------------===//
+
+void StmtResInfer::VisitObjCMessageExpr(const ObjCMessageExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitObjCBoxedExpr(const ObjCBoxedExpr *Node) {
+  VisitExpr(Node);
+}
+void StmtResInfer::VisitObjCAtCatchStmt(const ObjCAtCatchStmt *Node) {
+  VisitStmt(Node);
+}
+
+void StmtResInfer::VisitObjCEncodeExpr(const ObjCEncodeExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitObjCSelectorExpr(const ObjCSelectorExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitObjCProtocolExpr(const ObjCProtocolExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitObjCPropertyRefExpr(const ObjCPropertyRefExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitObjCSubscriptRefExpr(const ObjCSubscriptRefExpr *Node) {
+  VisitExpr(Node);
+}
+
+void StmtResInfer::VisitObjCBoolLiteralExpr(const ObjCBoolLiteralExpr *Node) {
+  VisitExpr(Node);
+}
+
+static void InferFunctionType(FunctionDecl* FD, unsigned& Spec) {
+  // Check the function return type
+  {
+    QualType ResultType = FD->getReturnType();
+    if(ResultType->isPointerType())
+      ResultType = ResultType->getPointeeType();
+    // double* is allowed, while double** is not allowed
+    if(ResultType->isPointerType()) {
+      Spec &=~CPPAMP_AMP;
+      return;
+    }
+  }
+  // Check if there are incompatible parameters in the function declarator
+  for (FunctionDecl::param_iterator PIt = FD->param_begin();
+    PIt != FD->param_end(); ++PIt) {
+    ParmVarDecl *pvDecl = (*PIt);
+    if(!pvDecl)
+      continue;
+
+    QualType Ty = pvDecl->getOriginalType();
+    if (Ty->isCharType() || Ty->isWideCharType() || Ty->isSpecificBuiltinType(BuiltinType::Short) || 
+      Ty->isSpecificBuiltinType(BuiltinType::LongLong) ||
+      Ty->isSpecificBuiltinType(BuiltinType::LongDouble) || Ty.isVolatileQualified()) {
+      Spec &=~CPPAMP_AMP;
+      return;
+    }
+
+    if (Ty->isEnumeralType()) {
+      const EnumType* ETy = dyn_cast<EnumType>(Ty);
+      if (ETy && ETy->getDecl()) {
+        const Type* UTy = ETy->getDecl()->getIntegerType().getTypePtrOrNull();
+        if (UTy && (UTy->isCharType() || UTy->isWideCharType() ||
+            UTy->isSpecificBuiltinType(BuiltinType::Short) ||
+            UTy->isSpecificBuiltinType(BuiltinType::LongLong) ||
+            UTy->isSpecificBuiltinType(BuiltinType::LongDouble))) {
+           Spec &=~CPPAMP_AMP;
+           return;
+        }
+      }
+    }
+
+    // Pointer to pointer
+    QualType TheType = Ty;
+    if(Ty->isPointerType())
+      TheType = Ty->getPointeeType();
+    // double* is allowed, while double** is not allowed
+    if(TheType->isPointerType()) {
+      Spec &=~CPPAMP_AMP;
+      return;
+    }
+  }
+
+  QualType ResultType = FD->getReturnType();
+  // Check if the return type is an incompatible type
+  if (ResultType->isCharType() || ResultType->isSpecificBuiltinType(BuiltinType::Short)) {
+    Spec &=~CPPAMP_AMP;
+    return;
+  }
+
+  if(FD->getType().isVolatileQualified())
+    Spec &= ~CPPAMP_AMP;
+
+  return;
+}
+
+// FIXME: Once all statements of the declaration have been processed, the
+// restriction inference can be performed. This is only allowed on an
+// auto-restricted declaration. Top-down.
+void Sema::TryCXXAMPRestrictionInferring(Decl *dcl, Stmt *S) {
+  if (!getLangOpts().CPlusPlusAMP || !dcl || !dcl->hasAttr<CXXAMPRestrictAUTOAttr>())
+    return;
+  
+  // Only allowed on a function definition
+  assert(isa<FunctionDecl>(*dcl) && dcl->hasBody());
+
+  unsigned OtherSpec = CPPAMP_AMP | CPPAMP_CPU;
+  if(dcl->hasAttr<CXXAMPRestrictAMPAttr>())
+    OtherSpec &= ~CPPAMP_AMP;
+  if(dcl->hasAttr<CXXAMPRestrictCPUAttr>())
+    OtherSpec &= ~CPPAMP_CPU;
+
+  // Inferring process
+  // skip method in a lambda class (ex: kernel function in parallel_for_each)
+  if (isa<CXXMethodDecl>(dcl) && cast<CXXMethodDecl>(dcl)->getParent()->isLambda()) {
+  } else if(OtherSpec & CPPAMP_AMP) {
+    // Assuming that 'auto' has been already inferred in parent scope if any
+    // Contained in any CPU only caller?
+    if(!IsInAMPRestricted() && dcl->getParentFunctionOrMethod())
+      OtherSpec &= ~CPPAMP_AMP;
+    else if(FunctionDecl* FD = dyn_cast<FunctionDecl>(dcl))
+      InferFunctionType(FD, OtherSpec);
+  }
+  
+  if(OtherSpec) {
+    StmtResInfer SRI(*this, OtherSpec, &this->getSourceManager());
+    OtherSpec = SRI.Infer(S);
+  }
+
+  // Update non-auto restriction specifiers if any
+  if(OtherSpec) {
+
+    // Place all manually created attributes where 'auto' physically is
+    CXXAMPRestrictAUTOAttr *AUTOAttr = dcl->getAttr<CXXAMPRestrictAUTOAttr>();
+    assert(AUTOAttr);
+    if(OtherSpec & CPPAMP_AMP)
+      dcl->addAttr(::new (Context) CXXAMPRestrictAMPAttr(AUTOAttr->getRange(), Context, 0));
+    if(OtherSpec & CPPAMP_CPU)
+      dcl->addAttr(::new (Context) CXXAMPRestrictCPUAttr(AUTOAttr->getRange(), Context, 0));
+  }
+
+  // The inference is done; drop the AUTO attribute in this compilation path.
+  dcl->dropAttr<CXXAMPRestrictAUTOAttr>();
+
+}
+
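
To summarize the flow: TryCXXAMPRestrictionInferring seeds the candidate set with cpu|amp minus whatever is already spelled explicitly, prunes it through InferFunctionType and the statement walk above, and attaches the surviving restrictions where the auto specifier was written. A hedged example of the observable effect, assuming C++AMP restrict(auto) syntax:

```cpp
// Nothing in the body clears the AMP bit, so f is inferred as
// restrict(cpu, amp).
int f(int x) restrict(auto) { return x + 1; }

// VisitCXXThrowExpr clears the AMP bit, so g is inferred as restrict(cpu)
// only.
int g(int x) restrict(auto) {
  if (x < 0) throw x;
  return x;
}
```
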
diff --git a/test/CodeGen/address-safety-attr-kasan-hwasan.cpp b/test/CodeGen/address-safety-attr-kasan-hwasan.cpp
new file mode 100755
index 0000000000..7a84b798e4
--- /dev/null
+++ b/test/CodeGen/address-safety-attr-kasan-hwasan.cpp
@@ -0,0 +1,53 @@
+// Make sure the sanitize_address attribute is emitted when using both ASan and KASan.
+// Also document that __attribute__((no_sanitize_address)) doesn't disable KASan instrumentation.
+
+/// RUN: %clang_cc1 -triple i386-unknown-linux -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NOASAN %s
+/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-ASAN %s
+/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=kernel-address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-KASAN %s
+/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=hwaddress -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-HWASAN %s
+
+int HasSanitizeAddress() {
+  return 1;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: Function Attrs: noinline nounwind sanitize_address
+// CHECK-KASAN: Function Attrs: noinline nounwind sanitize_address
+// CHECK-HWASAN: Function Attrs: noinline nounwind sanitize_hwaddress
+
+__attribute__((no_sanitize("address")))
+int NoSanitizeQuoteAddress() {
+  return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+
+__attribute__((no_sanitize_address))
+int NoSanitizeAddress() {
+  return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+
+__attribute__((no_sanitize("kernel-address")))
+int NoSanitizeKernelAddress() {
+  return 0;
+}
+
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind sanitize_hwaddress$}}
+
+__attribute__((no_sanitize("hwaddress")))
+int NoSanitizeHWAddress() {
+  return 0;
+}
+
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-HWASAN: {{Function Attrs: noinline nounwind$}}
diff --git a/test/CodeGen/address-safety-attr-kasan.cpp b/test/CodeGen/address-safety-attr-kasan.cpp
new file mode 100755
index 0000000000..603134db69
--- /dev/null
+++ b/test/CodeGen/address-safety-attr-kasan.cpp
@@ -0,0 +1,38 @@
+// Make sure the sanitize_address attribute is emitted when using both ASan and KASan.
+// Also document that __attribute__((no_sanitize_address)) doesn't disable KASan instrumentation.
+
+/// RUN: %clang_cc1 -triple i386-unknown-linux -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NOASAN %s
+/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-ASAN %s
+/// RUN: %clang_cc1 -triple i386-unknown-linux -fsanitize=kernel-address -disable-O0-optnone -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-KASAN %s
+
+int HasSanitizeAddress() {
+  return 1;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: Function Attrs: noinline nounwind sanitize_address
+// CHECK-KASAN: Function Attrs: noinline nounwind sanitize_address
+
+__attribute__((no_sanitize("address")))
+int NoSanitizeQuoteAddress() {
+  return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+
+__attribute__((no_sanitize_address))
+int NoSanitizeAddress() {
+  return 0;
+}
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+
+__attribute__((no_sanitize("kernel-address")))
+int NoSanitizeKernelAddress() {
+  return 0;
+}
+
+// CHECK-NOASAN: {{Function Attrs: noinline nounwind$}}
+// CHECK-ASAN: {{Function Attrs: noinline nounwind sanitize_address$}}
+// CHECK-KASAN: {{Function Attrs: noinline nounwind$}}
diff --git a/test/CodeGen/amdgcn-auto-var.cpp b/test/CodeGen/amdgcn-auto-var.cpp
new file mode 100755
index 0000000000..deb6738e23
--- /dev/null
+++ b/test/CodeGen/amdgcn-auto-var.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -O0 -std=c++11 -emit-llvm -o - -triple amdgcn %s | FileCheck %s
+
+// CHECK: %struct.ATy = type { i32* }
+struct ATy {
+  int *p;
+};
+
+// CHECK-LABEL: @_Z1fPi(i32* %a)
+void f(int* a) {
+  // CHECK: %[[a_addr:.*]] = alloca i32*
+  // CHECK: %[[b:.*]] = alloca i32
+  // CHECK: %[[A:.*]] = alloca %struct.ATy, align 8
+
+  // CHECK:  store i32* %a, i32** %[[a_addr]]
+
+  // CHECK:  store i32 1, i32* %[[b]]
+  int b = 1;
+
+  // CHECK: %[[p:.*]] = getelementptr inbounds %struct.ATy, %struct.ATy* %[[A]], i32 0, i32 0
+  // CHECK: store i32* %[[b]], i32** %[[p]], align 8
+  ATy A{&b};
+
+  // CHECK: %[[r0:.*]] = load i32, i32* %b
+  // CHECK: %[[r1:.*]] = load i32*, i32** %[[a_addr]]
+  // CHECK: store i32 %[[r0]], i32* %[[r1]]
+  *a = b;
+
+  // CHECK: store i32* %[[b]], i32** %[[a_addr]], align 8
+  a = &b;
+}
diff --git a/test/CodeGen/tbaa-struct.cpp b/test/CodeGen/tbaa-struct.cpp
index 670584e3f8..a883a06e08 100644
--- a/test/CodeGen/tbaa-struct.cpp
+++ b/test/CodeGen/tbaa-struct.cpp
@@ -55,6 +55,7 @@ void copy3(U *u1, U *u2) {
   *u1 = *u2;
 }
 
+
 // Make sure that zero-length bitfield works.
 struct C {
   char a;
diff --git a/test/CodeGenCUDA/address-spaces.cu b/test/CodeGenCUDA/address-spaces.cu
index 58b0a43707..abda34be07 100644
--- a/test/CodeGenCUDA/address-spaces.cu
+++ b/test/CodeGenCUDA/address-spaces.cu
@@ -59,7 +59,7 @@ __device__ void func1() {
   __shared__ float a;
   callee(&a); // implicit cast from parameters
 }
-// CHECK: define void @_Z5func1v()
+// CHECK-LABEL: define void @_Z5func1v()
 // CHECK: call void @_Z6calleePf(float* addrspacecast (float addrspace(3)* @_ZZ5func1vE1a to float*))
 
 __device__ void func2() {
@@ -91,5 +91,5 @@ __shared__ float b;
 __device__ float *func5() {
   return &b; // implicit cast from a return value
 }
-// CHECK: define float* @_Z5func5v()
+// CHECK-LABEL: define float* @_Z5func5v()
 // CHECK: ret float* addrspacecast (float addrspace(3)* @b to float*)
diff --git a/test/CodeGenCUDA/convergent.cu b/test/CodeGenCUDA/convergent.cu
index dd410cc644..de9460c607 100644
--- a/test/CodeGenCUDA/convergent.cu
+++ b/test/CodeGenCUDA/convergent.cu
@@ -25,9 +25,11 @@ __host__ __device__ void baz();
 __host__ __device__ void bar() {
   // DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
   baz();
-  // DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
+  #ifdef NVPTX
+  // NVPTX: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
   int x;
   asm ("trap;" : "=l"(x));
+  #endif
   // DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
   asm volatile ("trap;");
 }
diff --git a/test/CodeGenCUDA/device-vtable.cu b/test/CodeGenCUDA/device-vtable.cu
index a7307dcdaf..d5a76a17f3 100644
--- a/test/CodeGenCUDA/device-vtable.cu
+++ b/test/CodeGenCUDA/device-vtable.cu
@@ -10,6 +10,8 @@
 // RUN:     | FileCheck %s -check-prefix=CHECK-HOST -check-prefix=CHECK-BOTH
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
 // RUN:     | FileCheck %s -check-prefix=CHECK-DEVICE -check-prefix=CHECK-BOTH
+// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -emit-llvm -o - %s \
+// RUN:     | FileCheck %s -check-prefix=CHECK-DEVICE -check-prefix=CHECK-BOTH
 
 #include "Inputs/cuda.h"
 
diff --git a/test/CodeGenCUDA/filter-decl.cu b/test/CodeGenCUDA/filter-decl.cu
index 0f4691f7c8..3144b728e8 100644
--- a/test/CodeGenCUDA/filter-decl.cu
+++ b/test/CodeGenCUDA/filter-decl.cu
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-HOST %s
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm %s -o - -fcuda-is-device | FileCheck -check-prefix=CHECK-DEVICE %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm %s -o - -fcuda-is-device | FileCheck -check-prefixes=CHECK-DEVICE,ITANIUM %s
+// RUN: %clang_cc1 -triple amdgcn -emit-llvm %s -o - -fcuda-is-device | FileCheck -check-prefixes=CHECK-DEVICE,AMDGCN %s
 
 #include "Inputs/cuda.h"
 
diff --git a/test/CodeGenCUDA/function-overload.cu b/test/CodeGenCUDA/function-overload.cu
index c82b2e96f6..1c519aa84e 100644
--- a/test/CodeGenCUDA/function-overload.cu
+++ b/test/CodeGenCUDA/function-overload.cu
@@ -8,6 +8,8 @@
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -emit-llvm -o - %s \
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
 
 #include "Inputs/cuda.h"
 
diff --git a/test/CodeGenCUDA/kernel-args-alignment.cu b/test/CodeGenCUDA/kernel-args-alignment.cu
index 2bfd098a85..bf628711e7 100644
--- a/test/CodeGenCUDA/kernel-args-alignment.cu
+++ b/test/CodeGenCUDA/kernel-args-alignment.cu
@@ -3,10 +3,17 @@
 //
 // RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm \
 // RUN:    -target-sdk-version=8.0 -o - %s \
-// RUN:  | FileCheck -check-prefixes=HOST-OLD,CHECK %s
+// RUN:  | FileCheck -check-prefixes=HOST-OLD,HOST-OLD-NV,CHECK %s
 
 // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
-// RUN:   -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,CHECK %s
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-NV,CHECK %s
+
+// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -x hip \
+// RUN:  -aux-triple amdgcn-amd-amdhsa -emit-llvm -o - %s | \
+// RUN:  FileCheck -check-prefixes=HOST-OLD,HOST-OLD-AMD,CHECK %s
+
+// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple amdgcn-amd-amdhsa \
+// RUN:  -x hip -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-AMD,CHECK %s
 
 #include "Inputs/cuda.h"
 
@@ -22,19 +29,31 @@ struct S {
 
 // Clang should generate a packed LLVM struct for S (denoted by the <>s),
 // otherwise this test isn't interesting.
-// CHECK: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }>
+// HOST: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }>
+// DEVICE: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }>
 
 static_assert(alignof(S) == 8, "Unexpected alignment.");
 
 // HOST-LABEL: @_Z6kernelc1SPi
-// Marshalled kernel args should be:
+// For the NVPTX backend, the marshalled kernel args should be:
 //   1. offset 0, width 1
 //   2. offset 8 (because alignof(S) == 8), width 16
 //   3. offset 24, width 8
-// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
-// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
-// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+// HOST-OLD-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST-OLD-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
+// HOST-OLD-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+// The AMDGPU backend assumes struct-type kernel arguments are passed directly,
+// not byval. It lays out kernel arguments by size and alignment in IR.
+// A packed struct type in IR always has an ABI alignment of 1.
+// For the AMDGPU backend, the marshalled kernel args should be:
+//   1. offset 0, width 1
+//   2. offset 1 (because ABI alignment of S is 1), width 16
+//   3. offset 24, width 8
+// HOST-OLD-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST-OLD-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 16, i64 1)
+// HOST-OLD-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 8, i64 24)
 
 // DEVICE-LABEL: @_Z6kernelc1SPi
-// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval(%struct.S) align 8{{[^,]*}}, i32*
+// DEVICE-NV-SAME: i8{{[^,]*}}, %struct.S* byval(%struct.S) align 8{{[^,]*}}, i32*
+// DEVICE-AMD-SAME: i8{{[^,]*}}, %struct.S{{[^,*]*}}, i32*
 __global__ void kernel(char a, S s, int *b) {}
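
The offsets checked above follow from placing each argument at the next multiple of its alignment: S has alignof 8 on the host path, the packed IR struct has ABI alignment 1 on the AMDGPU path, and the trailing int* needs 8-byte alignment either way. A small self-checking sketch of that arithmetic (not part of the test):

```cpp
// Round off up to the next multiple of align.
constexpr unsigned align_up(unsigned off, unsigned align) {
  return (off + align - 1) / align * align;
}
static_assert(align_up(1, 8) == 8, "NVPTX host: S lands at offset 8 after the char");
static_assert(align_up(8 + 16, 8) == 24, "NVPTX host: int* lands at offset 24");
static_assert(align_up(1, 1) == 1, "AMDGPU: packed S lands at offset 1");
static_assert(align_up(1 + 16, 8) == 24, "AMDGPU: int* lands at offset 24");
```
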
diff --git a/test/CodeGenCUDA/kernel-stub-name.cu b/test/CodeGenCUDA/kernel-stub-name.cu
index 539d7eec1b..a16592602d 100644
--- a/test/CodeGenCUDA/kernel-stub-name.cu
+++ b/test/CodeGenCUDA/kernel-stub-name.cu
@@ -10,7 +10,7 @@ template<class T>
 __global__ void kernelfunc() {}
 
 // CHECK-LABEL: define{{.*}}@_Z8hostfuncv()
-// CHECK: call void @[[STUB:_Z10kernelfuncIiEvv.stub]]()
+// CHECK: call void @[[STUB:__device_stub__Z10kernelfuncIiEvv]]()
 void hostfunc(void) { kernelfunc<int><<<1, 1>>>(); }
 
 // CHECK: define{{.*}}@[[STUB]]
diff --git a/test/CodeGenCUDA/linker-options.cu b/test/CodeGenCUDA/linker-options.cu
new file mode 100644
index 0000000000..4b2e6bdde6
--- /dev/null
+++ b/test/CodeGenCUDA/linker-options.cu
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -emit-llvm -o - -fcuda-is-device -x hip %s | FileCheck %s
+
+// CHECK-NOT: llvm.linker.options
+#pragma comment(lib, "a.so")
diff --git a/test/CodeGenCUDA/llvm-used.cu b/test/CodeGenCUDA/llvm-used.cu
index 44666a91c3..ca8295dab0 100644
--- a/test/CodeGenCUDA/llvm-used.cu
+++ b/test/CodeGenCUDA/llvm-used.cu
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - -fcuda-is-device -triple nvptx64-unknown-unknown | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -fcuda-is-device -triple amdgcn-amd-amdhsa | FileCheck %s
 
 
 // Make sure we emit the proper addrspacecast for llvm.used.  PR22383 exposed an
diff --git a/test/CodeGenCUDA/printf.cu b/test/CodeGenCUDA/printf.cu
index dc3f4ea788..2868be9d5e 100644
--- a/test/CodeGenCUDA/printf.cu
+++ b/test/CodeGenCUDA/printf.cu
@@ -2,7 +2,10 @@
 // REQUIRES: nvptx-registered-target
 
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
-// RUN:   -o - %s | FileCheck %s
+// RUN:   -o - %s | FileCheck -check-prefixes=CHECK,NVPTX %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm \
+// RUN:   -o - %s | FileCheck -check-prefixes=CHECK,AMDGCN %s
 
 #include "Inputs/cuda.h"
 
@@ -10,30 +13,33 @@ extern "C" __device__ int vprintf(const char*, const char*);
 
 // Check a simple call to printf end-to-end.
 // CHECK: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double }
+// CHECK-LABEL: define i32 @_Z11CheckSimplev()
 __device__ int CheckSimple() {
   // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]]
   // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
   const char* fmt = "%d %lld %f";
-  // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0
-  // CHECK: store i32 1, i32* [[PTR0]], align 4
-  // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1
-  // CHECK: store i64 2, i64* [[PTR1]], align 8
-  // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2
-  // CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8
-  // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8*
-  // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]])
+  // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]{{.*}}* [[BUF]], i32 0, i32 0
+  // CHECK: store i32 1, i32{{.*}}* [[PTR0]], align 4
+  // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]{{.*}}* [[BUF]], i32 0, i32 1
+  // CHECK: store i64 2, i64{{.*}}* [[PTR1]], align 8
+  // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]{{.*}}* [[BUF]], i32 0, i32 2
+  // CHECK: store double 3.0{{[^,]*}}, double{{.*}}* [[PTR2]], align 8
+  // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]{{.*}}* [[BUF]] to i8{{.*}}*
+  // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8{{.*}}* [[FMT]], i8{{.*}}* [[BUF_CAST]])
   // CHECK: ret i32 [[RET]]
   return printf(fmt, 1, 2ll, 3.0);
 }
 
+// CHECK-LABEL: define void @_Z11CheckNoArgsv()
 __device__ void CheckNoArgs() {
-  // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
+  // CHECK: call i32 @vprintf({{.*}}, i8{{.*}}* null){{$}}
   printf("hello, world!");
 }
 
 // Check that printf's alloca happens in the entry block, not inside the if
 // statement.
 __device__ bool foo();
+// CHECK-LABEL: define void @_Z25CheckAllocaIsInEntryBlockv()
 __device__ void CheckAllocaIsInEntryBlock() {
   // CHECK: alloca %printf_args
   // CHECK: call {{.*}} @_Z3foov()
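
For context on what these CHECK lines pattern-match: device-side printf is never called variadically; clang packs the arguments into a single stack buffer and forwards the format string plus that buffer to vprintf. A hedged source-level sketch of the lowering applied to CheckSimple above (the helper name is illustrative, not part of the patch):

```c++
// Illustrative CUDA/HIP-flavored sketch; assumes the test's Inputs/cuda.h
// (or real CUDA headers) provides the __device__ macro.
extern "C" __device__ int vprintf(const char *fmt, const char *args);

__device__ int lowered_CheckSimple() {
  const char *fmt = "%d %lld %f";
  struct Args { int a; long long b; double c; }; // SIMPLE_PRINTF_TY
  Args buf = {1, 2ll, 3.0};                      // the stores through PTR0..PTR2
  // BUF_CAST plus the call the CHECK lines match:
  return vprintf(fmt, reinterpret_cast<const char *>(&buf));
}
```

The amdgcn change in this hunk only loosens the pointer patterns with {{.*}} so the same CHECK lines tolerate the address-space-qualified pointers produced on that target.
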
diff --git a/test/CodeGenCXX/amdgcn-global-init.cpp b/test/CodeGenCXX/amdgcn-global-init.cpp
new file mode 100755
index 0000000000..ad38875ffb
--- /dev/null
+++ b/test/CodeGenCXX/amdgcn-global-init.cpp
@@ -0,0 +1,211 @@
+// RUN: %clang_cc1 -std=c++11 -triple=amdgcn-amd-amdhsa -emit-llvm -fexceptions %s -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -triple=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck -check-prefix CHECK-NOEXC %s
+// RUN: %clang_cc1 -std=c++11 -triple=amdgcn-amd-amdhsa -emit-llvm \
+// RUN:     -momit-leaf-frame-pointer -mdisable-fp-elim %s -o - \
+// RUN:   | FileCheck -check-prefix CHECK-FP %s
+
+struct A {
+  A();
+  ~A();
+};
+
+struct B { B(); ~B(); };
+
+struct C { void *field; };
+
+struct D { ~D(); };
+
+// CHECK: @__dso_handle = external hidden addrspace(1) global i8
+// CHECK: @c = addrspace(1) global %struct.C zeroinitializer, align 8
+
+// PR6205: The casts should not require global initializers
+// CHECK: @_ZN6PR59741cE = external addrspace(1) global %"struct.PR5974::C"
+// CHECK: @_ZN6PR59741aE = addrspace(1) global %"struct.PR5974::A"* addrspacecast (%"struct.PR5974::A" addrspace(1)* getelementptr inbounds (%"struct.PR5974::C", %"struct.PR5974::C" addrspace(1)* @_ZN6PR59741cE, i32 0, i32 0) to %"struct.PR5974::A"*), align 8
+// CHECK: @_ZN6PR59741bE = addrspace(1) global %"struct.PR5974::B"* bitcast (i8* getelementptr (i8, i8* addrspacecast (i8 addrspace(1)* bitcast (%"struct.PR5974::C" addrspace(1)* @_ZN6PR59741cE to i8 addrspace(1)*) to i8*), i64 4) to %"struct.PR5974::B"*), align 8
+
+// CHECK: call void @_ZN1AC1Ev(%struct.A* addrspacecast (%struct.A addrspace(1)* @a to %struct.A*))
+// CHECK: call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.A*)* @_ZN1AD1Ev to void (i8*)*), i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds (%struct.A, %struct.A addrspace(1)* @a, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(1)* @__dso_handle to i8*))
+A a;
+
+// CHECK: call void @_ZN1BC1Ev(%struct.B* addrspacecast (%struct.B addrspace(1)* @b to %struct.B*))
+// CHECK: call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.B*)* @_ZN1BD1Ev to void (i8*)*), i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds (%struct.B, %struct.B addrspace(1)* @b, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(1)* @__dso_handle to i8*))
+B b;
+
+// PR6205: this should not require a global initializer
+// CHECK-NOT: call void @_ZN1CC1Ev
+C c;
+
+// CHECK: call i32 @__cxa_atexit(void (i8*)* bitcast (void (%struct.D*)* @_ZN1DD1Ev to void (i8*)*), i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds (%struct.D, %struct.D addrspace(1)* @d, i32 0, i32 0) to i8*), i8* addrspacecast (i8 addrspace(1)* @__dso_handle to i8*))
+D d;
+
+// <rdar://problem/7458115>
+namespace test1 {
+  int f();
+  const int x = f();   // This has side-effects and gets emitted immediately.
+  const int y = x - 1; // This gets deferred.
+  const int z = ~y;    // This also gets deferred, but gets "undeferred" before y.
+  int test() { return z; }
+// CHECK-LABEL:      define i32 @_ZN5test14testEv()
+
+  // All of these initializers end up delayed, so we check them later.
+}
+
+// <rdar://problem/8246444>
+namespace test2 {
+  struct allocator { allocator(); ~allocator(); };
+  struct A { A(const allocator &a = allocator()); ~A(); };
+
+  A a;
+// CHECK: call void @_ZN5test29allocatorC1Ev(
+// CHECK: invoke void @_ZN5test21AC1ERKNS_9allocatorE(
+// CHECK: call void @_ZN5test29allocatorD1Ev(
+// CHECK: call i32 @__cxa_atexit({{.*}} @_ZN5test21AD1Ev {{.*}} @_ZN5test21aE
+}
+
+namespace test3 {
+  // Tested at the beginning of the file.
+  const char * const var = "string";
+  extern const char * const var;
+
+  const char *test() { return var; }
+}
+
+namespace test4 {
+  struct A {
+    A();
+  };
+  extern int foo();
+
+  // This needs an initialization function and guard variables.
+  // CHECK: load i8, i8 addrspace(1)* bitcast (i64 addrspace(1)* @_ZGVN5test41xE to i8 addrspace(1)*)
+  // CHECK: [[CALL:%.*]] = call i32 @_ZN5test43fooEv
+  // CHECK-NEXT: store i32 [[CALL]], i32* addrspacecast (i32 addrspace(1)* @_ZN5test41xE to i32*)
+  // CHECK-NEXT: store i64 1, i64 addrspace(1)* @_ZGVN5test41xE
+  __attribute__((weak)) int x = foo();
+}
+
+namespace PR5974 {
+  struct A { int a; };
+  struct B { int b; };
+  struct C : A, B { int c; };
+
+  extern C c;
+
+  // These should not require global initializers.
+  A* a = &c;
+  B* b = &c;
+}
+
+// PR9570: the indirect field shouldn't crash IR gen.
+namespace test5 {
+  static union {
+    unsigned bar[4096] __attribute__((aligned(128)));
+  };
+}
+
+namespace std { struct type_info; }
+
+namespace test6 {
+  struct A { virtual ~A(); };
+  struct B : A {};
+  extern A *p;
+
+  // We must emit a dynamic initializer for 'q', because it could throw.
+  B *const q = &dynamic_cast<B&>(*p);
+  // CHECK: call void @__cxa_bad_cast()
+  // CHECK: store {{.*}} @_ZN5test6L1qE
+
+  // We don't need to emit 'r' at all, because it has internal linkage, is
+  // unused, and its initialization has no side-effects.
+  B *const r = dynamic_cast<B*>(p);
+  // CHECK-NOT: call void @__cxa_bad_cast()
+  // CHECK-NOT: store {{.*}} @_ZN5test6L1rE
+
+  // This can throw, so we need to emit it.
+  const std::type_info *const s = &typeid(*p);
+  // CHECK: store {{.*}} @_ZN5test6L1sE
+
+  // This can't throw, so we don't.
+  const std::type_info *const t = &typeid(p);
+  // CHECK-NOT: @_ZN5test6L1tE
+
+  extern B *volatile v;
+  // CHECK: store {{.*}} @_ZN5test6L1wE
+  B *const w = dynamic_cast<B*>(v);
+
+  // CHECK: load volatile
+  // CHECK: store {{.*}} @_ZN5test6L1xE
+  const int x = *(volatile int*)0x1234;
+
+  namespace {
+    int a = int();
+    volatile int b = int();
+    int c = a;
+    int d = b;
+    // CHECK-NOT: store {{.*}} @_ZN5test6{{[A-Za-z0-9_]*}}1aE
+    // CHECK-NOT: store {{.*}} @_ZN5test6{{[A-Za-z0-9_]*}}1bE
+    // CHECK-NOT: store {{.*}} @_ZN5test6{{[A-Za-z0-9_]*}}1cE
+    // CHECK: load volatile {{.*}} @_ZN5test6{{[A-Za-z0-9_]*}}1bE
+    // CHECK: store {{.*}} @_ZN5test6{{[A-Za-z0-9_]*}}1dE
+  }
+}
+
+namespace test7 {
+  struct A { A(); };
+  struct B { ~B(); int n; };
+  struct C { C() = default; C(const C&); int n; };
+  struct D {};
+
+  // CHECK: call void @_ZN5test71AC1Ev({{.*}}@_ZN5test7L1aE
+  const A a = A();
+
+  // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN5test71BD1Ev{{.*}} @_ZN5test7L2b1E
+  // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN5test71BD1Ev{{.*}} @_ZGRN5test72b2E
+  // CHECK: call void @_ZN5test71BD1Ev(
+  // CHECK: store {{.*}} @_ZN5test7L2b3E
+  const B b1 = B();
+  const B &b2 = B();
+  const int b3 = B().n;
+
+  // CHECK-NOT: @_ZN5test7L2c1E
+  // CHECK: call void @llvm.memset{{.*}} @_ZN5test7L2c1E
+  // CHECK-NOT: @_ZN5test7L2c1E
+  // CHECK: @_ZN5test7L2c2E
+  // CHECK-NOT: @_ZN5test7L2c3E
+  // CHECK: @_ZN5test7L2c4E
+  const C c1 = C();
+  const C c2 = static_cast<const C&>(C());
+  const int c3 = C().n;
+  const int c4 = C(C()).n;
+
+  // CHECK-NOT: @_ZN5test7L1dE
+  const D d = D();
+
+  // CHECK: store {{.*}} @_ZN5test71eE
+  int f(), e = f();
+}
+
+
+// At the end of the file, we check that y is initialized before z.
+
+// CHECK:      define internal void [[TEST1_Z_INIT:@.*]]()
+// CHECK:        load i32, i32* addrspacecast (i32 addrspace(4)* @_ZN5test1L1yE to i32*)
+// CHECK-NEXT:   xor
+// CHECK-NEXT:   store i32 {{.*}}, i32* addrspacecast (i32 addrspace(4)* @_ZN5test1L1zE to i32*)
+// CHECK:      define internal void [[TEST1_Y_INIT:@.*]]()
+// CHECK:        load i32, i32* addrspacecast (i32 addrspace(4)* @_ZN5test1L1xE to i32*)
+// CHECK-NEXT:   sub
+// CHECK-NEXT:   store i32 {{.*}}, i32* addrspacecast (i32 addrspace(4)* @_ZN5test1L1yE to i32*)
+
+// CHECK: define internal void @_GLOBAL__sub_I_amdgcn_global_init.cpp() #{{[0-9]+}}
+// CHECK:   call void [[TEST1_Y_INIT]]
+// CHECK:   call void [[TEST1_Z_INIT]]
+
+// rdar://problem/8090834: this should be nounwind
+// CHECK-NOEXC: define internal void @_GLOBAL__sub_I_amdgcn_global_init.cpp() [[NUW:#[0-9]+]]
+
+// CHECK-NOEXC: attributes [[NUW]] = { noinline nounwind{{.*}} }
+
+// PR21811: attach the appropriate attribute to the global init function
+// CHECK-FP: define internal void @_GLOBAL__sub_I_amdgcn_global_init.cpp() [[NUX:#[0-9]+]]
+// CHECK-FP: attributes [[NUX]] = { noinline nounwind {{.*}}"no-frame-pointer-elim-non-leaf"{{.*}} }
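
The repeated __cxa_atexit patterns above follow the Itanium C++ ABI: a dynamic initializer constructs the global and immediately registers its destructor to run at exit. A minimal hand-written sketch of that mechanism (illustration only, not the patch's code); the amdgcn-specific part the test checks is just that @a and @__dso_handle live in addrspace(1) and are addrspacecast to generic pointers first:

```c++
#include <new>

// Itanium C++ ABI runtime hooks, declared here for illustration.
extern "C" int __cxa_atexit(void (*dtor)(void *), void *obj, void *dso);
extern "C" char __dso_handle;

struct A { A(); ~A(); };
alignas(A) static unsigned char storage[sizeof(A)];

// Roughly what the compiler-emitted initializer for "A a;" does:
static void init_a() {
  A *a = new (storage) A(); // run the constructor
  __cxa_atexit([](void *p) { static_cast<A *>(p)->~A(); }, a, &__dso_handle);
}
```
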
diff --git a/test/CodeGenHCC/register-control.cpp b/test/CodeGenHCC/register-control.cpp
new file mode 100755
index 0000000000..3f311e4d75
--- /dev/null
+++ b/test/CodeGenHCC/register-control.cpp
@@ -0,0 +1,86 @@
+// RUN: %clang_cc1 -famp-is-device -fhsa-ext -std=c++amp -x hc-kernel -triple amdgcn -target-cpu fiji -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+//
+// This test emulates parallel_for_each without relying on the HCC header files.
+// By using pseudo definitions of some HCC types, the test can generate the trampoline functions
+// that are needed for testing the register-control attributes.
+// The objective is to focus on language aspects without pulling in unnecessary declarations from the header files.
+
+class accelerator_view { int dummy; };
+class extent { int dummy; };
+struct index {
+  index() __attribute__((annotate("__cxxamp_opencl_index"))){}
+  int x;
+};
+
+struct array {
+  int x;
+  void foo() restrict(amp) {}
+};
+
+template <typename Kernel>
+__attribute__((noinline,used)) void parallel_for_each(
+    const accelerator_view& av, const extent& compute_domain, const Kernel& f) [[hc]] {
+  auto foo = &Kernel::__cxxamp_trampoline;
+  auto bar = &Kernel::operator();
+}
+
+int* foo(int x)[[hc]];
+
+int main() {
+  int x[10];
+
+  accelerator_view acc;
+  extent ext;
+  array arr;
+
+  // Test parallel_for_each with a functor.
+  class A {
+  public:
+    void foo()restrict(amp){}
+    // CHECK-LABEL: define internal amdgpu_kernel void @_ZZ4mainEN1A19__cxxamp_trampolineEi(i32)
+    // CHECK-SAME: #[[ATTR2:[0-9]+]]
+    void operator()(index& i)
+    [[hc]]
+    [[hc_waves_per_eu(3)]]
+    [[hc_flat_workgroup_size(1,1)]]
+    [[hc_flat_workgroup_size(2,2,"gfx700")]]
+    [[hc_flat_workgroup_size(3,3,"gfx701")]]
+    [[hc_flat_workgroup_size(7,7,"gfx803")]]
+    [[hc_flat_workgroup_size(4,4,"gfx800")]]
+    [[hc_flat_workgroup_size(5,5,"gfx801")]]
+    [[hc_flat_workgroup_size(6,6,"gfx802")]]
+    [[hc_max_workgroup_dim(4,5,6)]]
+    { x = i.x; }
+    int x;
+  } a;
+
+  parallel_for_each(acc, ext, a);
+
+  // Test parallel_for_each with a lambda.
+  // CHECK-LABEL: define internal amdgpu_kernel void @"_ZZ4mainEN3$_019__cxxamp_trampolineEP5array"(%struct.array*)
+  // CHECK-SAME: #[[ATTR3:[0-9]+]]
+  parallel_for_each(acc, ext, [&](index& i)
+      [[hc]]
+      [[hc_waves_per_eu(4)]]
+      [[hc_flat_workgroup_size(5)]]
+      [[hc_max_workgroup_dim(6,7,8)]]
+      {
+        arr.x = 123;
+      });
+
+  // Test parallel_for_each with a second lambda.
+  // CHECK-LABEL: define internal amdgpu_kernel void @"_ZZ4mainEN3$_119__cxxamp_trampolineEP5array"(%struct.array*)
+  // CHECK-SAME: #[[ATTR4:[0-9]+]]
+  parallel_for_each(acc, ext, [&](index& i)
+      [[hc]]
+      [[hc_max_workgroup_dim(3,4,5)]]
+      {
+        arr.x = 123;
+      });
+
+  return 0;
+}
+
+// CHECK: attributes #[[ATTR2]] ={{.*}}"amdgpu-flat-work-group-size"="7,7" "amdgpu-max-work-group-dim"="4,5,6" "amdgpu-waves-per-eu"="3"
+// CHECK: attributes #[[ATTR3]] ={{.*}}"amdgpu-flat-work-group-size"="5" "amdgpu-max-work-group-dim"="6,7,8" "amdgpu-waves-per-eu"="4"
+// CHECK: attributes #[[ATTR4]] ={{.*}}"amdgpu-flat-work-group-size"="1,60" "amdgpu-max-work-group-dim"="3,4,5"
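
(Two hedged readings that make the attribute lines less surprising: ATTR2 picks the "7,7" range because the RUN line's -target-cpu fiji corresponds to gfx803, so the hc_flat_workgroup_size(7,7,"gfx803") entry wins over the other per-target entries; and ATTR4 carries "amdgpu-flat-work-group-size"="1,60" even though that kernel only sets hc_max_workgroup_dim(3,4,5), evidently because the flat upper bound is derived from the dimension cap: 3 * 4 * 5 = 60.)
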
diff --git a/test/CodeGenObjCXX/msabi-protocol-conformance.mm b/test/CodeGenObjCXX/msabi-protocol-conformance.mm
new file mode 100755
index 0000000000..9f668d1f27
--- /dev/null
+++ b/test/CodeGenObjCXX/msabi-protocol-conformance.mm
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple thumbv7-windows-msvc -fobjc-runtime=ios-6.0 -o - -emit-llvm %s | FileCheck %s
+
+@protocol P;
+@protocol Q;
+
+@class I;
+
+void f(id<P>, id, id<P>, id) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@@@PAUobjc_object@@01@Z"
+
+void f(id, id<P>, id<P>, id) {}
+// CHECK-LABEL: "\01?f@@YAXPAUobjc_object@@PAU?$objc_object@YP@@@@10@Z"
+
+void f(id<P>, id<P>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@@@0@Z"
+
+void f(id<P>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@@@@Z"
+
+void f(id<P, Q>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_object@YP@@YQ@@@@@Z"
+
+void f(Class<P>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_class@YP@@@@@Z"
+
+void f(Class<P, Q>) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$objc_class@YP@@YQ@@@@@Z"
+
+void f(I<P> *) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$I@YP@@@@@Z"
+
+void f(I<P, Q> *) {}
+// CHECK-LABEL: "\01?f@@YAXPAU?$I@YP@@YQ@@@@@Z"
+
diff --git a/test/CodeGenOpenCL/amdgpu-call-kernel.cl b/test/CodeGenOpenCL/amdgpu-call-kernel.cl
index 005793916c..97ae8eb7cd 100755
--- a/test/CodeGenOpenCL/amdgpu-call-kernel.cl
+++ b/test/CodeGenOpenCL/amdgpu-call-kernel.cl
@@ -1,6 +1,7 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
-// CHECK: define amdgpu_kernel void @test_call_kernel(i32 addrspace(1)* nocapture %out)
+// TODO: Fix LLVM so that %out gets the nocapture attribute.
+// CHECK: define amdgpu_kernel void @test_call_kernel(i32 addrspace(1)* %out)
 // CHECK: store i32 4, i32 addrspace(1)* %out, align 4
 
 kernel void test_kernel(global int *out)
diff --git a/test/CodeGenOpenCL/convergent.cl b/test/CodeGenOpenCL/convergent.cl
index 193d391ced..577ab07c8a 100644
--- a/test/CodeGenOpenCL/convergent.cl
+++ b/test/CodeGenOpenCL/convergent.cl
@@ -66,7 +66,7 @@ void test_merge_if(int a) {
 // CHECK: br i1 %[[tobool]], label %[[if_end:.+]], label %[[if_then:.+]]
 // CHECK: [[if_then]]:
 // CHECK: tail call spir_func void @f()
-// CHECK-NOT: call spir_func void @convfun()
+// CHECK-NOT: call spir_func void @non_convfun()
 // CHECK-NOT: call spir_func void @g()
 // CHECK: br label %[[if_end]]
 // CHECK: [[if_end]]:
diff --git a/test/Driver/hip-toolchain-no-rdc.hip b/test/Driver/hip-toolchain-no-rdc.hip
index 540b932860..74b53caaaa 100644
--- a/test/Driver/hip-toolchain-no-rdc.hip
+++ b/test/Driver/hip-toolchain-no-rdc.hip
@@ -20,7 +20,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fcuda-allow-variadic-functions" "-fvisibility" "hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[A_BC_803:".*bc"]] "-x" "hip"
@@ -48,7 +48,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fcuda-allow-variadic-functions" "-fvisibility" "hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[A_BC_900:".*bc"]] "-x" "hip"
@@ -92,7 +92,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fcuda-allow-variadic-functions" "-fvisibility" "hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[B_BC_803:".*bc"]] "-x" "hip"
@@ -120,7 +120,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fcuda-allow-variadic-functions" "-fvisibility" "hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[B_BC_900:".*bc"]] "-x" "hip"
diff --git a/test/Driver/hip-toolchain-rdc.hip b/test/Driver/hip-toolchain-rdc.hip
index 15ac5f1931..fc440a085e 100644
--- a/test/Driver/hip-toolchain-rdc.hip
+++ b/test/Driver/hip-toolchain-rdc.hip
@@ -16,7 +16,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fcuda-allow-variadic-functions" "-fvisibility" "hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
@@ -26,7 +26,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fcuda-allow-variadic-functions" "-fvisibility" "hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
@@ -50,7 +50,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
-// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fcuda-allow-variadic-functions"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
@@ -59,7 +59,7 @@
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
-// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fcuda-allow-variadic-functions"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
diff --git a/test/OpenMP/nvptx_parallel_codegen.cpp b/test/OpenMP/nvptx_parallel_codegen.cpp
index cdbc887244..5035afe991 100644
--- a/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=amdgcn -emit-llvm-bc %s -o %t-x86-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // expected-no-diagnostics
diff --git a/test/SemaCXX/amdgpu-wchar.cxx b/test/SemaCXX/amdgpu-wchar.cxx
new file mode 100644
index 0000000000..3d5141fd49
--- /dev/null
+++ b/test/SemaCXX/amdgpu-wchar.cxx
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple amdgcn -std=c++11 %s
+
+typedef __WINT_TYPE__ wint_t;
+
+#if _WIN32
+static_assert(sizeof(wchar_t)==2, "fail");
+static_assert(sizeof(wint_t)==2, "fail");
+#else
+static_assert(sizeof(wchar_t)==4, "fail");
+static_assert(sizeof(wint_t)==4, "fail");
+#endif
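
(In short: the test pins wchar_t and wint_t to 4 bytes on amdgcn, following the 2-byte-on-Windows / 4-byte-elsewhere convention the preprocessor branch encodes; presumably this keeps wide-character layouts consistent between host and device code.)
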
diff --git a/test/SemaCXX/warn-self-assign.cpp b/test/SemaCXX/warn-self-assign.cpp
new file mode 100755
index 0000000000..7d558c6a07
--- /dev/null
+++ b/test/SemaCXX/warn-self-assign.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -fsyntax-only -Wself-assign -verify %s
+
+void f() {
+  int a = 42, b = 42;
+  a = a; // expected-warning{{explicitly assigning}}
+  b = b; // expected-warning{{explicitly assigning}}
+  a = b;
+  b = a = b;
+  a = a = a; // expected-warning{{explicitly assigning}}
+  a = b = b = a;
+  a &= a; // expected-warning{{explicitly assigning}}
+  a |= a; // expected-warning{{explicitly assigning}}
+  a ^= a;
+}
+
+// Dummy type.
+struct S {};
+
+void false_positives() {
+#define OP =
+#define LHS a
+#define RHS a
+  int a = 42;
+  // These shouldn't warn due to the use of the preprocessor.
+  a OP a;
+  LHS = a;
+  a = RHS;
+  LHS OP RHS;
+#undef OP
+#undef LHS
+#undef RHS
+
+  S s;
+  s = s; // Not a builtin assignment operator, no warning.
+
+  // Volatile stores aren't side-effect free.
+  volatile int vol_a;
+  vol_a = vol_a;
+  volatile int &vol_a_ref = vol_a;
+  vol_a_ref = vol_a_ref;
+}
+
+template <typename T> void g() {
+  T a;
+  a = a; // May or may not be a builtin assignment operator, no warning.
+}
+void instantiate() {
+  g<int>();
+  g<S>();
+}
diff --git a/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl b/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
index 89cf8e3a81..73e0fac925 100644
--- a/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
+++ b/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
@@ -327,7 +327,7 @@ void test_conversion(__global int *arg_glob, __local int *arg_loc,
 
   b = var_sub - arg_priv;
 #ifndef GENERIC
-// expected-error-re@-2{{arithmetic operation with operands of type  ('__{{global|constant}} int *' and 'int *') which are pointers to non-overlapping address spaces}}
+// expected-error-re@-2{{arithmetic operation with operands of type  ('__{{global|constant}} int *' and '__private int *') which are pointers to non-overlapping address spaces}}
 #endif
 
   b = var_sub - arg_gen;
diff --git a/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/tools/clang-offload-bundler/ClangOffloadBundler.cpp
index 0c628963a2..68b1c5456d 100644
--- a/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ b/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -973,6 +973,7 @@ int main(int argc, const char **argv) {
     KindIsValid = KindIsValid && StringSwitch<bool>(Kind)
                                      .Case("host", true)
                                      .Case("openmp", true)
+                                     .Case("hcc", true)
                                      .Case("hip", true)
                                      .Default(false);
 
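
(With "hcc" accepted as an offload kind, clang-offload-bundler target strings of the form hcc-<triple> presumably become valid alongside the existing host-..., openmp-..., and hip-... forms.)
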
diff --git a/tools/driver/CMakeLists.txt b/tools/driver/CMakeLists.txt
index 590d708d83..be92053b9c 100644
--- a/tools/driver/CMakeLists.txt
+++ b/tools/driver/CMakeLists.txt
@@ -63,7 +63,7 @@ endif()
 add_dependencies(clang clang-resource-headers)
 
 if(NOT CLANG_LINKS_TO_CREATE)
-  set(CLANG_LINKS_TO_CREATE clang++ clang-cl clang-cpp)
+  set(CLANG_LINKS_TO_CREATE clang++ clang-cl clang-cpp hcc)
 endif()
 
 foreach(link ${CLANG_LINKS_TO_CREATE})
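
(The added hcc entry makes the build create hcc as one more symlink to the clang binary, alongside clang++, clang-cl, and clang-cpp; like those, the driver presumably selects its mode from the name it was invoked under.)
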
