/***
* ==++==
*
* Copyright (c) Microsoft Corporation.  All rights reserved.
*
* ==--==
* =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
*
* amprt.h
*
* Define the C++ interfaces exported by the C++ AMP runtime
*
* =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
****/
#pragma once

#if !(defined (_M_X64) || defined (_M_IX86) || defined (_M_ARM) || defined (_M_ARM64) )
    #error ERROR: C++ AMP runtime is supported only on X64, X86, ARM, and ARM64 architectures.
#endif  

#if defined (_M_CEE)
    #error ERROR: C++ AMP runtime is not supported when compiling /clr.
#endif  

#ifndef __cplusplus
    #error ERROR: C++ AMP runtime is supported only for C++.
#endif  

#if !defined(_CXXAMP)

#if defined(_DEBUG)
    #pragma comment(lib, "vcampd")
#else   // _DEBUG
    #pragma comment(lib, "vcamp")
#endif  // _DEBUG

#endif // _CXXAMP

#if !defined(_CXXAMP)

#define __GPU      restrict(amp,cpu)
#define __GPU_ONLY restrict(amp)
#define __CPU_ONLY

#else

#define __GPU
#define __GPU_ONLY
#define __CPU_ONLY

#endif // _CXXAMP
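
// Illustrative note (not part of the interface): when _CXXAMP is not defined, the macros
// above expand to C++ AMP restriction specifiers. For example, a hypothetical declaration
//
//     void _My_helper(int _X) __GPU;
//
// expands to
//
//     void _My_helper(int _X) restrict(amp,cpu);
//
// while __GPU_ONLY yields restrict(amp) and __CPU_ONLY adds no restriction specifier.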

#include <exception>
#include <unknwn.h>
#include <crtdbg.h>
#include <string>
#include <vector>

#if defined(_CXXAMP)
#include <strsafe.h>
#endif // _CXXAMP

#include <future>
#include <functional>
#include <map>
#include <unordered_map>
#include <set>
#include <unordered_set>
#include <concrt.h>
#include <type_traits>

#if !defined(_AMPIMP)
#define _AMPIMP     __declspec(dllimport)
#endif

#pragma pack(push,8)

// Part of runtime-compiler interface
extern "C" 
{
    // Access mode of fields
    enum _Access_mode
    {
        _No_access = 0,
        _Read_access = (1 << 0),
        _Write_access = (1 << 1),
        _Is_array_mode = (1 << 30),
        _Read_write_access = _Read_access | _Write_access,
    };
}

namespace Concurrency
{
    /// <summary>
    ///     Enumeration type used to denote the various types of access to data.
    /// </summary>
    enum access_type
    {
        access_type_none = 0,
        access_type_read = (1 << 0),
        access_type_write = (1 << 1),
        access_type_read_write = access_type_read | access_type_write,
        access_type_auto = (1 << 31),
    };

// Forward declarations
class accelerator_view;
class accelerator;

namespace details
{
    const size_t ERROR_MSG_BUFFER_SIZE = 1024;

    //  A reference counter to be used as the base class for all reference counted types.
    class _Reference_counter
    {
    public:

        //  Constructor.
        _Reference_counter()  : _M_rc(0) {}

        //  Destructor.
        virtual ~_Reference_counter() {}

        // Add a reference. 
        // Thread-safe.
        size_t _Add_reference()
        {
            return InterlockedIncrement(reinterpret_cast<LONG volatile*>(&_M_rc));
        }

        // Remove a reference. 
        // Thread-safe.
        size_t _Remove_reference()
        {
            _ASSERTE(_M_rc > 0);

            size_t refCount = InterlockedDecrement(reinterpret_cast<LONG volatile*>(&_M_rc));

            if (refCount == 0)
                this->_Release();

            return refCount;
        }

        // Release the counter
        _AMPIMP void _Release();

        // Return the reference count value
        size_t _Get_reference_count()
        {
            return _M_rc;
        }

    private:
        size_t _M_rc;
    };

    // A smart pointer to a reference counted object
    // T must be a type derived from _Reference_counter
    template <class T>
    class _Reference_counted_obj_ptr
    {
    public:
    
        // Constructor
        _Reference_counted_obj_ptr(T* _Ptr = NULL) :  _M_obj_ptr(_Ptr)
        {
            _Init();
        }

        // Copy constructor
        _Reference_counted_obj_ptr(const _Reference_counted_obj_ptr &_Other) : _M_obj_ptr(_Other._M_obj_ptr)
        {
            _Init();
        }

        // Move constructor
        _Reference_counted_obj_ptr(_Reference_counted_obj_ptr &&_Other) : _M_obj_ptr(_Other._M_obj_ptr)
        {
            _Other._M_obj_ptr = nullptr;
            // No change to ref-count
        }

        // Destructor
        ~_Reference_counted_obj_ptr()
        {
            if (_M_obj_ptr != NULL) {
                _UnInitialize(_M_obj_ptr);
            }
        }

        // Assignment operator
        _Reference_counted_obj_ptr& operator=(const _Reference_counted_obj_ptr &_Other)
        {
            if (_M_obj_ptr != _Other._M_obj_ptr)
            {
                T *oldPtr = _M_obj_ptr;
                _M_obj_ptr = _Other._M_obj_ptr;
                _Init();
        
                if (oldPtr != NULL) {
                    _UnInitialize(oldPtr);
                }
            }
            return *this;
        }

        // Move-assignment operator
        _Reference_counted_obj_ptr& operator=(_Reference_counted_obj_ptr &&_Other)
        {
            if (_M_obj_ptr != _Other._M_obj_ptr)
            {
                T *oldPtr = _M_obj_ptr;
                _M_obj_ptr = _Other._M_obj_ptr;
                _Other._M_obj_ptr = nullptr;
                // No change to ref-count of the adopted pointer.

                if (oldPtr != nullptr)
                {
                    _UnInitialize(oldPtr);
                }
            }
            return *this;
        }

        _Ret_ T* operator->() const
        {
            return _M_obj_ptr;
        }

        T& operator*() const
        {
            return *_M_obj_ptr;
        }

        operator T*() const
        {
            return _M_obj_ptr;
        }

        _Ret_ T* _Get_ptr() const
        {
            return _M_obj_ptr;
        }

    private:
        T *_M_obj_ptr;

        void _Init()
        {
            if (_M_obj_ptr == NULL)
                return;

            reinterpret_cast<_Reference_counter*>(_M_obj_ptr)->_Add_reference();
        }

        static void _UnInitialize(_In_ T *_Obj_ptr)
        {
            reinterpret_cast<_Reference_counter*>(_Obj_ptr)->_Remove_reference();
        }
    };
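
    // Illustrative sketch (not part of this header): any type derived from _Reference_counter
    // can be managed by a _Reference_counted_obj_ptr, which adjusts the reference count on
    // construction, assignment, and destruction. _My_counted_type below is hypothetical.
    //
    //     class _My_counted_type : public _Reference_counter
    //     {
    //         // ...
    //     };
    //
    //     {
    //         _Reference_counted_obj_ptr<_My_counted_type> _P1(new _My_counted_type()); // ref count 1
    //         _Reference_counted_obj_ptr<_My_counted_type> _P2 = _P1;                   // ref count 2
    //     } // both smart pointers are destroyed; the count drops to 0 and _Release() is invoked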

    // Forward declarations
    class _Trace;
    class _Amp_runtime_trace;
    class _Buffer;
    class _Texture;
    class _Sampler;
    class _Ubiquitous_buffer;
    class _D3D_interop;
    class _Accelerator_view_impl;
    class _CPU_accelerator_view_impl;
    class _D3D_accelerator_view_impl;
    class _Accelerator_impl;
    class _Event_impl;
    class _DPC_runtime_factory;
    class _View_shape;
    struct _Buffer_descriptor;
    class _Accelerator_view_hasher;
    struct _DPC_shader_blob;
    struct _View_info;

    // The enum specifies the base type for short vector type.
    enum _Short_vector_base_type_id : unsigned int
    {
        _Uint_type = 0,
        _Int_type = 1,
        _Float_type = 2,
        _Unorm_type = 3,
        _Norm_type = 4,
        _Double_type = 5,
        _Invalid_type = 0xFFFFFFFF
    };

    typedef enum _Short_vector_base_type_id _Texture_base_type_id;

} // namespace Concurrency::details

typedef details::_Reference_counted_obj_ptr<details::_Accelerator_view_impl> _Accelerator_view_impl_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Accelerator_impl> _Accelerator_impl_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Buffer> _Buffer_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Texture> _Texture_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Sampler> _Sampler_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Ubiquitous_buffer> _Ubiquitous_buffer_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Event_impl> _Event_impl_ptr;
typedef details::_Reference_counted_obj_ptr<details::_View_shape> _View_shape_ptr;

namespace details
{
    // The _Event class.
    class _Event
    {
        friend class _Buffer;
        friend class _Texture;
        friend class accelerator_view;
        friend class _D3D_accelerator_view_impl;

    public:
        /// <summary>
        ///     Constructor of the _Event.
        /// </summary>
        _AMPIMP _Event();

        /// <summary>
        ///     Destructor of the _Event.
        /// </summary>
        _AMPIMP ~_Event();

        /// <summary>
        ///     Copy constructor
        /// </summary>
        _AMPIMP _Event(const _Event & _Other);

        /// <summary>
        ///     Assignment operator
        /// </summary>
        _AMPIMP _Event & operator=(const _Event & _Other);

        /// <summary>
        ///     Poll whether the _Event has completed or not. Swallows any exceptions
        /// </summary>
        /// <returns>
        ///     true, if the _Event has completed, false otherwise
        /// </returns>
        _AMPIMP bool _Is_finished_nothrow(); 

        /// <summary>
        ///     Poll whether the _Event has completed or not, and throw any exceptions that occur
        /// </summary>
        /// <returns>
        ///     true, if the _Event has completed, false otherwise
        /// </returns>
        _AMPIMP bool _Is_finished(); 

        /// <summary>
        ///     Wait until the _Event completes and throw any exceptions that occur.
        /// </summary>
        _AMPIMP void _Get();

        /// <summary>
        ///     Tells if this is an empty event
        /// </summary>
        /// <returns>
        ///     true, if the _Event is empty
        ///     false, otherwise
        /// </returns>
        _AMPIMP bool _Is_empty() const;

        /// <summary>
        ///     Creates an event which is an ordered collection of this and _Ev
        /// </summary>
        /// <returns>
        ///     The composite event
        /// </returns>
        _AMPIMP _Event _Add_event(_Event _Ev);

        /// <summary>
        ///     Creates an event which is an ordered collection of this and a continuation task
        /// </summary>
        /// <returns>
        ///     The composite event
        /// </returns>
        _AMPIMP _Event _Add_continuation(const std::function<_Event __cdecl ()> &_Continuation_task);

        /// <summary>
        ///     Return true if the other _Event is same as this _Event; false otherwise
        /// </summary>
        _AMPIMP bool operator==(const _Event &_Other) const;

        /// <summary>
        ///     Return false if the other _Event is same as this _Event; true otherwise
        /// </summary>
        _AMPIMP bool operator!=(const _Event &_Other) const;

    private:

        // Private constructor
        _Event(_In_ _Event_impl* _Impl);

        _Event_impl_ptr _M_ptr_event_impl;
    };
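
    // Illustrative sketch (not part of this header): _Event objects compose into ordered
    // collections and can chain continuations. The events _Ev1 and _Ev2 below are hypothetical.
    //
    //     _Event _Combined = _Ev1._Add_event(_Ev2);
    //     _Event _Chained = _Combined._Add_continuation(
    //         std::function<_Event __cdecl ()>([]() { return _Event(); }));
    //     _Chained._Get();    // blocks until both events and the continuation complete,
    //                         // rethrowing any exception that occurred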

    typedef _Buffer_descriptor *_View_key;

    _Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view);
    _Ret_ _Accelerator_impl* _Get_accelerator_impl_ptr(const accelerator& _Accl);
    _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);
    unsigned int _Get_mipmap_levels(const _Texture *_Tex);

    inline bool _Is_valid_access_mode(_Access_mode _Mode)
    {
        if ((_Mode != _Read_access) && 
            (_Mode != _Write_access) &&
            (_Mode != _Read_write_access))
        {
            return false;
        }

        return true;
    }

    // Caution: Do not change this structure definition.
    // This struct is special and is processed by the FE to identify the buffers
    // used in a parallel_for_each and to set up the _M_data_ptr with the appropriate
    // buffer ptr value in the device code.
    typedef struct _Buffer_descriptor
    {
        friend _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);

        // _M_data_ptr points to the raw data underlying the buffer for accessing on host
        mutable void *_M_data_ptr;

    private:
        // _M_buffer_ptr points to a _Ubiquitous_buffer that holds the data in a 1D array.
        // This is private to ensure that all assignments to this data member
        // only happen through public functions which properly manage the
        // ref count of the underlying buffer
        _Ubiquitous_buffer *_M_buffer_ptr;

    public:
        // _M_curr_cpu_access_mode specifies the current access mode of the data on the
        // cpu accelerator_view specified at the time of registration of this view
        _Access_mode _M_curr_cpu_access_mode;

        // _M_type_access_mode specifies the access mode of the overlay type.
        // array_views set it to the appropriate access mode, and for arrays it is
        // always _Is_array_mode.
        _Access_mode _M_type_access_mode;

    public:
        // Public functions

        // Default constructor
        _Buffer_descriptor() __GPU
            : _M_data_ptr(NULL), _M_buffer_ptr(NULL),
            _M_curr_cpu_access_mode(_No_access), _M_type_access_mode(_Is_array_mode)
        {
        }

        _Buffer_descriptor(_In_ void *_Data_ptr, _In_ _Ubiquitous_buffer *_Buffer_ptr,
                           _Access_mode _Curr_cpu_access_mode, _Access_mode _Type_mode) __GPU
            : _M_data_ptr(_Data_ptr), _M_buffer_ptr(NULL),
            _M_curr_cpu_access_mode(_Curr_cpu_access_mode), _M_type_access_mode(_Type_mode)
        {
            _Set_buffer_ptr(_Buffer_ptr);
        }

        // Destructor
        ~_Buffer_descriptor() __GPU
        {
            _Set_buffer_ptr(NULL);
        }

        // Copy constructor
        _Buffer_descriptor(const _Buffer_descriptor &_Other) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_buffer_ptr(NULL),
            _M_curr_cpu_access_mode(_Other._M_curr_cpu_access_mode), _M_type_access_mode(_Other._M_type_access_mode)
        {
            _Set_buffer_ptr(_Other._M_buffer_ptr);
        }

        // Assignment operator
        _Buffer_descriptor& operator=(const _Buffer_descriptor &_Other) __GPU
        {
            if (this != &_Other)
            {
                _M_data_ptr = _Other._M_data_ptr;
                _M_curr_cpu_access_mode = _Other._M_curr_cpu_access_mode;
                _M_type_access_mode = _Other._M_type_access_mode;
                _Set_buffer_ptr(_Other._M_buffer_ptr);
            }

            return *this;
        }

        _Ret_ _Ubiquitous_buffer* _Get_buffer_ptr() const __CPU_ONLY
        {
            return _M_buffer_ptr;
        }

        void _Set_buffer_ptr(_In_opt_ _Ubiquitous_buffer *_Buffer_ptr) __CPU_ONLY
        {
            if (_M_buffer_ptr != _Buffer_ptr)
            {
                if (_M_buffer_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_buffer_ptr)->_Remove_reference();
                }

                _M_buffer_ptr = _Buffer_ptr;

                if (_M_buffer_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_buffer_ptr)->_Add_reference();
                }
            }
        }

#if !defined(_CXXAMP)
        void _Set_buffer_ptr(_In_opt_ _Ubiquitous_buffer *_Buffer_ptr) __GPU_ONLY
        {
            // No need to set the buffer ptr on the GPU
            UNREFERENCED_PARAMETER(_Buffer_ptr);
            _M_buffer_ptr = NULL;
        }
#endif // _CXXAMP

        bool _Is_array() const
        {
            return (_M_type_access_mode == _Is_array_mode);
        }

        _Ret_ _View_key _Get_view_key()
        {
            return this;
        }

        const _View_key _Get_view_key() const
        {
            return ((const _View_key)(this));
        }

        _AMPIMP void _Get_CPU_access(_Access_mode _Requested_mode) const;

    } _Buffer_descriptor;

    // Caution: Do not change this structure definition.
    // This struct is special and is processed by the FE to identify the textures
    // used in a parallel_for_each and to set up the _M_data_ptr with the appropriate
    // texture ptr value in the device code.
    typedef struct _Texture_descriptor
    {
        // _M_data_ptr points to the raw data underlying the texture
        mutable IUnknown *_M_data_ptr;

    private:
        // _M_texture_ptr points to a _Texture that holds the data
        // This is private to ensure that all assignments to this data member
        // only happen through public functions which properly manage the
        // ref count of the underlying texture
        _Texture *_M_texture_ptr;

        // The index of the most detailed (largest in size) mipmap level for the texture (or texture view).
        // This value is always zero for a texture and may be non-zero for texture views.
        unsigned int _M_most_detailed_mipmap_level;

        // Number of accessible mipmap levels for the texture (or texture view).
        // For example, if the texture has 3 mipmap levels ([0, 1, 2]), then a read-only texture view
        // whose most detailed mipmap level is 1 can have 1 or 2 mipmap levels ([1] or [1, 2]).
        // Further texture_views created on top of such a texture view can only narrow down the range of accessible mipmap levels.
        unsigned int _M_view_mipmap_levels;

    public:
        // Public functions

        // Default constructor
        _Texture_descriptor() __GPU
            : _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(0), _M_view_mipmap_levels(0)
        {
            // Enables move constructor
        }

        // Constructor for the texture
        _Texture_descriptor(unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels) __GPU
            : _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(_Most_detailed_mipmap_level), _M_view_mipmap_levels(_View_mipmap_levels)
        {
        }

        // Constructor for the interop texture
        _Texture_descriptor(_In_ _Texture * _Texture_ptr) __CPU_ONLY : _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(0)
        {
            _Set_texture_ptr(_Texture_ptr);

            // Adopt number of mipmap levels from underlying texture object
            _M_view_mipmap_levels = _Get_mipmap_levels(_M_texture_ptr);
        }

        // Destructor
        ~_Texture_descriptor() __GPU
        {
            _Set_texture_ptr(NULL);
        }

        // Copy constructor
        _Texture_descriptor(const _Texture_descriptor &_Other) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_texture_ptr(NULL),
              _M_most_detailed_mipmap_level(_Other._M_most_detailed_mipmap_level), _M_view_mipmap_levels(_Other._M_view_mipmap_levels)
        {
            _Set_texture_ptr(_Other._M_texture_ptr);
        }

        // Copy constructor with ability to redefine mipmap information
        _Texture_descriptor(const _Texture_descriptor &_Other, unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_texture_ptr(NULL),
              _M_most_detailed_mipmap_level(_Most_detailed_mipmap_level), _M_view_mipmap_levels(_View_mipmap_levels)
        {
            _Set_texture_ptr(_Other._M_texture_ptr);
        }

        // Assignment operator
        _Texture_descriptor& operator=(const _Texture_descriptor &_Other) __GPU
        {
            if (this != &_Other)
            {
                _M_data_ptr = _Other._M_data_ptr;
                _Set_texture_ptr(_Other._M_texture_ptr);
                _M_most_detailed_mipmap_level = _Other._M_most_detailed_mipmap_level;
                _M_view_mipmap_levels = _Other._M_view_mipmap_levels;
            }

            return *this;
        }

        // Move constructor
        _Texture_descriptor(_Texture_descriptor &&_Other) __CPU_ONLY
        {
            *this = std::move(_Other);
        }

        bool operator==(const _Texture_descriptor &_Other) const __GPU
        {
            return _M_texture_ptr == _Other._M_texture_ptr 
                && _M_data_ptr == _Other._M_data_ptr
                && _M_most_detailed_mipmap_level == _Other._M_most_detailed_mipmap_level
                && _M_view_mipmap_levels == _Other._M_view_mipmap_levels;
        }

        _Ret_ _Texture* _Get_texture_ptr() const __CPU_ONLY
        {
            _ASSERTE(_M_texture_ptr);
            return _M_texture_ptr;
        }

        unsigned int _Get_most_detailed_mipmap_level() const __GPU
        {
            return _M_most_detailed_mipmap_level;
        }

        unsigned int _Get_view_mipmap_levels() const __GPU
        {
            return _M_view_mipmap_levels;
        }

        void _Set_view_mipmap_levels(unsigned int _View_mipmap_levels) __CPU_ONLY
        {
            _M_view_mipmap_levels = _View_mipmap_levels;
        }

        void _Set_texture_ptr(_In_opt_ _Texture *_Texture_ptr) __CPU_ONLY
        {
            if (_M_texture_ptr != _Texture_ptr)
            {
                if (_M_texture_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_texture_ptr)->_Remove_reference();
                }

                _M_texture_ptr = _Texture_ptr;

                if (_M_texture_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_texture_ptr)->_Add_reference();
                }
            }
        }

#if !defined(_CXXAMP)
        void _Set_texture_ptr(_In_opt_ _Texture *_Texture_ptr) __GPU_ONLY
        {
            // No need to set the texture ptr on the GPU
            UNREFERENCED_PARAMETER(_Texture_ptr);
            _M_texture_ptr = NULL;
        }
#endif // _CXXAMP

        // This helper function is used to determine aliasing and copy violations
        bool _Are_mipmap_levels_overlapping(const _Texture_descriptor *_Other) const __CPU_ONLY
        {
            _ASSERTE(_Other);

            if (this->_Get_texture_ptr() != _Other->_Get_texture_ptr())
            {
                return false;
            }

            return !((_M_most_detailed_mipmap_level < _Other->_M_most_detailed_mipmap_level) ? ((_M_most_detailed_mipmap_level + _M_view_mipmap_levels - 1) < _Other->_M_most_detailed_mipmap_level)
                                                                                             : ((_Other->_M_most_detailed_mipmap_level + _Other->_M_view_mipmap_levels - 1) < _M_most_detailed_mipmap_level));
        }
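
        // Worked example (illustrative): for two views of the same texture with 3 mipmap
        // levels [0, 1, 2], a view covering level [0] and a view covering levels [1, 2]
        // do not overlap (the expression above yields false), while views covering
        // [0, 1] and [1, 2] share level 1 and therefore overlap (it yields true).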

    } _Texture_descriptor;

    // Caution: Do not change this structure definition.
    // This struct is special and is processed by the FE to identify the samplers
    // used in a parallel_for_each.
    typedef struct _Sampler_descriptor
    {
        // _M_data_ptr points to the sampler on accelerator
        mutable void *_M_data_ptr;

    private:
        // _M_sampler_ptr points to a _Sampler that holds the underlying sampler
        // representation. This is private to ensure that all assignments to this data member
        // only happen through public functions which properly manage the
        // ref count of the underlying _Sampler object.
        _Sampler *_M_sampler_ptr;

    public:
        // Public functions

        // Default constructor
        _Sampler_descriptor() __GPU
            : _M_data_ptr(NULL), _M_sampler_ptr(NULL)
        {
        }

        _Sampler_descriptor(_In_ _Sampler * _Sampler_ptr) __GPU
            : _M_data_ptr(NULL), _M_sampler_ptr(NULL)
        {
            _Set_sampler_ptr(_Sampler_ptr);
        }

        // Destructor
        ~_Sampler_descriptor() __GPU
        {
            _Set_sampler_ptr(NULL);
        }

        // Copy constructor
        _Sampler_descriptor(const _Sampler_descriptor &_Other) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_sampler_ptr(NULL)
        {
            _Set_sampler_ptr(_Other._M_sampler_ptr);
        }

        // Assignment operator
        _Sampler_descriptor& operator=(const _Sampler_descriptor &_Other) __GPU
        {
            if (this != &_Other)
            {
                _M_data_ptr = _Other._M_data_ptr;
                _Set_sampler_ptr(_Other._M_sampler_ptr);
            }

            return *this;
        }

        // Move constructor
        _Sampler_descriptor(_Sampler_descriptor &&_Other) __CPU_ONLY
        {
            *this = std::move(_Other);
        }

        bool operator==(const _Sampler_descriptor &_Other) const __GPU
        {
            return _M_sampler_ptr == _Other._M_sampler_ptr && _M_data_ptr == _Other._M_data_ptr;
        }

        _Ret_ _Sampler* _Get_sampler_ptr() const __CPU_ONLY
        {
            return _M_sampler_ptr;
        }

        void _Set_sampler_ptr(_In_opt_ _Sampler *_Sampler_ptr) __CPU_ONLY
        {
            if (_M_sampler_ptr != _Sampler_ptr)
            {
                if (_M_sampler_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_sampler_ptr)->_Remove_reference();
                }

                _M_sampler_ptr = _Sampler_ptr;

                if (_M_sampler_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_sampler_ptr)->_Add_reference();
                }
            }
        }

#if !defined(_CXXAMP)
        void _Set_sampler_ptr(_In_opt_ _Sampler *_Sampler_ptr) __GPU_ONLY
        {
            // No need to set the sampler ptr on the GPU
            UNREFERENCED_PARAMETER(_Sampler_ptr);
            _M_sampler_ptr = NULL;
        }
#endif // _CXXAMP

    } _Sampler_descriptor;

} // namespace Concurrency::details

// Forward declaration
class accelerator;

namespace details
{
    _AMPIMP size_t __cdecl _Get_num_devices();
    _AMPIMP _Ret_ _Accelerator_impl_ptr * __cdecl _Get_devices();
    _AMPIMP accelerator __cdecl _Select_default_accelerator();
    _AMPIMP bool __cdecl _Set_default_accelerator(_Accelerator_impl_ptr _Accl);
    _AMPIMP bool __cdecl _Is_D3D_accelerator_view(const accelerator_view& _Av);
    _AMPIMP void __cdecl _Register_async_event(const _Event &_Ev, const std::shared_future<void> &_Shared_future);
    _AMPIMP _Access_mode __cdecl _Get_recommended_buffer_host_access_mode(const accelerator_view &_Av);
}

/// <summary>
///    Queuing modes supported for accelerator views
/// </summary>
enum queuing_mode {
    queuing_mode_immediate,
    queuing_mode_automatic
}; 

/// <summary>
///     Exception thrown when the C++ AMP runtime encounters an error.
///     This is the base type for all C++ AMP exception types.
/// </summary>
class runtime_exception : public std::exception
{
public:
    /// <summary>
    ///     Construct a runtime_exception exception with a message and an error code
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    /// <param name="_Hresult">
    ///     HRESULT of error that caused this exception
    /// </param>
    _AMPIMP runtime_exception(const char * _Message, HRESULT _Hresult) throw();

    /// <summary>
    ///     Construct a runtime_exception exception with an error code
    /// </summary>
    /// <param name="_Hresult">
    ///     HRESULT of error that caused this exception
    /// </param>
    _AMPIMP explicit runtime_exception(HRESULT _Hresult) throw();

    /// <summary>
    ///     Copy construct a runtime_exception exception
    /// </summary>
    /// <param name="_Other">
    ///     The runtime_exception object to be copied from
    /// </param>
    _AMPIMP runtime_exception(const runtime_exception &_Other) throw();

    /// <summary>
    ///     Assignment operator
    /// </summary>
    /// <param name="_Other">
    ///     The runtime_exception object to be assigned from
    /// </param>
    _AMPIMP runtime_exception &operator=(const runtime_exception &_Other) throw();

    /// <summary>
    ///     Destruct a runtime_exception exception object instance
    /// </summary>
    _AMPIMP virtual ~runtime_exception() throw();

    /// <summary>
    ///     Get the error code that caused this exception
    /// </summary>
    /// <returns>
    ///     HRESULT of error that caused the exception
    /// </returns>
    _AMPIMP HRESULT get_error_code() const throw();

private:
    HRESULT _M_error_code;
}; // class runtime_exception

/// <summary>
///     Exception thrown when an underlying OS/DirectX call fails
///     due to lack of system or device memory 
/// </summary>
class out_of_memory : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an out_of_memory exception with a message
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    _AMPIMP explicit out_of_memory(const char * _Message) throw();

    /// <summary>
    ///     Construct an out_of_memory exception
    /// </summary>
    _AMPIMP out_of_memory () throw();
}; // class out_of_memory
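
// Illustrative sketch (not part of this header): C++ AMP runtime failures surface as
// runtime_exception (or a derived type such as out_of_memory) and carry an HRESULT.
//
//     try {
//         // ... some C++ AMP operation ...
//     }
//     catch (const Concurrency::out_of_memory &) {
//         // insufficient system or device memory
//     }
//     catch (const Concurrency::runtime_exception &_Ex) {
//         HRESULT _Hr = _Ex.get_error_code();   // HRESULT describing the failure
//         // ...
//     }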

namespace direct3d
{
    /// <summary>
    ///     Get the D3D device interface underlying an accelerator_view.
    /// </summary>
    /// <param name="_Av">
    ///     The D3D accelerator_view for which the underlying D3D device interface is returned.
    /// </param>
    /// <returns>
    ///     The IUnknown interface pointer of the D3D device underlying the accelerator_view.
    /// </returns>
    _AMPIMP _Ret_ IUnknown * __cdecl get_device(const accelerator_view &_Av);

    /// <summary>
    ///     Create an accelerator_view from a D3D device interface pointer.
    /// </summary>
    /// <param name="_D3D_device">
    ///     The D3D device interface pointer to create the accelerator_view from.
    /// </param>
    /// <param name="_Qmode">
    ///     The queuing_mode to be used for the newly created accelerator_view.
    ///     This parameter has a default value of queuing_mode_automatic.
    /// </param>
    /// <returns>
    ///     The accelerator_view created from the passed D3D device interface.
    /// </returns>
    _AMPIMP accelerator_view __cdecl create_accelerator_view(_In_ IUnknown *_D3D_device, queuing_mode _Qmode = queuing_mode_automatic);

    /// <summary>
    ///     Create and return a new accelerator view on the specified accelerator.
    /// </summary>
    /// <param name="_Accelerator">
    ///     The accelerator on which the new accelerator_view is to be created.
    /// </param>
    /// <param name="_Disable_timeout">
    ///     A boolean parameter that specifies whether timeout should be disabled
    ///     for the newly created accelerator_view. This corresponds to the
    ///     D3D11_CREATE_DEVICE_DISABLE_GPU_TIMEOUT flag for Direct3D device creation
    ///     and is used to indicate if the operating system should allow workloads
    ///     that take more than 2 seconds to execute, without resetting the device 
    ///     per the Windows timeout detection and recovery mechanism. Use of this flag
    ///     is recommended if you need to perform time consuming tasks on the accelerator_view.
    /// </param>
    /// <param name="_Qmode">
    ///     The queuing_mode to be used for the newly created accelerator_view.
    ///     This parameter has a default value of queuing_mode_automatic.
    /// </param>
    /// <returns>
    ///     The newly created accelerator_view.
    /// </returns>
    _AMPIMP accelerator_view __cdecl create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode = queuing_mode_automatic);

    /// <summary>
    ///     Returns a boolean flag indicating if timeout is disabled
    ///     for the specified accelerator_view. This corresponds to the
    ///     D3D11_CREATE_DEVICE_DISABLE_GPU_TIMEOUT flag for Direct3D device creation.
    /// </summary>
    /// <param name="_Accelerator_view">
    ///     The accelerator_view for which the timeout disabled setting is to be queried.
    /// </param>
    /// <returns>
    ///     A boolean flag indicating if timeout is disabled for the specified accelerator_view.
    /// </returns>
    _AMPIMP bool __cdecl is_timeout_disabled(const accelerator_view& _Accelerator_view);
    
    /// <summary>
    ///     Acquire a lock on an accelerator_view for the purpose of safely performing D3D operations on resources shared 
    ///     with the accelerator_view.  The accelerator_view and all C++ AMP resources associated with this accelerator_view 
    ///     internally take this lock when performing operations and will block while another thread holds the D3D access lock.  
    ///
    ///     This lock is non-recursive: It is undefined behavior to call this function from a thread that already holds the lock.  
    ///     It is undefined behavior to perform operations on the accelerator_view or any data container associated with the 
    ///     accelerator_view from the thread that holds the D3D access lock.
    ///      
    ///     See also scoped_d3d_access_lock, a RAII-style class for a scope-based D3D access lock.
    /// </summary>
    /// <param name="_Av">
    ///     The accelerator_view to lock.
    /// </param>
    _AMPIMP void __cdecl d3d_access_lock(accelerator_view &_Av);

    /// <summary>
    ///     Attempt to acquire the D3D access lock on an accelerator_view without blocking.
    /// </summary>
    /// <param name="_Av">
    ///     The accelerator_view to lock.
    /// </param>
    /// <returns>
    ///     true if the lock was acquired, or false if it is currently held by another thread.
    /// </returns>
    _AMPIMP bool __cdecl d3d_access_try_lock(accelerator_view &_Av);

    /// <summary>
    ///     Release the D3D access lock on the given accelerator_view.  If the calling thread does 
    ///     not hold the lock on the accelerator_view the results are undefined.
    /// </summary>
    /// <param name="_Av">
    ///     The accelerator_view for which the lock is to be released.
    /// </param>
    _AMPIMP void __cdecl d3d_access_unlock(accelerator_view &_Av);

    /// <summary>
    ///     Tag type to indicate the D3D access lock should be adopted rather than
    ///     acquired.
    /// </summary>
    struct adopt_d3d_access_lock_t {};

    /// <summary>
    ///  RAII wrapper for a D3D access lock on an accelerator_view.
    /// </summary>
    class scoped_d3d_access_lock
    {
    public:
        /// <summary> 
        ///     Acquire a D3D access lock on the given accelerator_view.  The lock is released 
        ///     when this object goes out of scope.  Construction will block until the lock
        ///     is acquired.
        /// </summary>
        /// <param name="_Av">
        ///     The accelerator_view to lock.
        /// </param>
        _AMPIMP explicit scoped_d3d_access_lock(accelerator_view &_Av);

        /// <summary>
        ///     Construct a scoped_d3d_access_lock on an accelerator_view for which the lock
        ///     is already held (e.g. was acquired by d3d_access_try_lock).  The D3D access
        ///     lock must already be held by the calling thread and not controlled by any other
        ///     scoped_d3d_access_lock.
        /// </summary>
        /// <param name="_Av">
        ///     The accelerator_view for the lock to adopt.
        /// </param>
        /// <param name="_T">
        ///     The adopt_d3d_access_lock object.
        /// </param>
        _AMPIMP explicit scoped_d3d_access_lock(accelerator_view &_Av, adopt_d3d_access_lock_t _T);

        /// <summary>
        ///     Destructor for scoped_d3d_access_lock: unlock the accelerator_view.
        /// </summary>
        _AMPIMP ~scoped_d3d_access_lock();

        /// <summary>
        ///     Move constructor for scoped_d3d_access_lock:  Take ownership of
        ///     a lock from another scoped_d3d_access_lock.
        /// </summary>
        /// <param name="_Other">
        ///     The scoped_d3d_access_lock from which to move.
        /// </param>
        _AMPIMP scoped_d3d_access_lock(scoped_d3d_access_lock &&_Other);

        /// <summary>
        ///     Move assignment operator for scoped_d3d_access_lock:  Take ownership 
        ///     of a lock from another scoped_d3d_access_lock, releasing the previous
        ///     lock.
        /// </summary>
        /// <param name="_Other">
        ///     The scoped_d3d_access_lock from which to move.
        /// </param>
        /// <returns>
        ///     A reference to this scoped_d3d_access_lock.
        /// </returns>
        _AMPIMP scoped_d3d_access_lock& operator=(scoped_d3d_access_lock &&_Other);

    private:
        // No copy constructor
        scoped_d3d_access_lock(const scoped_d3d_access_lock &_Other);

        // No assignment operator
        scoped_d3d_access_lock & operator=(const scoped_d3d_access_lock &_Other);

        _Accelerator_view_impl_ptr _M_impl;
    };
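
    // Illustrative sketch (not part of this header): the D3D access lock can be held for
    // the duration of a scope, or adopted after a successful try-lock (_Av is a
    // hypothetical accelerator_view).
    //
    //     {
    //         scoped_d3d_access_lock _Lock(_Av);   // blocks until the lock is acquired
    //         // ... perform D3D operations on resources shared with _Av ...
    //     }                                        // lock released here
    //
    //     if (d3d_access_try_lock(_Av))
    //     {
    //         scoped_d3d_access_lock _Adopted(_Av, adopt_d3d_access_lock_t());
    //         // ... released when _Adopted goes out of scope ...
    //     }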
} // namespace direct3d

/// <summary>
///  Class represents an accelerator abstraction for C++ AMP data-parallel devices 
/// </summary>
class accelerator
{
    friend class accelerator_view;

    friend class details::_Ubiquitous_buffer;

    friend _AMPIMP accelerator details::_Select_default_accelerator();

    _AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode /* = queuing_mode_automatic */);

    friend _Ret_ details::_Accelerator_impl* details::_Get_accelerator_impl_ptr(const accelerator& _Accl);

public:

    /// <summary>
    ///     String constant for default accelerator
    /// </summary>
    _AMPIMP static const wchar_t default_accelerator[];

    /// <summary>
    ///     String constant for cpu accelerator
    /// </summary>
    _AMPIMP static const wchar_t cpu_accelerator[];

    /// <summary>
    ///     String constant for direct3d WARP accelerator
    /// </summary>
    _AMPIMP static const wchar_t direct3d_warp[];

    /// <summary>
    ///     String constant for direct3d reference accelerator
    /// </summary>
    _AMPIMP static const wchar_t direct3d_ref[];

    /// <summary>
    ///     Construct an accelerator representing the default accelerator
    /// </summary>
    _AMPIMP accelerator();

    /// <summary>
    ///     Construct an accelerator representing the accelerator with the 
    ///     specified device instance path
    /// </summary>
    explicit accelerator(const std::wstring &_Device_path) : _M_impl(NULL)
    {
        _Init(_Device_path.c_str());
    }

    /// <summary>
    ///     Destructor
    /// </summary>
    _AMPIMP ~accelerator();

    /// <summary>
    ///     Copy constructor
    /// </summary>
    _AMPIMP accelerator(const accelerator &_Other);

    /// <summary>
    ///     Assignment operator
    /// </summary>
    _AMPIMP accelerator &operator=(const accelerator &_Other);

    /// <summary>
    ///     Returns the vector of accelerator objects representing all available accelerators
    /// </summary>
    /// <returns>
    ///     The vector of available accelerators
    /// </returns> 
    static inline std::vector<accelerator> get_all()
    {
        std::vector<accelerator> _AcceleratorVector;
        size_t _NumDevices = details::_Get_num_devices();
        for (size_t _I = 0; (_I < _NumDevices); ++_I)
        {
            _AcceleratorVector.push_back(details::_Get_devices()[_I]);
        }

        return _AcceleratorVector;
    }

    /// <summary>
    ///     Sets the default accelerator to be used for any operation
    ///     that implicitly uses the default accelerator. This method
    ///     only succeeds if the runtime-selected default accelerator
    ///     has not already been used in an operation that implicitly
    ///     uses the default accelerator.
    /// </summary>
    /// <returns>
    ///     A boolean value indicating if the call succeeds in setting 
    ///     the default accelerator
    /// </returns> 
    static inline bool set_default(const std::wstring& _Path)
    {
        accelerator _Accl(_Path);
        return details::_Set_default_accelerator(_Accl._M_impl);
    }

    /// <summary>
    ///     Returns the auto-selection accelerator_view which, when specified as the
    ///     target of a parallel_for_each, causes the runtime to automatically select
    ///     the accelerator_view on which to execute the kernel. For all other purposes,
    ///     the accelerator_view returned by this method is the same as the default
    ///     accelerator_view of the default accelerator.
    /// </summary>
    _AMPIMP static accelerator_view __cdecl get_auto_selection_view();

    /// <summary>
    ///     Returns the system-wide unique device instance path as a std::wstring
    /// </summary>
    std::wstring get_device_path() const 
    {
        return _Get_device_path();
    }

    __declspec(property(get=get_device_path)) std::wstring device_path;

    /// <summary>
    ///     Get the version for this accelerator
    /// </summary>
    _AMPIMP unsigned int get_version() const;
    __declspec(property(get=get_version)) unsigned int version; // hiword=major, loword=minor

    /// <summary>
    ///     Returns the device description as a std::wstring
    /// </summary>
    std::wstring get_description() const
    {
        return _Get_description();
    }

    __declspec(property(get=get_description)) std::wstring description;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     was created with DEBUG layer enabled for extensive error reporting
    /// </summary>
    _AMPIMP bool get_is_debug() const;
    __declspec(property(get=get_is_debug)) bool is_debug;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator is emulated. 
    ///     This is true, for example, with the direct3d reference and WARP accelerators.
    /// </summary>
    _AMPIMP bool get_is_emulated() const;
    __declspec(property(get=get_is_emulated)) bool is_emulated;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     is attached to a display
    /// </summary>
    _AMPIMP bool get_has_display() const;
    __declspec(property(get=get_has_display)) bool has_display;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     supports full double precision (including double division,
    ///     precise_math functions, int to double, double to int conversions)
    ///     in a parallel_for_each kernel.
    /// </summary>
    _AMPIMP bool get_supports_double_precision() const;
    __declspec(property(get=get_supports_double_precision)) bool supports_double_precision;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     has limited double precision support (excludes double division,
    ///     precise_math functions, int to double, double to int conversions)
    ///     for a parallel_for_each kernel.
    /// </summary>
    _AMPIMP bool get_supports_limited_double_precision() const;
    __declspec(property(get=get_supports_limited_double_precision)) bool supports_limited_double_precision;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     supports memory accessible both by the accelerator and the CPU.
    /// </summary>
    _AMPIMP bool get_supports_cpu_shared_memory() const;
    __declspec(property(get=get_supports_cpu_shared_memory)) bool supports_cpu_shared_memory;

    /// <summary>
    ///     Return the default accelerator view associated with this accelerator
    /// </summary>
    _AMPIMP accelerator_view get_default_view() const;
    __declspec(property(get=get_default_view)) accelerator_view default_view;

    /// <summary>
    ///     Get the dedicated memory for this accelerator in KB
    /// </summary>
    _AMPIMP size_t get_dedicated_memory() const;
    __declspec(property(get=get_dedicated_memory)) size_t dedicated_memory;

    /// <summary>
    ///     Get the default cpu access_type for buffers created on this accelerator
    /// </summary>
    _AMPIMP access_type get_default_cpu_access_type() const;
    __declspec(property(get=get_default_cpu_access_type)) access_type default_cpu_access_type;

    /// <summary>
    ///     Set the default cpu access_type for arrays created on this accelerator
    ///     or for implicit memory allocations as part of array_views accessed
    ///     on this accelerator. This method only succeeds if the default_cpu_access_type
    ///     for the accelerator has not already been overridden by a previous call to this method
    ///     and the runtime-selected default_cpu_access_type for this accelerator has not yet
    ///     been used for allocating an array or for an implicit memory allocation backing an
    ///     array_view accessed on this accelerator.
    /// </summary>
    /// <param name="_Default_cpu_access_type">
    ///     The default cpu access_type to be used for array/array_view memory allocations
    ///     on this accelerator.
    /// </param>
    /// <returns>
    ///     A boolean value indicating if the default cpu access_type for the accelerator
    ///     was successfully set.
    /// </returns>
    _AMPIMP bool set_default_cpu_access_type(access_type _Default_cpu_access_type);

    /// <summary>
    ///     Create and return a new accelerator view on this accelerator
    ///     with the specified queuing mode. When unspecified the accelerator_view 
    ///     is created with queuing_mode_automatic queuing mode.
    /// </summary>
    _AMPIMP accelerator_view create_view(queuing_mode qmode = queuing_mode_automatic);

    /// <summary>
    ///     Return true if the other accelerator is same as this accelerator; false otherwise
    /// </summary>
    _AMPIMP bool operator==(const accelerator &_Other) const;

    /// <summary>
    ///     Return false if the other accelerator is same as this accelerator; true otherwise
    /// </summary>
    _AMPIMP bool operator!=(const accelerator &_Other) const;

private:

    // Private constructor
    _AMPIMP accelerator(_Accelerator_impl_ptr _Impl);

    // Private helper methods
    _AMPIMP const wchar_t *_Get_device_path() const;
    _AMPIMP const wchar_t *_Get_description() const;

    _AMPIMP void _Init(const wchar_t *_Path);

private:

    _Accelerator_impl_ptr _M_impl;
};
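
// Illustrative sketch (not part of this header): enumerating the available accelerators
// and selecting a default by device path.
//
//     std::vector<accelerator> _Accls = accelerator::get_all();
//     for (const accelerator &_A : _Accls)
//     {
//         std::wstring _Path = _A.device_path;
//         bool _Emulated = _A.is_emulated;
//         // ...
//     }
//
//     // Succeeds only if the runtime-selected default accelerator has not been used yet.
//     bool _Did_set = accelerator::set_default(accelerator::direct3d_warp);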

/// <summary>
///  Class represents a future corresponding to a C++ AMP asynchronous operation
/// </summary>
class completion_future
{
    friend class details::_Amp_runtime_trace;
public:

    /// <summary>
    ///     Default constructor
    /// </summary>
    completion_future()
    {
    }

    /// <summary>
    ///     Copy constructor
    /// </summary>
    completion_future(const completion_future& _Other)
        : _M_shared_future(_Other._M_shared_future),
        _M_task(_Other._M_task)
    {
    }

    /// <summary>
    ///     Move constructor
    /// </summary>
    completion_future(completion_future&& _Other)
        : _M_shared_future(std::move(_Other._M_shared_future)),
        _M_task(std::move(_Other._M_task))
    {
    }

    /// <summary>
    ///     Destructor
    /// </summary>
    ~completion_future()
    {
    }

    /// <summary>
    ///     Copy assignment operator
    /// </summary>
    completion_future& operator=(const completion_future& _Other)
    {
        if (this != &_Other) {
            _M_shared_future = _Other._M_shared_future;
            _M_task = _Other._M_task;
        }

        return (*this);
    }

    /// <summary>
    ///     Move assignment operator
    /// </summary>
    completion_future& operator=(completion_future&& _Other)
    {
        if (this != &_Other) {
            _M_shared_future = std::move(_Other._M_shared_future);
            _M_task = std::move(_Other._M_task);
        }

        return (*this);
    }
    
    /// <summary>
    ///     Waits until the associated asynchronous operation completes.
    ///     Throws the stored exception if one was encountered during the
    ///     asynchronous operation.
    /// </summary>
    void get() const
    {
        _M_shared_future.get();
    }
    
    /// <summary>
    ///     Returns true if the object is associated with an asynchronous
    ///     operation
    /// </summary>
    /// <returns>
    ///     true if the object is associated with an asynchronous operation
    ///     and false otherwise
    /// </returns>
    bool valid() const
    {
        return _M_shared_future.valid();
    }
    
    /// <summary>
    ///     Blocks until the associated asynchronous operation completes
    /// </summary>
    void wait() const
    {
        _M_shared_future.wait();
    }

    /// <summary>
    ///     Blocks until the associated asynchronous operation completes or
    ///     _Rel_time has elapsed
    /// </summary>
    /// <returns>
    ///     - future_status::deferred if the associated asynchronous operation is not running
    ///     - future_status::ready if the associated asynchronous operation is finished
    ///     - future_status::timeout if the time period specified has elapsed
    /// </returns>
    template <class _Rep, class _Period>
    std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const
    {
        return _M_shared_future.wait_for(_Rel_time);
    }

    /// <summary>
    ///     Blocks until the associated asynchronous operation completes or
    ///     until the current time exceeds _Abs_time
    /// </summary>
    /// <returns>
    ///     - future_status::deferred if the associated asynchronous operation is not running
    ///     - future_status::ready if the associated asynchronous operation is finished
    ///     - future_status::timeout if the time point specified has been reached
    /// </returns>
    template <class _Clock, class _Duration>
    std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const
    {
        return _M_shared_future.wait_until(_Abs_time);
    }

    /// <summary>
    ///     Returns a std::shared_future&lt;void&gt; object corresponding to the 
    ///     associated asynchronous operation
    /// </summary>
    /// <returns>
    ///     A std::shared_future&lt;void&gt; object corresponding to the associated
    ///     asynchronous operation
    /// </returns>
    operator std::shared_future<void>() const
    {
        return _M_shared_future;
    }
    
    /// <summary>
    ///     Chains a callback Functor to the completion_future to be executed
    ///     when the associated asynchronous operation finishes execution
    /// </summary>
    template <typename _Functor>
    void then(const _Functor &_Func) const
    {
        this->to_task().then(_Func);
    }

    /// <summary>
    ///     Returns a concurrency::task&lt;void&gt; object corresponding to the 
    ///     associated asynchronous operation
    /// </summary>
    /// <returns>
    ///     A concurrency::task&lt;void&gt; object corresponding to the associated
    ///     asynchronous operation
    /// </returns>
    concurrency::task<void> to_task() const
    {
        return _M_task;
    }

private:

    // Private constructor
    completion_future(const std::shared_future<void> &_Shared_future,
                      const concurrency::task<void>& _Task)
                      : _M_shared_future(_Shared_future), _M_task(_Task)
    {
    }

    std::shared_future<void> _M_shared_future;
    concurrency::task<void> _M_task;
}; 
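
// Illustrative sketch (not part of this header): a completion_future returned from an
// asynchronous C++ AMP operation (for example, copy_async) can be waited on, polled,
// or chained to a continuation.
//
//     completion_future _Cf = /* some asynchronous operation */;
//     if (_Cf.wait_for(std::chrono::milliseconds(100)) != std::future_status::ready)
//     {
//         _Cf.then([]() { /* runs when the operation finishes */ });
//         _Cf.wait();
//     }
//     _Cf.get();   // rethrows any exception stored by the asynchronous operation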

/// <summary>
///  Class represents a virtual device abstraction on a C++ AMP data-parallel accelerator
/// </summary>
class accelerator_view
{
    friend class accelerator;
    friend class details::_Buffer;
    friend class details::_Texture;
    friend class details::_Sampler;
    friend class details::_Ubiquitous_buffer;
    friend class details::_D3D_interop;
    friend class details::_D3D_accelerator_view_impl;
    friend class details::_CPU_accelerator_view_impl;
    friend class details::_Accelerator_view_hasher;

    _AMPIMP friend _Ret_ IUnknown * __cdecl direct3d::get_device(const accelerator_view &_Av);
    
    _AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(_In_ IUnknown *_D3D_device, queuing_mode qmode /* = queuing_mode_automatic */);

    _AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode /* = queuing_mode_automatic */);

    _AMPIMP friend bool __cdecl direct3d::is_timeout_disabled(const accelerator_view& _Accelerator_view);

    friend _Ret_ details::_Accelerator_view_impl* details::_Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view);

public:

    /// <summary>
    ///     Destructor
    /// </summary>
    _AMPIMP ~accelerator_view();

    /// <summary>
    ///     Copy constructor
    /// </summary>
    _AMPIMP accelerator_view(const accelerator_view &_Other);

    /// <summary>
    ///     Assignment operator
    /// </summary>
    _AMPIMP accelerator_view &operator=(const accelerator_view &_Other);

    /// <summary>
    ///     Get the accelerator for this accelerator view
    /// </summary>
    _AMPIMP accelerator get_accelerator() const;
    __declspec(property(get=get_accelerator)) Concurrency::accelerator accelerator;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator view
    ///     was created with DEBUG layer enabled for extensive error reporting
    /// </summary>
    _AMPIMP bool get_is_debug() const;
    __declspec(property(get=get_is_debug)) bool is_debug;

    /// <summary>
    ///     Get the version for this accelerator view
    /// </summary>
    _AMPIMP unsigned int get_version() const;
    __declspec(property(get=get_version)) unsigned int version; // hiword=major, loword=minor

    /// <summary>
    ///     Get the queuing mode for this accelerator view
    /// </summary>
    _AMPIMP queuing_mode get_queuing_mode() const;
    __declspec(property(get=get_queuing_mode)) Concurrency::queuing_mode queuing_mode;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator view
    ///     when passed to a parallel_for_each would result in automatic
    ///     selection of an appropriate execution target by the runtime
    /// </summary>
    _AMPIMP bool get_is_auto_selection() const;
    __declspec(property(get=get_is_auto_selection)) bool is_auto_selection;

    /// <summary>
    ///     Return true if the other accelerator view is same as this accelerator view; false otherwise
    /// </summary>
    _AMPIMP bool operator==(const accelerator_view &_Other) const;

    /// <summary>
    ///     Return false if the other accelerator view is same as this accelerator view; true otherwise
    /// </summary>
    _AMPIMP bool operator!=(const accelerator_view &_Other) const;

    /// <summary>
    ///     Waits for completion of all commands submitted so far to this accelerator_view
    /// </summary>
    _AMPIMP void wait();

    /// <summary>
    ///     Submit all pending commands queued to this accelerator_view to the accelerator
    ///     for execution.
    /// </summary>
    _AMPIMP void flush();

    /// <summary>
    ///     Return a future to track the completion of all commands submitted so far to this accelerator_view
    /// </summary>
    _AMPIMP concurrency::completion_future create_marker();

private:

    // No default constructor
    accelerator_view();

    // Private constructor
    _AMPIMP accelerator_view(_Accelerator_view_impl_ptr _Impl, bool _Auto_selection = false);

private:

    _Accelerator_view_impl_ptr _M_impl;
    bool _M_auto_selection;
};
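
// Illustrative sketch (not part of this header): flushing and synchronizing with the
// commands queued to an accelerator_view (the work submitted to _Av is hypothetical).
//
//     accelerator_view _Av = accelerator().create_view(queuing_mode_automatic);
//     // ... submit work to _Av, e.g. via parallel_for_each(_Av, ...) ...
//     _Av.flush();                               // submit pending commands without blocking
//     completion_future _Marker = _Av.create_marker();
//     _Marker.wait();                            // or _Av.wait() to block for all commands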

namespace details
{
    inline _Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view)
    {
        return _Accl_view._M_impl;
    }

    inline _Ret_ _Accelerator_impl* _Get_accelerator_impl_ptr(const accelerator& _Accl)
    {
         return _Accl._M_impl;
    }

    // Type defining a hasher for accelerator_view objects
    // for use with std::unordered_set and std::unordered_map
    class _Accelerator_view_hasher
    {
    public:
        size_t operator()(const accelerator_view &_Accl_view) const
        {
            std::hash<_Accelerator_view_impl*> _HashFunctor;
            return _HashFunctor(_Accl_view._M_impl._Get_ptr());
        }
    };

    typedef std::unordered_set<accelerator_view, _Accelerator_view_hasher> _Accelerator_view_unordered_set;

    // Describes the N dimensional shape of a view in a buffer
    class _View_shape : public _Reference_counter
    {
    public:

        _AMPIMP static _Ret_ _View_shape* __cdecl _Create_view_shape(unsigned int _Rank, unsigned int _Linear_offset,
                                                                     const unsigned int *_Base_extent, const unsigned int *_View_offset,
                                                                     const unsigned int *_View_extent, const bool *_Projection_info = NULL);

        _AMPIMP _Ret_ _View_shape* _Get_reduced_shape_for_copy();

        inline unsigned int _Get_rank() const
        {
            return _M_rank;
        }

        inline unsigned int _Get_linear_offset() const
        {
            return _M_linear_offset;
        }

        inline const unsigned int *_Get_base_extent() const
        {
            return _M_base_extent;
        }

        inline const unsigned int *_Get_view_offset() const
        {
            return _M_view_offset;
        }
        inline const unsigned int *_Get_view_extent() const
        {
            return _M_view_extent;
        }

        inline const bool *_Get_projection_info() const
        {
            return _M_projection_info;
        }
        
        inline bool _Is_projection() const
        {
            return _M_projection_info[0];
        }

        inline bool _Is_valid(size_t _Buffer_size) const
        {
            // The end point of the base shape should not be greater than the size of the buffer
            size_t endLinearOffset = _M_linear_offset + _Get_extent_size(_M_rank, _M_base_extent);
            if (endLinearOffset > _Buffer_size) {
                return false;
            }

            return _Is_valid();
        }

        inline unsigned int _Get_view_size() const
        {
            return _Get_extent_size(_M_rank, _M_view_extent);
        }

        inline unsigned int _Get_view_linear_offset() const
        {
            return _Get_linear_offset(_M_view_offset);
        }

        static inline bool
        _Compare_extent_with_elem_size(unsigned int _Rank, const unsigned int *_Extent1, size_t _Elem_size1, const unsigned int *_Extent2, size_t _Elem_size2)
        {
            _ASSERTE((_Rank >= 1) && (_Extent1 != NULL) && (_Extent2 != NULL));

            // The extents should match accounting for the element sizes of the respective buffers
            if ((_Extent1[_Rank - 1] * _Elem_size1) != (_Extent2[_Rank - 1] * _Elem_size2)) 
            {
                return false;
            }

            // Now compare the extent in all but the least significant dimension
            if ((_Rank > 1) && !_Compare_extent(_Rank - 1, _Extent1, _Extent2))
            {
                return false;
            }

            return true;
        }


        static inline bool
        _Compare_extent(unsigned int _Rank, const unsigned int *_Extent1, const unsigned int *_Extent2)
        {
            for (size_t _I = 0; _I < _Rank; ++_I) {
                if (_Extent1[_I] != _Extent2[_I]) {
                    return false;
                }
            }

            return true;
        }

        inline bool _Is_view_linear(unsigned int &_Linear_offset, unsigned int &_Linear_size) const
        {
            // The effective rank for the purpose of determining linearity
            // depends on the highest dimension in which the extent is not 1
            unsigned int _First_dim_with_non_unit_extent = 0;
            while ((_First_dim_with_non_unit_extent < _M_rank) && (_M_view_extent[_First_dim_with_non_unit_extent] == 1)) {
                _First_dim_with_non_unit_extent++;
            }

            unsigned int _Effective_rank = (_M_rank - _First_dim_with_non_unit_extent);

            // It is linear if the effective rank is <= 1 or the base extent
            // and view extent are same in all but the highest dimension with
            // non-unit extent
            if ((_Effective_rank <= 1) ||
                (_Compare_extent(_Effective_rank - 1, &_M_base_extent[_First_dim_with_non_unit_extent + 1], &_M_view_extent[_First_dim_with_non_unit_extent + 1]))) 
            {
                _Linear_offset = _Get_view_linear_offset();
                _Linear_size = _Get_view_size();
                return true;
            }

            return false;
        }
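        // Example for _Is_view_linear (illustrative): for a 2D base extent of {4, 8},
        // a view with offset {1, 0} and extent {2, 8} is linear (two full rows form one
        // contiguous run of 16 elements), whereas a view with offset {1, 2} and extent
        // {2, 4} is not, since each row contributes a separate 4-element run.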

        inline bool _Overlaps(const _View_shape* _Other) const
        {
            if (_Compare_base_shape(_Other))
            {
                // If the base shapes are identical we will do the N-dimensional
                // bounding box overlap test
                
                for (size_t _I = 0; _I < _M_rank; ++_I)
                {
                    if (!_Intervals_overlap(_M_view_offset[_I], _M_view_offset[_I] + _M_view_extent[_I] - 1,
                                            _Other->_M_view_offset[_I], _Other->_M_view_offset[_I] + _Other->_M_view_extent[_I] - 1)) 
                    {
                        return false;
                    }
                }

                return true;
            }
            else
            {
                // The base shapes are different. Check based on linear intervals
                size_t firstStart = _Get_view_linear_offset();
                size_t firstEnd = firstStart + _Get_view_size() - 1;

                size_t secondStart = _Other->_Get_view_linear_offset();
                size_t secondEnd = secondStart + _Other->_Get_view_size() - 1;

                return _Intervals_overlap(firstStart, firstEnd, secondStart, secondEnd);
            }
        }

        inline bool _Subsumes(const _View_shape* _Other) const
        {
            // Subsumption test can only be done for shapes that have the same base shape or 
            // when both have a rank of 1
            if ((_M_rank == 1) && (_Other->_Get_rank() == 1)) 
            {
                size_t thisStart = _Get_view_linear_offset();
                size_t thisEnd = thisStart + _Get_view_size() - 1;

                size_t otherStart = _Other->_Get_view_linear_offset();
                size_t otherEnd = otherStart + _Other->_Get_view_size() - 1;

                return ((otherStart >= thisStart) && (otherEnd <= thisEnd));
            }

            if (!_Compare_base_shape(_Other)) {
                return false;
            }

            if (!_Contains(_Other->_Get_view_offset())) {
                return false;
            }

            std::vector<unsigned int> otherEndPointIndex(_M_rank);
            for (size_t _I = 0; _I < _M_rank; ++_I) {
                otherEndPointIndex[_I] = _Other->_Get_view_offset()[_I] + _Other->_Get_view_extent()[_I] - 1;
            }

            return _Contains(otherEndPointIndex.data());
        }
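        // Example for _Overlaps / _Subsumes (illustrative): with a shared rank-1 base
        // extent of {100}, a view covering elements [10, 50) overlaps a view covering
        // [40, 60) but does not subsume it; it does subsume a view covering [20, 30).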

    private:
        // Private constructor to force construction through the _Create_view_shape method
        _View_shape(unsigned int _Rank, unsigned int _Linear_offset,
                    const unsigned int *_Base_extent, const unsigned int *_View_offset,
                    const unsigned int *_View_extent, const bool *_Projection_info);
        
        virtual ~_View_shape();

        // No default constructor or copy/assignment
        _View_shape();
        _View_shape(const _View_shape &_Other);
        _View_shape(_View_shape &&_Other);
        _View_shape& operator=(const _View_shape &_Other);
        _View_shape& operator=(_View_shape &&_Other);

        // Helper methods
        static bool _Intervals_overlap(size_t _First_start, size_t _First_end,
                                       size_t _Second_start, size_t _Second_end)
        {
            // Order the intervals by their start points
            if (_First_start > _Second_start) {
                size_t temp = _First_start;
                _First_start = _Second_start;
                _Second_start = temp;

                temp = _First_end;
                _First_end = _Second_end;
                _Second_end = temp;
            }

            // The start of the second one must be within the bounds of the first one
            return (_Second_start <= _First_end);
        }

        static unsigned int _Get_extent_size(unsigned int _Rank, const unsigned int *_Extent)
        {
            unsigned int totalExtent = 1;
            for (size_t _I = 0; _I < _Rank; ++_I) {
                totalExtent *= _Extent[_I];
            }

            return totalExtent;
        }

        inline bool _Is_valid() const
        {
            if (_M_rank == 0) {
                return false;
            }

            // Ensure the _M_view_offset + _M_view_extent is within the bounds of _M_base_extent
            size_t viewSize = 1;
            
            for (size_t _I = 0; _I < _M_rank; ++_I)
            {
                viewSize *= _M_view_extent[_I];
                if ((_M_view_offset[_I] + _M_view_extent[_I]) > _M_base_extent[_I]) {
                    return false;
                }
            }

            if (viewSize == 0) {
                return false;
            }

            return true;
        }

        inline bool _Compare_base_shape(const _View_shape* _Other) const
        {
            return ((_M_rank == _Other->_M_rank) &&
                    (_M_linear_offset == _Other->_M_linear_offset) &&
                    _Compare_extent(_M_rank, _M_base_extent, _Other->_M_base_extent));
        }

        // Checks if the element at the specified index
        // is contained within this view shape
        // Assumes the rank of the index is same as the 
        // rank of this view's shape
        inline bool _Contains(const unsigned int* _Element_index) const
        {
            for (size_t _I = 0; _I < _M_rank; ++_I)
            {
                if ((_Element_index[_I] < _M_view_offset[_I]) ||
                    (_Element_index[_I] >= (_M_view_offset[_I] + _M_view_extent[_I]))) 
                {
                    return false;
                }
            }

            return true;
        }

        inline unsigned int _Get_linear_offset(const unsigned int* _Element_index) const
        {
            unsigned int currMultiplier = 1;
            unsigned int linearOffset = _M_linear_offset;
            for (int _I = static_cast<int>(_M_rank - 1); _I >= 0; _I--)
            {
                linearOffset += (currMultiplier * _Element_index[_I]);
                currMultiplier *= _M_base_extent[_I];
            }

            return linearOffset;
        }
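        // Worked example for _Get_linear_offset (illustrative): for a base extent of
        // {4, 8} in row-major order and _M_linear_offset == 0, the element index {2, 3}
        // maps to linear offset 3 * 1 + 2 * 8 == 19.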

    private:       

        unsigned int _M_rank;
        unsigned int _M_linear_offset;
        unsigned int *_M_base_extent;
        unsigned int *_M_view_offset;
        unsigned int *_M_view_extent;
        bool *_M_projection_info;
    };

    // This function creates a new _View_shape object from an existing _View_shape object when the data underlying the view
    // needs to be reinterpreted to use a different element size than the one used by the original view.
    inline
    _Ret_ _View_shape *_Create_reinterpreted_shape(const _View_shape* _Source_shape, size_t _Curr_elem_size, size_t _New_elem_size)
    {
        unsigned int _Rank = _Source_shape->_Get_rank();
        size_t _LinearOffsetInBytes = _Source_shape->_Get_linear_offset() * _Curr_elem_size;
        size_t _BaseLSDExtentInBytes = (_Source_shape->_Get_base_extent())[_Rank - 1] * _Curr_elem_size;
        size_t _ViewLSDOffsetInBytes = (_Source_shape->_Get_view_offset())[_Rank - 1] * _Curr_elem_size;
        size_t _ViewLSDExtentInBytes = (_Source_shape->_Get_view_extent())[_Rank - 1] * _Curr_elem_size;

        _ASSERTE((_LinearOffsetInBytes % _New_elem_size) == 0);
        _ASSERTE((_BaseLSDExtentInBytes % _New_elem_size) == 0);
        _ASSERTE((_ViewLSDOffsetInBytes % _New_elem_size) == 0);
        _ASSERTE((_ViewLSDExtentInBytes % _New_elem_size) == 0);

        size_t _Temp_val = _LinearOffsetInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        unsigned int _New_linear_offset = static_cast<unsigned int>(_Temp_val);

        std::vector<unsigned int> _New_base_extent(_Rank);
        std::vector<unsigned int> _New_view_offset(_Rank);
        std::vector<unsigned int> _New_view_extent(_Rank);
        for (unsigned int i = 0; i < _Rank - 1; ++i) {
            _New_base_extent[i] = (_Source_shape->_Get_base_extent())[i];
            _New_view_offset[i] = (_Source_shape->_Get_view_offset())[i];
            _New_view_extent[i] = (_Source_shape->_Get_view_extent())[i];
        }

        // The extent in the least significant dimension needs to be adjusted
        _Temp_val = _BaseLSDExtentInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        _New_base_extent[_Rank - 1] = static_cast<unsigned int>(_Temp_val);

        _Temp_val = _ViewLSDOffsetInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        _New_view_offset[_Rank - 1] = static_cast<unsigned int>(_Temp_val);

        _Temp_val = _ViewLSDExtentInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        _New_view_extent[_Rank - 1] = static_cast<unsigned int>(_Temp_val);

        return _View_shape::_Create_view_shape(_Rank, _New_linear_offset, _New_base_extent.data(), _New_view_offset.data(), _New_view_extent.data());
    }
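    // Worked example (illustrative): reinterpreting a rank-1 view over 16 elements of a
    // 4-byte type (_Curr_elem_size == 4) as an 8-byte type (_New_elem_size == 8) halves
    // the linear offset, the view offset and the base/view extents in the least
    // significant dimension, e.g. a base extent of {16} becomes {8}.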

    inline _Access_mode _Get_synchronize_access_mode(access_type cpu_access_type)
    {
        switch(cpu_access_type)
        {
        case access_type_auto:
        case access_type_read:
            return _Read_access;
        case access_type_write:
            return _Write_access;
        case access_type_read_write:
            return _Read_write_access;
        case access_type_none:
        default:
            _ASSERTE(false);
            return _No_access;
        }
    }

    inline access_type _Get_cpu_access_type(_Access_mode _Cpu_access_mode)
    {
        access_type _Cpu_access_type = access_type_none;
        if (_Cpu_access_mode & _Read_access) {
            _Cpu_access_type = static_cast<access_type>(_Cpu_access_type | access_type_read);
        }

        if (_Cpu_access_mode & _Write_access) {
            _Cpu_access_type = static_cast<access_type>(_Cpu_access_type | access_type_write);
        }

        return _Cpu_access_type;
    }

    //  Class manages a raw buffer in an accelerator view
    class _Buffer : public _Reference_counter
    {
        friend class _CPU_accelerator_view_impl;
        friend class _D3D_accelerator_view_impl;
        friend class _D3D_temp_staging_cache;

    public:

        // Force construction through these static public methods to ensure that _Buffer
        // objects are allocated in the runtime

        // Allocate a new buffer on the specified accelerator_view
        _AMPIMP static _Ret_ _Buffer * __cdecl _Create_buffer(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view, size_t _Num_elems,
                                                              size_t _Elem_size, bool _Is_temp = false, access_type _Cpu_access_type = access_type_auto);

        // Create a buffer object from a pre-allocated storage on the specified accelerator_view. This can be thought
        // of as the accelerator_view "adopting" the passed data buffer.
        _AMPIMP static _Ret_ _Buffer * __cdecl _Create_buffer(_In_ void *_Data_ptr, accelerator_view _Accelerator_view, size_t _Num_elems,
                                                              size_t _Elem_size);

        // Create a staging buffer on the specified accelerator_view which can be accessed on the CPU upon mapping.
        _AMPIMP static _Ret_ _Buffer * __cdecl _Create_stage_buffer(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
                                                                    size_t _Num_elems, size_t _Elem_size, bool _Is_temp = false);

        // Creates a temp staging buffer of the requested size. This function may create 
        // a staging buffer smaller than the requested size.
        _AMPIMP static _Ret_ _Buffer * __cdecl _Get_temp_staging_buffer(accelerator_view _Av, size_t _Requested_num_elems, size_t _Elem_size);

        // Map a zero-copy or staging buffer for access on the CPU.
        _AMPIMP void _Map_buffer(_Access_mode _Map_type, bool _Wait);

        // Asynchronously map a zero-copy or staging buffer for access on the CPU.
        _AMPIMP _Event _Map_buffer_async(_Access_mode _Map_type);

        // Unmap a zero-copy or staging buffer denying CPU access
        _AMPIMP void _Unmap_buffer();

        //  Copy data to _Dest asynchronously.
        _AMPIMP _Event _Copy_to_async(_Out_ _Buffer * _Dest, size_t _Num_elems, size_t _Src_offset = 0, size_t _Dest_offset = 0);

        //  Copy data to _Dest asynchronously.
        _AMPIMP _Event _Copy_to_async(_Out_ _Buffer * _Dest, _View_shape_ptr _Src_shape, _View_shape_ptr _Dest_shape);

        _AMPIMP accelerator_view _Get_accelerator_view() const;
        _AMPIMP accelerator_view _Get_access_on_accelerator_view() const;

        _AMPIMP void _Register_view(_In_ _View_key _Key);
        _AMPIMP void _Unregister_view(_In_ _View_key _Key);

        // Return the raw data ptr - only an accelerator view implementation can interpret
        // this raw pointer. This method should usually not be used in the AMP header files.
        // _Get_host_ptr is the right way to access the host accessible ptr for a buffer.
        _Ret_ void * _Get_data_ptr() const
        {
            return _M_data_ptr;
        }

        // Returns the host accessible ptr corresponding to the buffer. This would
        // return NULL when the buffer is inaccessible on the CPU
        _Ret_ void * _Get_host_ptr() const
        {
            return _M_host_ptr;
        }

        size_t _Get_elem_size() const
        {
            return _M_elem_size;
        }

        size_t _Get_num_elems() const
        {
            return _M_num_elems;
        }

        _Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl() const
        {
            return _M_accelerator_view;
        }

        _Ret_ _Accelerator_view_impl* _Get_access_on_accelerator_view_impl() const
        {
            return _M_access_on_accelerator_view;
        }

        bool _Owns_data() const 
        {
            return _M_owns_data;
        }

        _AMPIMP bool _Exclusively_owns_data();

        bool _Is_staging() const
        {
            return _M_is_staging;
        }

        _Access_mode _Get_allowed_host_access_mode() const
        {
            return _M_allowed_host_access_mode;
        }

        access_type _Get_allowed_host_access_type() const
        {
            return _Get_cpu_access_type(_M_allowed_host_access_mode);
        }

        bool _Is_host_accessible(_Access_mode _Requested_access_mode) const
        {
            return ((_Get_allowed_host_access_mode() & _Requested_access_mode) == _Requested_access_mode);
        }

        _Access_mode _Get_current_host_access_mode() const
        {
            return _M_current_host_access_mode;
        }

        bool _Is_temp() const
        {
            return _M_is_temp;
        }

        bool _Is_adopted() const
        {
            // Is it adopted from interop?
            return _M_is_adopted;
        }

        bool _Is_buffer() const
        {
            return _M_is_buffer;
        }

        _AMPIMP bool _Is_mappable() const;

    protected:

        // The _Buffer constructor is protected to force construction through the static 
        // _Create_buffer method to ensure the object is allocated in the runtime
        _Buffer(_In_ _Accelerator_view_impl* _Av, _In_ void *_Buffer_data_ptr, _In_ void * _Host_ptr,
                _Access_mode _Allowed_host_access_mode, _Access_mode _Current_host_access_mode, size_t _Num_elems,
                size_t _Elem_size, bool _Owns_data, bool _Is_staging, bool _Is_temp, bool _Is_adopted);

        // protected destructor to force deletion through _Release
        virtual ~_Buffer();

        // No default constructor, copy constructor, or assignment operator
        _Buffer();
        _Buffer(const _Buffer &rhs);
        _Buffer &operator=(const _Buffer &rhs);

        void _Set_host_ptr(_In_ void *_Host_ptr, _Access_mode _Host_access_mode = _No_access)
        {
            _ASSERTE((_Host_ptr == NULL) || (_Host_access_mode != _No_access));

            _M_host_ptr = _Host_ptr;
            if (_Host_ptr == NULL) {
                _M_current_host_access_mode = _No_access;
            }
            else {
                _M_current_host_access_mode = _Host_access_mode;
            }
        }

        void _Set_data_ptr(_In_ IUnknown *_Data_ptr)
        {
            _M_data_ptr = _Data_ptr;
        }

    protected:
        _Accelerator_view_impl_ptr _M_accelerator_view;
        _Accelerator_view_impl_ptr _M_access_on_accelerator_view;
        void * _M_data_ptr;
        void * _M_host_ptr;
        _Access_mode   _M_allowed_host_access_mode;
        _Access_mode _M_current_host_access_mode;
        size_t _M_elem_size;
        size_t _M_num_elems;
        bool   _M_owns_data;
        bool   _M_is_staging;

        // Used to determine how to map the staging buffer after it is involved in a copy
        bool   _M_is_temp;

        bool   _M_is_adopted;
        bool   _M_is_buffer;
    private:
        // A set of view_keys to invalidate whenever the host ptr of a staging buffer is invalidated
        std::unique_ptr<std::unordered_set<_View_key>> _M_view_keys;
        Concurrency::critical_section _M_critical_section;
    };

    //  Class manages a texture in an accelerator view
    class _Texture : public _Buffer
    {
        friend class _CPU_accelerator_view_impl;
        friend class _D3D_accelerator_view_impl;
        friend class _D3D_temp_staging_cache;

    public:

        // Allocate a new texture on the specified accelerator_view
        _AMPIMP static _Ret_ _Texture * __cdecl _Create_texture(accelerator_view _Accelerator_view,
                                                                unsigned int _Rank,
                                                                size_t _Width, size_t _Height, size_t _Depth,
                                                                unsigned int _Mip_levels,
                                                                _Short_vector_base_type_id _Type_id,
                                                                unsigned int _Num_channels,
                                                                unsigned int _Bits_per_channel,
                                                                bool _Is_temp = false);

        // Create a texture object from a pre-allocated storage on the specified accelerator_view. This can be thought
        // of as the accelerator_view "adopting" the passed data buffer.  
        _AMPIMP static _Ret_ _Texture * __cdecl _Adopt_texture(unsigned int _Rank, _Texture_base_type_id _Id,
                                                                _In_ IUnknown *_Data_ptr, accelerator_view _Accelerator_view,
                                                                unsigned int _View_format);

        // Create a staging texture on the specified accelerator_view which can be accessed on the CPU upon mapping.
        _AMPIMP static _Ret_ _Texture * __cdecl _Create_stage_texture(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
                                                                      unsigned int _Rank,
                                                                      size_t _Width, size_t _Height, size_t _Depth,
                                                                      unsigned int _Mip_levels,
                                                                      unsigned int _Format,
                                                                      bool _Is_temp = false);

        // Create a staging texture on the specified accelerator_view which can be accessed on the CPU upon mapping.
        _AMPIMP static _Ret_ _Texture * __cdecl _Create_stage_texture(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
                                                                      unsigned int _Rank,
                                                                      size_t _Width, size_t _Height, size_t _Depth,
                                                                      unsigned int _Mip_levels,
                                                                      _Short_vector_base_type_id _Type_id,
                                                                      unsigned int _Num_channels,
                                                                      unsigned int _Bits_per_channel);

        // Creates a temp staging texture. This function may create 
        // a staging texture smaller than the requested size.
        _AMPIMP static _Ret_ _Texture * __cdecl _Get_temp_staging_texture(accelerator_view _Accelerator_view,
                                                                          unsigned int _Rank,
                                                                          size_t _Width, size_t _Height, size_t _Depth,
                                                                          unsigned int _Mip_levels,
                                                                          unsigned int _Format);

        // Constructs a new texture with the same properties as the given texture.
        _AMPIMP static _Ret_ _Texture * __cdecl _Clone_texture(const _Texture *_Src, const accelerator_view &_Accelerator_view, const accelerator_view &_Associated_av);

        //  Copy data to _Dest asynchronously for textures. The two textures must have been created with 
        //  compatible physical formats.
        _AMPIMP _Event _Copy_to_async(_Out_ _Texture * _Dest, const size_t *_Copy_extent,
                                      const size_t *_Src_offset, const size_t *_Dst_offset,
                                      unsigned int _Src_mipmap_level, unsigned int _Dst_mipmap_level);

        size_t _Get_width(unsigned int _Mip_offset = 0) const
        {
            return (_M_width >> _Mip_offset) ? (_M_width >> _Mip_offset) : 1U;
        }

        size_t _Get_height(unsigned int _Mip_offset = 0) const
        {
            return (_M_height >> _Mip_offset) ? (_M_height >> _Mip_offset) : 1U;
        }

        size_t _Get_depth(unsigned int _Mip_offset = 0) const
        {
            return (_M_depth >> _Mip_offset) ? (_M_depth >> _Mip_offset) : 1U;
        }

        unsigned int _Get_rank() const
        {
            return _M_rank;
        }

        unsigned int _Get_texture_format() const
        {
            return _M_texture_format;
        }

        unsigned int _Get_view_format() const
        {
            return _M_view_format;
        }

        unsigned int _Get_num_channels() const
        {
            return _M_num_channels;
        }

        unsigned int _Get_bits_per_channel() const
        {
            // For texture adopted from interop, return 0.
            return _Is_adopted() ? 0 : _M_bits_per_channel;
        }

        unsigned int _Get_bits_per_element() const
        {
            return _M_bits_per_channel * _M_num_channels;
        }

        unsigned int _Get_data_length(unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels, const size_t *_Extents = nullptr) const  // in bytes
        {
            _ASSERTE(_View_mipmap_levels);
        
            unsigned long long _Bits_per_byte = 8ULL;
            unsigned long long _Total_bytes = 0ULL;
            
            unsigned int _Mip_level = _Most_detailed_mipmap_level;

            // Sum up data length (in bytes) of all mipmap levels in the view
            for (unsigned int _Mip_offset=0; _Mip_offset < _View_mipmap_levels; ++_Mip_offset)
            {
                unsigned long long _Width = 1ULL;
                unsigned long long _Height = 1ULL;
                unsigned long long _Depth = 1ULL;

                if (_Extents)
                {
                    switch (_M_rank)
                    {
                    case 3:
                        _Depth = (_Extents[2] >> _Mip_level) ? (_Extents[2] >> _Mip_level) : 1U;
                        // deliberately fall thru
                    case 2:
                        _Height = (_Extents[1] >> _Mip_level) ? (_Extents[1] >> _Mip_level) : 1U;
                        // deliberately fall thru
                    case 1:
                        _Width = (_Extents[0] >> _Mip_level) ? (_Extents[0] >> _Mip_level) : 1U;
                        break;
                    default:
                        _ASSERTE(false); // textures are only rank 1-3
                    }
                }
                else
                {
                    _Width = _Get_width(_Mip_level);
                    _Height = _Get_height(_Mip_level);
                    _Depth = _Get_depth(_Mip_level);
                }

                // Note _Get_bits_per_element() can be smaller than 8
                // Use unsigned long long to avoid integer overflow
                _Total_bytes += ((_Width * _Height * _Depth * static_cast<unsigned long long>(_Get_bits_per_element())) + _Bits_per_byte - 1) / _Bits_per_byte;

                _Mip_level++;
            }

            return static_cast<unsigned int>(_Total_bytes);
        }

        unsigned int _Get_mip_levels() const
        {
            return _M_mip_levels;
        }

        size_t _Get_row_pitch() const
        {
            return _M_row_pitch;
        }

        void _Set_row_pitch(size_t _Val)
        {
            _M_row_pitch = _Val;
        }

        size_t _Get_depth_pitch() const
        {
            return _M_depth_pitch;
        }

        void _Set_depth_pitch(size_t _Val)
        {
            _M_depth_pitch = _Val;
        }

    private:

        // The _Texture constructor is private to force construction through the static 
        // _Create_texture method to ensure the object is allocated in the runtime
        _Texture(_In_ _Accelerator_view_impl* _Av, _In_ void *_Texture_data_ptr, _In_ void * _Host_ptr,
                 _Access_mode _Allowed_host_access_mode, _Access_mode _Current_host_access_mode,
                 unsigned int _Rank,
                 size_t _Width, size_t _Height, size_t _Depth,
                 unsigned int _Mip_levels,
                 unsigned int _Texture_format,
                 unsigned int _View_format,
                 unsigned int _Num_channels,
                 unsigned int _Bits_per_channel,
                 bool _Owns_data, bool _Is_staging, bool _Is_temp, bool _Is_adopted);

        // Private destructor to force deletion through _Release
        ~_Texture();

        // No default constructor, copy constructor, or assignment operator
        _Texture();
        _Texture(const _Texture &rhs);
        _Texture &operator=(const _Texture &rhs);

        // Texture only
        unsigned int _M_rank;
        size_t _M_width;
        size_t _M_height;
        size_t _M_depth;
        unsigned int _M_texture_format;
        unsigned int _M_view_format;
        unsigned int _M_bits_per_channel;
        unsigned int _M_num_channels;
        unsigned int _M_mip_levels;

        size_t _M_row_pitch;
        size_t _M_depth_pitch;
    };

    class _Sampler : public _Reference_counter
    {
    public:
        // Create a new sampler with configurations exposed by C++ AMP.
        _AMPIMP static _Ret_ _Sampler * __cdecl _Create(
            unsigned int _Filter_mode,
            unsigned int _Address_mode,
            float _Border_r,
            float _Border_g,
            float _Border_b,
            float _Border_a);

        // Create a sampler object given an adopted opaque data pointer
        _AMPIMP static _Ret_ _Sampler * __cdecl _Create(_In_ void *_Data_ptr);

        // Return the raw data ptr - only an accelerator view implementation can interpret
        // this raw pointer. This method should usually not be used in the AMP header files
        _Ret_ void * _Get_data_ptr() const
        {
            return _M_data_ptr;
        }

        bool _Is_adopted() const
        {
            // Is it adopted from interop?
            return _M_is_adopted;
        }

        unsigned int _Get_filter_mode() const
        {
            return _M_filter_mode;
        }

        unsigned int _Get_address_mode() const
        {
            return _M_address_mode;
        }

        const float* _Get_border_color() const
        {
            return &_M_border_color[0];
        }

    private:
        // The _Sampler constructor is private to force construction through the static 
        // _Create method to ensure the object is allocated in the runtime
        _Sampler(unsigned int _Filter_mode, unsigned int _Address_mode, float _Border_r, float _Border_g, float _Border_b, float _Border_a);

        _Sampler(_In_ void *_Data_ptr);

        // Private destructor to force deletion through _Release
        ~_Sampler();

        // No default constructor, copy constructor, or assignment operator
        _Sampler();
        _Sampler(const _Sampler &rhs);
        _Sampler &operator=(const _Sampler &rhs);

        void * _M_data_ptr;
        bool _M_is_adopted;
        unsigned int _M_filter_mode;
        unsigned int _M_address_mode;
        float _M_border_color[4];
    };

    // Forward declaration for copy helper functions
    _AMPIMP _Event __cdecl _Copy_impl(_In_ _Buffer *_Src, size_t _Src_offset,
                                      _Out_ _Buffer * _Dst, size_t _Dest_offset,
                                      size_t _Num_elems, size_t _Preferred_copy_chunk_num_elems = 0);

    _AMPIMP _Event __cdecl _Copy_async_impl(_In_ _Texture *_Src_tex, const size_t *_Src_offset, unsigned int _Src_mipmap_level,
                                            _Out_ _Texture *_Dst_tex, const size_t *_Dst_offset, unsigned int _Dst_mipmap_level,
                                            const size_t *_Copy_extent, const size_t *_Preferred_copy_chunk_extent = NULL);

    inline bool _Get_chunked_staging_texture(_In_ _Texture* _Tex, const size_t *_Copy_chunk_extent, _Inout_ size_t *_Remaining_copy_extent, _Out_ size_t *_Curr_copy_extent, _Out_ _Texture_ptr *_Staging_texture)
    {
        bool _Truncated_copy = false;
        size_t _Allocation_extent[3] = { _Copy_chunk_extent[0], _Copy_chunk_extent[1], _Copy_chunk_extent[2] };
        
        unsigned int _Most_sig_idx = _Tex->_Get_rank() - 1;

        if (_Allocation_extent[_Most_sig_idx] > _Remaining_copy_extent[_Most_sig_idx]) {
            _Allocation_extent[_Most_sig_idx] = _Remaining_copy_extent[_Most_sig_idx];
        }

        _Texture_ptr _Stage = _Texture::_Get_temp_staging_texture(_Tex->_Get_accelerator_view(), _Tex->_Get_rank(),
            _Allocation_extent[0], _Allocation_extent[1], _Allocation_extent[2],
            /*_Mip_levels=*/1, _Tex->_Get_texture_format());

        std::copy(&_Allocation_extent[0], &_Allocation_extent[3], stdext::make_unchecked_array_iterator(&_Curr_copy_extent[0]));
        size_t _Staging_tex_extent[3] = {_Stage->_Get_width(), _Stage->_Get_height(), _Stage->_Get_depth()};
        if (_Curr_copy_extent[_Most_sig_idx] > _Staging_tex_extent[_Most_sig_idx]) {
            _Curr_copy_extent[_Most_sig_idx] = _Staging_tex_extent[_Most_sig_idx];
        }

        // The truncation can however happen only in the most significant dimension and lower
        // dimensions should not get truncated
        if (_Curr_copy_extent[_Most_sig_idx] < _Remaining_copy_extent[_Most_sig_idx]) 
        {
            _Remaining_copy_extent[_Most_sig_idx] -= _Curr_copy_extent[_Most_sig_idx];
            _Truncated_copy = true;
        }

        for (unsigned int _I = 0; _I < _Most_sig_idx; _I++)
        {
            _ASSERTE(_Curr_copy_extent[_I] == _Remaining_copy_extent[_I]);
        }

        *_Staging_texture = _Stage;
        return _Truncated_copy;
    }
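    // Worked example (illustrative): for a rank-3 remaining copy extent of {64, 64, 64}
    // (width, height, depth) and a copy chunk extent of {64, 64, 16}, one call produces
    // a staging texture and a current copy extent of {64, 64, 16} (assuming the runtime
    // does not further shrink the staging texture), reduces the remaining depth to 48,
    // and returns true to indicate that further chunks are still required.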

    #pragma warning ( push )
    #pragma warning ( disable : 6101 )
    // Suppress "warning C6101: Returning uninitialized memory '*_Dst': A successful
    // path through the function does not set the named _Out_ parameter."
    // The callers of _Copy_data_on_host all have a static_assert that _Rank must be 1, 2, or 3 for textures.
    //
    template <typename _Input_iterator, typename _Value_type>
    inline void _Copy_data_on_host(int _Rank, _Input_iterator _Src, _Out_ _Value_type *_Dst,
                                   size_t _Width, size_t _Height, size_t _Depth,
                                   size_t _Dst_row_pitch_in_bytes, size_t _Dst_depth_pitch_in_bytes,
                                   size_t _Src_row_pitch, size_t _Src_depth_pitch)
    {
        switch(_Rank)
        {
        case 1:
            {
                _Input_iterator _End = _Src;
                std::advance(_End, _Width);
                std::copy(_Src, _End, stdext::make_unchecked_array_iterator(_Dst));
            }
            break;
        case 2:
            {
                unsigned char *_Dst_ptr = reinterpret_cast<unsigned char *>(_Dst);
                _Input_iterator _Src_start = _Src;
                for (size_t _I = 0; _I < _Height; _I++)
                {
                    _Input_iterator _Src_end = _Src_start;
                    std::advance(_Src_end, _Width);

                    std::copy(_Src_start, _Src_end, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_Dst_ptr)));

                    _Dst_ptr += _Dst_row_pitch_in_bytes;
                    std::advance(_Src_start, _Src_row_pitch);
                }
            }
            break;
        case 3:
            {
                unsigned char *_Dst_ptr_slice_start = reinterpret_cast<unsigned char *>(_Dst);
                _Input_iterator _Src_depth_slice_start = _Src;
                for (size_t _I = 0; _I < _Depth; _I++)
                {
                    _Input_iterator _Src_start = _Src_depth_slice_start;
                    unsigned char *_Dst_ptr = _Dst_ptr_slice_start;

                    for (size_t _J = 0; _J < _Height; _J++)
                    {
                        _Input_iterator _Src_end = _Src_start;
                        std::advance(_Src_end, _Width);

                        std::copy(_Src_start, _Src_end, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_Dst_ptr)));

                        _Dst_ptr += _Dst_row_pitch_in_bytes;
                        std::advance(_Src_start, _Src_row_pitch);
                    }

                    _Dst_ptr_slice_start += _Dst_depth_pitch_in_bytes;
                    std::advance(_Src_depth_slice_start, _Src_depth_pitch);
                }
            }
            break;
        default:
            _ASSERTE(FALSE);
            break;
        }
    }
    #pragma warning ( pop ) // disable : 6101

    template <typename _Output_iterator, typename _Value_type>
    inline void _Copy_data_on_host(int _Rank, const _Value_type * _Src, _Output_iterator _Dst,
                                   size_t _Width, size_t _Height, size_t _Depth,
                                   size_t _Src_row_pitch_in_bytes, size_t _Src_depth_pitch_in_bytes,
                                   size_t _Dst_row_pitch, size_t _Dst_depth_pitch)
    {
        switch(_Rank)
        {
        case 1:
            {
                const _Value_type * _End = _Src + _Width;
                std::copy(stdext::make_unchecked_array_iterator(_Src), stdext::make_unchecked_array_iterator(_End), _Dst);
            }
            break;
        case 2:
            {
                const unsigned char *_Src_ptr = reinterpret_cast<const unsigned char *>(_Src);
                _Output_iterator _Dst_iter = _Dst;
                for (size_t _I = 0; _I < _Height; _I++)
                {
                    const _Value_type * _Src_end = reinterpret_cast<const _Value_type*>(_Src_ptr) + _Width;

                    std::copy(stdext::make_unchecked_array_iterator(reinterpret_cast<const _Value_type*>(_Src_ptr)), stdext::make_unchecked_array_iterator(_Src_end), _Dst_iter);
                    std::advance(_Dst_iter, _Dst_row_pitch);
                    _Src_ptr += _Src_row_pitch_in_bytes;
                }
            }
            break;
        case 3:
            {
                const unsigned char *_Src_ptr_slice_start = reinterpret_cast<const unsigned char *>(_Src);
                _Output_iterator _Dst_depth_slice_start = _Dst;
                for (size_t _I = 0; _I < _Depth; _I++)
                {
                    _Output_iterator _Dst_iter = _Dst_depth_slice_start;
                    const unsigned char *_Src_ptr = _Src_ptr_slice_start;

                    for (size_t _J = 0; _J < _Height; _J++)
                    {
                        const _Value_type * _Src_end = reinterpret_cast<const _Value_type *>(_Src_ptr) + _Width;
                        
                        std::copy(stdext::make_unchecked_array_iterator(reinterpret_cast<const _Value_type*>(_Src_ptr)), stdext::make_unchecked_array_iterator(_Src_end), _Dst_iter);

                        std::advance(_Dst_iter, _Dst_row_pitch);
                        _Src_ptr += _Src_row_pitch_in_bytes;
                    }

                    _Src_ptr_slice_start += _Src_depth_pitch_in_bytes;
                    std::advance(_Dst_depth_slice_start, _Dst_depth_pitch);
                }
            }
            break;
        default:
            _ASSERTE(FALSE);
            break;
        }
    }
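    // Example for _Copy_data_on_host (illustrative): in both overloads the iterator side
    // advances by a row pitch measured in elements, while the raw-pointer side advances by
    // a row pitch measured in bytes. For a rank-2 copy of 4 x 3 floats that is, e.g.,
    // 4 elements per row vs. 32 bytes per row when the pitched rows are padded, of which
    // only 4 * sizeof(float) == 16 bytes carry element data.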

    _AMPIMP size_t __cdecl _Get_preferred_copy_chunk_size(size_t _Total_copy_size_in_bytes);

    inline size_t _Get_preferred_copy_chunk_num_elems(size_t _Total_num_elems, size_t _Elem_size)
    {
        size_t preferredChunkSize = _Get_preferred_copy_chunk_size(_Total_num_elems * _Elem_size);

        return (preferredChunkSize / _Elem_size);
    }

    inline void _Get_preferred_copy_chunk_extent(unsigned int _Rank, size_t _Width, size_t _Height,
                                                 size_t _Depth, size_t _Bits_per_element, _Out_writes_(3) size_t *_Preferred_copy_chunk_extent)
    {
        _ASSERTE(_Preferred_copy_chunk_extent != nullptr);

        size_t requestedByteSize = static_cast<size_t>((static_cast<unsigned long long>(_Width) *
                                                        static_cast<unsigned long long>(_Height) *
                                                        static_cast<unsigned long long>(_Depth) *
                                                        static_cast<unsigned long long>(_Bits_per_element)) >> 3);

        size_t preferredChunkSize = _Get_preferred_copy_chunk_size(requestedByteSize);

        // Let's align the allocation size to the element size of the texture
        size_t preferredCopyChunkNumElems = static_cast<size_t>((static_cast<unsigned long long>(preferredChunkSize) * 8U) / _Bits_per_element);

        // Let's truncate the dimensions of the requested staging texture.
        // We only truncate in the most significant dimension
        switch (_Rank)
        {
        case 1:
            _Width = preferredCopyChunkNumElems;
            break;
        case 2:
            _Height = (preferredCopyChunkNumElems + _Width - 1) / _Width;
            break;
        case 3:
            _Depth = (preferredCopyChunkNumElems + (_Height * _Width) - 1) / (_Height * _Width);
            break;
        default:
            _ASSERTE(false);
        }

        _Preferred_copy_chunk_extent[0] = _Width;
        _Preferred_copy_chunk_extent[1] = _Height;
        _Preferred_copy_chunk_extent[2] = _Depth;
    }
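    // Worked example (illustrative): for a rank-2 request of 1024 x 1024 elements at
    // 32 bits per element (with _Depth passed as 1), if the runtime's preferred chunk
    // size corresponds to 262144 elements, the extent is truncated in the most
    // significant dimension and the function writes {1024, 256, 1} (width, height,
    // depth) to the output array.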

    // Finds the greatest common divisor of 2 unsigned integral numbers using Euclid's algorithm
    template <typename _T>
    inline _T _Greatest_common_divisor(_T _M, _T _N)
    {
        static_assert(std::is_unsigned<_T>::value, "This GCD function only supports unsigned integral types");

        _ASSERTE((_M > 0) && (_N > 0));

        if (_N > _M) {
            std::swap(_N , _M);
        }

        _T _Temp;
        while (_N > 0)
        {
            _Temp = _N;
            _N = _M % _N;
            _M = _Temp;
        }

        return _M;
    }

    // Finds the least common multiple of 2 unsigned integral numbers using their greatest_common_divisor
    template <typename _T>
    inline _T _Least_common_multiple(_T _M, _T _N)
    {
        static_assert(std::is_unsigned<_T>::value, "This LCM function only supports unsigned integral types");

        _ASSERTE((_M > 0) && (_N > 0));

        _T _Gcd = _Greatest_common_divisor(_M, _N);
        return ((_M / _Gcd) * _N);
    }
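    // Example (illustrative): _Least_common_multiple(6u, 4u) == 12. The copy helpers
    // below use the LCM of the buffer's element size and sizeof(_Value_type) so that
    // every chunk boundary falls on a whole element in both representations.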

    template <typename InputIterator, typename _Value_type>
    inline _Event _Copy_impl(InputIterator _SrcFirst, InputIterator _SrcLast, size_t _NumElemsToCopy,
                             _Out_ _Buffer * _Dst, size_t _Dest_offset, size_t _Preferred_copy_chunk_num_elems = 0)
    {
        if (_NumElemsToCopy == 0) {
            return _Event();
        }

        if (_Dst == NULL) {
            throw runtime_exception("Failed to copy to buffer.", E_INVALIDARG);
        }

#pragma warning ( push )
#pragma warning ( disable : 6001 ) // Using uninitialized memory '*_Dst'
        if (((_NumElemsToCopy * sizeof(_Value_type)) + (_Dest_offset * _Dst->_Get_elem_size())) > (_Dst->_Get_num_elems() * _Dst->_Get_elem_size()))
        {
            throw runtime_exception("Invalid _Src argument(s). _Src size exceeds total size of the _Dest.", E_INVALIDARG);
        }
#pragma warning ( pop )

        _ASSERTE(_NumElemsToCopy == (size_t)(std::distance(_SrcFirst, _SrcLast)));

        // If the dest is host accessible for write then we do the copy on
        // accelerator(accelerator::cpu_accelerator).default_view
        if (_Dst->_Is_host_accessible(_Write_access)) 
        {
            // Let's first map the _Dst buffer
            _Event _Ev = _Dst->_Map_buffer_async(_Write_access);

            // The _Dest is accessible on host. We just need to do a std::copy using a raw pointer as OutputIterator
            _Buffer_ptr _PDestBuf = _Dst;
            _Ev = _Ev._Add_continuation(std::function<_Event()>([_PDestBuf,_Dest_offset, _SrcFirst, _SrcLast]() mutable -> _Event 
            {
                _Value_type *_DestPtr = reinterpret_cast<_Value_type*>(reinterpret_cast<char*>(_PDestBuf->_Get_host_ptr()) + (_Dest_offset * _PDestBuf->_Get_elem_size()));
                std::copy(_SrcFirst, _SrcLast, stdext::make_unchecked_array_iterator(_DestPtr));

                return _Event();
            }));
            
            return _Ev;
        }
        else
        {
            // _Dest is on a device. Let's create a temp staging buffer on the _Dest accelerator_view and copy the input over
            // We may create a staging buffer of size smaller than the copy size and in that case we will perform the copy
            // as a series of smaller copies
            _Buffer_ptr _PDestBuf = _Dst;
            size_t _NumElemsToCopyRemaining = _NumElemsToCopy;
            size_t _PreferredNumElemsToCopyPerChunk = _Preferred_copy_chunk_num_elems;
            if (_PreferredNumElemsToCopyPerChunk == 0) {
                // If a preferred copy chunk size was not specified, let's pick one based on the
                // size of the copy
                _PreferredNumElemsToCopyPerChunk = _Get_preferred_copy_chunk_num_elems(_NumElemsToCopy, sizeof(_Value_type));
            }
            size_t _CurrDstOffset = _Dest_offset;
            InputIterator _CurrStartIter = _SrcFirst;
            _Event _Ev;

            size_t _Lcm = _Least_common_multiple(_Dst->_Get_elem_size(), sizeof(_Value_type));
            size_t _AdjustmentRatio = _Lcm / sizeof(_Value_type);

            do
            {
                size_t _AllocationNumElems = _PreferredNumElemsToCopyPerChunk;
                if (_NumElemsToCopyRemaining < _AllocationNumElems) {
                    _AllocationNumElems = _NumElemsToCopyRemaining;
                }

                _Buffer_ptr _PDestStagingBuf = _Buffer::_Get_temp_staging_buffer(_Dst->_Get_accelerator_view(),
                                                                                 _AllocationNumElems, sizeof(_Value_type));

                _ASSERTE(_PDestStagingBuf != NULL);
                _ASSERTE(_PDestStagingBuf->_Get_elem_size() == sizeof(_Value_type));

                InputIterator _CurrEndIter = _CurrStartIter;
                size_t _CurrNumElemsToCopy = _AllocationNumElems;
                if (_CurrNumElemsToCopy > _PDestStagingBuf->_Get_num_elems()) {
                    _CurrNumElemsToCopy = _PDestStagingBuf->_Get_num_elems();
                }

                if (_NumElemsToCopyRemaining <= _CurrNumElemsToCopy) {
                    _CurrNumElemsToCopy = _NumElemsToCopyRemaining;
                    _CurrEndIter = _SrcLast;
                }
                else
                {
                    // We need to adjust the _CurrNumElemsToCopy to be a multiple of the 
                    // least common multiple of the destination buffer's element size and sizeof(_Value_type).
                    _CurrNumElemsToCopy = (_CurrNumElemsToCopy / _AdjustmentRatio) * _AdjustmentRatio;
                    std::advance(_CurrEndIter, _CurrNumElemsToCopy);
                }

                _ASSERTE((_CurrNumElemsToCopy % _AdjustmentRatio) == 0);

                // This would not actually block since we just created this staging buffer or are using
                // a cached one that is not in use
                _PDestStagingBuf->_Map_buffer(_Write_access, true /* _Wait */);

                // Copy from input to the staging using a raw pointer as OutputIterator
                std::copy(_CurrStartIter, _CurrEndIter, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_PDestStagingBuf->_Get_host_ptr())));

                _Ev = _Ev._Add_event(_PDestStagingBuf->_Copy_to_async(_PDestBuf, _CurrNumElemsToCopy, 0, _CurrDstOffset));

                // Adjust the iterators and offsets
                _NumElemsToCopyRemaining -= _CurrNumElemsToCopy;
                _CurrDstOffset += (_CurrNumElemsToCopy * sizeof(_Value_type)) / _Dst->_Get_elem_size();
                _CurrStartIter = _CurrEndIter;

            } while (_NumElemsToCopyRemaining != 0);

            return _Ev;
        }
    }
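    // Note on the chunked path above (illustrative): assuming the runtime grants staging
    // buffers of 262,144 elements and the destination element size equals sizeof(float),
    // copying 1,000,000 floats proceeds as four staging copies of 262,144, 262,144,
    // 262,144, and 213,568 elements; the returned _Event aggregates completion of all
    // the chunked device-side copies.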

    // std::advance is only guaranteed to work for iterators whose category derives from
    // input_iterator_tag. Hence we have a custom helper that forwards to std::advance for
    // such iterators and falls back to a loop-based advance for pure output iterators.
    template<typename _InputIterator, typename _Distance>
    typename std::enable_if<std::is_base_of<std::input_iterator_tag, typename std::iterator_traits<_InputIterator>::iterator_category>::value>::type
    _Advance_output_iterator(_InputIterator &_Iter, _Distance _N)
    {
        std::advance(_Iter, _N);
    }

    template<typename _OutputIterator, typename _Distance>
    typename std::enable_if<!std::is_base_of<std::input_iterator_tag, typename std::iterator_traits<_OutputIterator>::iterator_category>::value>::type
    _Advance_output_iterator(_OutputIterator &_Iter, size_t _N)
    {
        for (size_t i = 0; i < _N; ++i)
        {
            _Iter++;
        }
    }
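    // Example (illustrative): a std::back_insert_iterator reports output_iterator_tag,
    // which is not derived from input_iterator_tag, so it takes the loop-based overload;
    // a raw pointer or std::vector iterator forwards to std::advance instead.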

    template <typename OutputIterator, typename _Value_type>
    inline _Event _Copy_impl(_In_ _Buffer *_Src, size_t _Src_offset, size_t _Num_elems,
                             OutputIterator _DestIter, size_t _Preferred_copy_chunk_num_elems = 0)
    {
        if ((_Src == NULL) || ((_Src_offset + _Num_elems) > _Src->_Get_num_elems())) {
            throw runtime_exception("Failed to copy to buffer.", E_INVALIDARG);
        }

        if (_Num_elems == 0) {
            return _Event();
        }

        size_t _NumElemsToCopy = (_Num_elems * _Src->_Get_elem_size()) / sizeof(_Value_type);

        // If the src is host accessible for read, then we do the copy on
        // accelerator(accelerator::cpu_accelerator).default_view
        if (_Src->_Is_host_accessible(_Read_access)) 
        {
            // Map the _Src buffer
            _Event _Ev = _Src->_Map_buffer_async(_Read_access);

            // The _Src is accessible on host. We just need to do a std::copy using a raw pointer into the source as the input iterator
            _Buffer_ptr _PSrcBuf = _Src;
            _Ev = _Ev._Add_continuation(std::function<_Event()>([_PSrcBuf, _Src_offset, _DestIter, _NumElemsToCopy]() mutable -> _Event 
            {
                // The _Src is accessible on host. We just need to do a std::copy
                const _Value_type *_PFirst = reinterpret_cast<const _Value_type*>(reinterpret_cast<char*>(_PSrcBuf->_Get_host_ptr()) + (_Src_offset * _PSrcBuf->_Get_elem_size()));
                std::copy(_PFirst, _PFirst + _NumElemsToCopy, _DestIter);

                return _Event();
            }));
            
            return _Ev;
        }
        else
        {
            // The _Src is on the device. We need to copy it out to a temporary staging array
            // We may create a staging buffer of size smaller than the copy size and in that case we will
            // perform the copy as a series of smaller copies

            _Event _Ev;

            _Buffer_ptr _PSrcBuf = _Src;
            size_t _PreferredNumElemsToCopyPerChunk = _Preferred_copy_chunk_num_elems;
            if (_PreferredNumElemsToCopyPerChunk == 0) {
                // If a preferred copy chunk size was not specified, let's pick one based on the
                // size of the copy
                _PreferredNumElemsToCopyPerChunk = _Get_preferred_copy_chunk_num_elems(_NumElemsToCopy, sizeof(_Value_type));
            }

            size_t _AllocationNumElems = _PreferredNumElemsToCopyPerChunk;
            if (_NumElemsToCopy < _AllocationNumElems) {
                _AllocationNumElems = _NumElemsToCopy;
            }

            _Buffer_ptr _PSrcStagingBuf = _Buffer::_Get_temp_staging_buffer(_Src->_Get_accelerator_view(),
                                                                            _AllocationNumElems, sizeof(_Value_type));

            _ASSERTE(_PSrcStagingBuf != NULL);
            _ASSERTE(_PSrcStagingBuf->_Get_elem_size() == sizeof(_Value_type));

            // The total byte size of a copy chunk must be an integral multiple of both the
            // source buffer's element size and sizeof(_Value_type).
            size_t _Lcm = _Least_common_multiple(_Src->_Get_elem_size(), sizeof(_Value_type));
            size_t _AdjustmentRatio = _Lcm / sizeof(_Value_type);

            size_t _CurrNumElemsToCopy = _AllocationNumElems;
            if (_CurrNumElemsToCopy > _PSrcStagingBuf->_Get_num_elems()) {
                _CurrNumElemsToCopy = _PSrcStagingBuf->_Get_num_elems();
            }
            if (_NumElemsToCopy <= _CurrNumElemsToCopy)
            {
                _CurrNumElemsToCopy = _NumElemsToCopy;
            }
            else
            {
                // We need to adjust _CurrNumElemsToCopy to be a multiple of the
                // least common multiple of the source buffer's element size and sizeof(_Value_type).
                _CurrNumElemsToCopy = (_CurrNumElemsToCopy / _AdjustmentRatio) * _AdjustmentRatio;
            }

            _ASSERTE((_CurrNumElemsToCopy % _AdjustmentRatio) == 0);

            size_t _NumElemsToCopyRemaining = _NumElemsToCopy - _CurrNumElemsToCopy;

            _Ev = _PSrcBuf->_Copy_to_async(_PSrcStagingBuf, (_CurrNumElemsToCopy * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size(), _Src_offset, 0);

            if (_NumElemsToCopyRemaining != 0)
            {
                _Ev = _Ev._Add_continuation(std::function<_Event()>([_DestIter, _PSrcBuf, _PSrcStagingBuf,
                                                                     _CurrNumElemsToCopy, _NumElemsToCopyRemaining,
                                                                     _Src_offset, _PreferredNumElemsToCopyPerChunk]() mutable -> _Event 
                {
                    // Initiate an asynchronous copy of the remaining part so that it makes progress
                    // while we complete the copying of the first part
                    size_t _CurrSrcOffset = _Src_offset + ((_CurrNumElemsToCopy * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size());
                    OutputIterator _CurrDestIter = _DestIter;
                    _Advance_output_iterator<decltype(_CurrDestIter), size_t>(_CurrDestIter, _CurrNumElemsToCopy);
                    _Event _Ret_ev = _Copy_impl<OutputIterator, _Value_type>(_PSrcBuf._Get_ptr(), _CurrSrcOffset,
                                                                             (_NumElemsToCopyRemaining * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size(),
                                                                             _CurrDestIter, _PreferredNumElemsToCopyPerChunk);

                    // Now copy the data from staging buffer to the destination
                    _Value_type *_PFirst = reinterpret_cast<_Value_type*>(_PSrcStagingBuf->_Get_host_ptr());
                    std::copy(_PFirst, _PFirst + _CurrNumElemsToCopy, _DestIter);
                    return _Ret_ev;
                }));
            }
            else
            {
                _Ev = _Ev._Add_continuation(std::function<_Event()>([_DestIter, _PSrcStagingBuf, _CurrNumElemsToCopy]() mutable -> _Event 
                {
                    _Value_type *_PFirst = reinterpret_cast<_Value_type*>(_PSrcStagingBuf->_Get_host_ptr());
                    std::copy(_PFirst, _PFirst + _CurrNumElemsToCopy, _DestIter);
                    return _Event();
                }));
            }

            return _Ev;
        }
    }

    // Structured copy between buffers across accelerator_views
    _AMPIMP _Event __cdecl _Copy_impl(_In_ _Buffer *_Src, _View_shape_ptr _Src_shape, _Out_ _Buffer * _Dst, _View_shape_ptr _Dst_shape);

    struct _Array_copy_desc
    {
        _Array_copy_desc(
            const unsigned int _Rank,
            const unsigned int _Src_linear_offset,
            const unsigned int * _Src_extents, 
            const unsigned int * _Src_copy_offset,
            const unsigned int _Dst_linear_offset,
            const unsigned int * _Dst_extents, 
            const unsigned int * _Dst_copy_offset, 
            const unsigned int * _Copy_extents)
        {
            this->_Rank = _Rank;

            this->_Src_linear_offset = _Src_linear_offset;
            this->_Src_extents.assign( _Src_extents, _Src_extents + _Rank);
            this->_Src_copy_offset.assign( _Src_copy_offset, _Src_copy_offset + _Rank);
    
            this->_Dst_linear_offset = _Dst_linear_offset;
            this->_Dst_extents.assign( _Dst_extents, _Dst_extents + _Rank);
            this->_Dst_copy_offset.assign( _Dst_copy_offset, _Dst_copy_offset + _Rank);
    
            this->_Copy_extents.assign( _Copy_extents, _Copy_extents + _Rank);
        }

        _Array_copy_desc() {}
    
        unsigned int _Rank;

        // Shape of source
        unsigned int  _Src_linear_offset;
        std::vector<unsigned int> _Src_extents;
        std::vector<unsigned int> _Src_copy_offset;
        
        // Shape of destination
        unsigned int  _Dst_linear_offset;
        std::vector<unsigned int> _Dst_extents;
        std::vector<unsigned int> _Dst_copy_offset;

        // Shape of copy region
        std::vector<unsigned int> _Copy_extents;
    };
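
    // Illustrative sketch (hypothetical values, not part of the runtime interface): describing
    // a copy of a 2 x 3 sub-rectangle starting at offset (1, 2) of a 4 x 8 source into offset
    // (0, 0) of a 2 x 4 destination. The arrays below exist only to show how the constructor
    // arguments line up with the fields.
    //
    //     const unsigned int _Src_extents[]     = { 4, 8 };
    //     const unsigned int _Src_copy_offset[] = { 1, 2 };
    //     const unsigned int _Dst_extents[]     = { 2, 4 };
    //     const unsigned int _Dst_copy_offset[] = { 0, 0 };
    //     const unsigned int _Copy_extents[]    = { 2, 3 };
    //
    //     _Array_copy_desc _Desc(2 /* _Rank */,
    //                            0 /* _Src_linear_offset */, _Src_extents, _Src_copy_offset,
    //                            0 /* _Dst_linear_offset */, _Dst_extents, _Dst_copy_offset,
    //                            _Copy_extents);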

    // Declaration
    _AMPIMP HRESULT __cdecl _Recursive_array_copy(const _Array_copy_desc& _Desc, 
                                                  unsigned int _Native_copy_rank,
                                                  std::function<HRESULT(const _Array_copy_desc &_Reduced)> _Native_copy_func);
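
    // Illustrative sketch (hypothetical callback, internal use only): _Recursive_array_copy
    // reduces the copy described by _Desc to chunks of rank _Native_copy_rank and invokes
    // _Native_copy_func once per chunk, e.g. once per row for the 2-D descriptor sketched above.
    //
    //     HRESULT _Hr = _Recursive_array_copy(_Desc, 1,
    //         [](const _Array_copy_desc &_Reduced) -> HRESULT {
    //             // _Reduced describes a rank-1 span; copy _Reduced._Copy_extents[0] elements here
    //             return S_OK;
    //         });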

    _AMPIMP std::pair<accelerator_view, accelerator_view> __cdecl _Get_src_dest_accelerator_view(_In_opt_ const _Buffer_descriptor *_SrcBuffDescPtr,
                                                                                                 _In_opt_ const _Buffer_descriptor *_DestBuffDescPtr);

    // Iterator based copy function
    template<typename _InputIterator, typename _OutputIterator>
    inline _Event _Copy_impl_iter(_InputIterator _SrcFirst, _InputIterator _SrcLast, _OutputIterator _DstFirst)
    {
        std::copy(_SrcFirst, _SrcLast, _DstFirst);
        return _Event();
    }

    // Iterator based copy function
    template <typename InputIterator, typename _Value_type>
    inline _Event _Copy_impl(InputIterator _SrcFirst, _View_shape_ptr _Src_shape, _Inout_ _Buffer * _Dst, _View_shape_ptr _Dst_shape)
    {
        _ASSERTE(_Dst != NULL);
        _ASSERTE(_Src_shape != NULL);
        _ASSERTE(_Dst_shape != NULL);

        if (_Src_shape->_Is_projection()) {
            _Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
        }

        if (_Dst_shape->_Is_projection()) {
            _Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
        }

        _ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());

        _ASSERTE(_View_shape::_Compare_extent_with_elem_size(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(),
                                                             sizeof(_Value_type), _Dst_shape->_Get_view_extent(), _Dst->_Get_elem_size()));

        if (_Dst->_Is_host_accessible(_Write_access))
        {
            // The destination buffer is accessible on the host. Map the _Dst buffer
            _Event _Ev = _Dst->_Map_buffer_async(_Write_access);
            _Buffer_ptr _PDestBuf = _Dst;
            return _Ev._Add_continuation(std::function<_Event()>([_SrcFirst, _Src_shape, _PDestBuf, _Dst_shape]() mutable -> _Event {
                return _Copy_impl_iter(_SrcFirst, _Src_shape, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_PDestBuf->_Get_host_ptr())),
                                       _Create_reinterpreted_shape(_Dst_shape, _PDestBuf->_Get_elem_size(), sizeof(_Value_type)));
            }));
        }
        else
        {
            // The destination buffer is not accessible on the host. Create a temporary
            // staging buffer on the destination buffer's accelerator_view
            _Buffer_ptr _PTempStagingBuf = _Buffer::_Create_stage_buffer(_Dst->_Get_accelerator_view(), accelerator(accelerator::cpu_accelerator).default_view,
                                                                         _Src_shape->_Get_view_size(), sizeof(_Value_type), true /* _Is_temp */);

            _PTempStagingBuf->_Map_buffer(_Write_access, true /* _Wait */);
            _Value_type *_Dst_ptr = reinterpret_cast<_Value_type*>(_PTempStagingBuf->_Get_host_ptr());
            _Event _Ev = _Copy_impl_iter(_SrcFirst, _Src_shape, stdext::make_unchecked_array_iterator(_Dst_ptr), _Src_shape);

            // Now copy from the staging buffer to the destination buffer
            _Buffer_ptr _PDestBuf = _Dst;
            return _Ev._Add_continuation(std::function<_Event()>([_PTempStagingBuf, _Src_shape, _PDestBuf, _Dst_shape]() mutable -> _Event {
                return _Copy_impl(_PTempStagingBuf, _Src_shape, _PDestBuf, _Dst_shape);
            }));
        }
    }

    template <typename OutputIterator, typename _Value_type>
    inline _Event _Copy_impl(_In_ _Buffer *_Src, _View_shape_ptr _Src_shape, OutputIterator _DestIter, _View_shape_ptr _Dst_shape)
    {
        _ASSERTE(_Src != NULL);
        _ASSERTE(_Src_shape != NULL);
        _ASSERTE(_Dst_shape != NULL);

        if (_Src_shape->_Is_projection()) {
            _Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
        }

        if (_Dst_shape->_Is_projection()) {
            _Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
        }

        _ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());

        _ASSERTE(_View_shape::_Compare_extent_with_elem_size(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(),
                                                             _Src->_Get_elem_size(), _Dst_shape->_Get_view_extent(), sizeof(_Value_type)));

        if (_Src->_Is_host_accessible(_Read_access))
        {
            // The source buffer is accessible on the host. Map the _Src buffer
            _Event _Ev = _Src->_Map_buffer_async(_Read_access);

            _Buffer_ptr _PSrcBuf = _Src;
            return _Ev._Add_continuation(std::function<_Event()>([_PSrcBuf, _Src_shape, _DestIter, _Dst_shape]() mutable -> _Event {
                return _Copy_impl_iter(reinterpret_cast<_Value_type*>(_PSrcBuf->_Get_host_ptr()),
                                       _Create_reinterpreted_shape(_Src_shape, _PSrcBuf->_Get_elem_size(), sizeof(_Value_type)),
                                       _DestIter, _Dst_shape);
            }));
        }
        else
        {
            // The source buffer is not accessible on the host. Create a temporary
            // staging buffer on the source buffer's accelerator_view and initiate a copy
            // from the source buffer to the temporary staging buffer
            _Buffer_ptr _PTempStagingBuf = _Buffer::_Create_stage_buffer(_Src->_Get_accelerator_view(), accelerator(accelerator::cpu_accelerator).default_view,
                                                                         _Dst_shape->_Get_view_size(), sizeof(_Value_type), true);

            _Event _Ev = _Src->_Copy_to_async(_PTempStagingBuf, _Src_shape, _Dst_shape);
            return _Ev._Add_continuation(std::function<_Event()>([_PTempStagingBuf, _Dst_shape, _DestIter]() mutable -> _Event {
                return _Copy_impl_iter(reinterpret_cast<_Value_type*>(_PTempStagingBuf->_Get_host_ptr()),
                                       _Dst_shape, _DestIter, _Dst_shape);
            }));
        }
    }

    // Iterator based structured copy function
    template<typename _InputIterator, typename _OutputIterator>
    inline _Event _Copy_impl_iter(_InputIterator _SrcIter, _View_shape_ptr _Src_shape,
                                  _OutputIterator _DstIter, _View_shape_ptr _Dst_shape)
    {
        if (_Src_shape->_Is_projection()) {
            _Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
        }

        if (_Dst_shape->_Is_projection()) {
            _Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
        }

        _ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());
        _ASSERTE(_View_shape::_Compare_extent(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(), _Dst_shape->_Get_view_extent()));

        // If both the _Src_shape and _Dst_shape are linear we can be more efficient
        unsigned int _Src_linear_offset, _Src_linear_size, _Dst_linear_offset, _Dst_linear_size;
        if (_Src_shape->_Is_view_linear(_Src_linear_offset, _Src_linear_size) &&
            _Dst_shape->_Is_view_linear(_Dst_linear_offset, _Dst_linear_size))
        {
            _ASSERTE(_Src_linear_size == _Dst_linear_size);

            // These iterators might not be contiguous, therefore we use std::advance
            std::advance(_SrcIter, _Src_linear_offset);
            auto _SrcLast = _SrcIter;
            std::advance(_SrcLast, _Src_linear_size);
            std::advance(_DstIter, _Dst_linear_offset);

            return _Copy_impl_iter(_SrcIter, _SrcLast, _DstIter);
        }

        std::vector<unsigned int> _Src_extent(_Src_shape->_Get_rank());
        std::vector<unsigned int> _Src_offset(_Src_shape->_Get_rank());
        std::vector<unsigned int> _Dst_extent(_Dst_shape->_Get_rank());
        std::vector<unsigned int> _Dst_offset(_Dst_shape->_Get_rank());
        std::vector<unsigned int> _Copy_extent(_Src_shape->_Get_rank());

        for (size_t i = 0; i < _Src_shape->_Get_rank(); ++i) {
            _Src_extent[i] = _Src_shape->_Get_base_extent()[i];
            _Src_offset[i] = _Src_shape->_Get_view_offset()[i];
            _Dst_extent[i] = _Dst_shape->_Get_base_extent()[i];
            _Dst_offset[i] = _Dst_shape->_Get_view_offset()[i];
            _Copy_extent[i] = _Src_shape->_Get_view_extent()[i];
        }

        _Array_copy_desc _Desc(
            _Src_shape->_Get_rank(),
            _Src_shape->_Get_linear_offset(),
            _Src_extent.data(),
            _Src_offset.data(),
            _Dst_shape->_Get_linear_offset(),
            _Dst_extent.data(),
            _Dst_offset.data(),
            _Copy_extent.data());

        // Note: Capturing the shape pointers in the lambda would be incorrect; they are only
        // valid for setting up this call and might be deleted right after the call completes.
        HRESULT hr = _Recursive_array_copy(_Desc, 1, [_SrcIter, _DstIter](const _Array_copy_desc &_Reduced) -> HRESULT {

            auto _SrcFirst = _SrcIter;
            auto _DstFirst = _DstIter;
    
            std::advance(_DstFirst, _Reduced._Dst_linear_offset + _Reduced._Dst_copy_offset[0]);
            std::advance(_SrcFirst, _Reduced._Src_linear_offset + _Reduced._Src_copy_offset[0]);
            auto _SrcLast = _SrcFirst;
            std::advance(_SrcLast, _Reduced._Copy_extents[0]);

            std::copy(_SrcFirst, _SrcLast, _DstFirst);

            return S_OK;
        });

        if (FAILED(hr)) {
            throw Concurrency::runtime_exception("Failed to copy between buffers", E_FAIL);
        }

        return _Event();
    }

    // A ubiquitous buffer that provides access to the underlying data 
    // on any accelerator_view
    class _Ubiquitous_buffer : public _Reference_counter
    {
        friend _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);
        friend _AMPIMP accelerator_view __cdecl _Select_copy_src_accelerator_view(_In_ _View_key _Src_view_key, const accelerator_view &_Dest_accelerator_view);
        friend struct _DPC_call_handle;

    public:

        _AMPIMP static _Ret_ _Ubiquitous_buffer * __cdecl _Create_ubiquitous_buffer(size_t _Num_elems, size_t _Elem_size);

        _AMPIMP static _Ret_ _Ubiquitous_buffer * __cdecl _Create_ubiquitous_buffer(_Buffer_ptr _Master_buffer);

        // Register a new view on top of this _Ubiquitous_buffer
        _AMPIMP void _Register_view(_In_ _View_key _Key, accelerator_view _Cpu_av, _View_shape_ptr _Shape);

        // Register a copy of an existing view registered with this _Ubiquitous_buffer
        _AMPIMP void _Register_view_copy(_In_ _View_key _New_view_key, _In_ _View_key _Existing_view_key);

        // Unregister a view currently registered with this _Ubiquitous_buffer
        _AMPIMP void _Unregister_view(_In_ _View_key _Key);

        // Obtain a specified mode of access to the specified view on the specified target
        // accelerator_view. This method can also determine the amount of data copying expected
        // to happen as part of this _Get_access request without actually performing the copies
        // or state updates in the _Ubiquitous_buffer. This is used to report, through the
        // C++ AMP ETW events, the implicit data copies that happen when accessing array_views
        _AMPIMP _Event _Get_access_async(_In_ _View_key _Key, _Accelerator_view_impl_ptr _Av_view_impl_ptr,
                                         _Access_mode _Mode, _Buffer_ptr &_Buf_ptr,
                                         _Inout_opt_ ULONGLONG *_Sync_size = nullptr);

        // Discard the content underlying this view
        _AMPIMP void _Discard(_In_ _View_key _Key);

        // This method does not synchronize the copies. It should not be used for getting
        // data access, but only to query the underlying buffer's properties
        _AMPIMP _Buffer_ptr _Get_master_buffer() const;

        _AMPIMP accelerator_view _Get_master_accelerator_view() const;

        _AMPIMP _View_shape_ptr _Get_view_shape(_In_ _View_key _Key);

        _Ret_ _Accelerator_view_impl* _Get_master_accelerator_view_impl() const
        {
            return _M_master_av;
        }

        size_t _Get_master_buffer_elem_size() const
        {
            return _M_master_buffer_elem_size;
        }

        size_t _Get_master_buffer_num_elems() const
        {
            return _M_master_buffer_num_elems;
        }

        bool _Has_data_source() const
        {
            return _M_has_data_source;
        }

    private:

        // The _Ubiquitous_buffer constructors are private to force construction through the static 
        // _Create_ubiquitous_buffer method to ensure the object is allocated in the runtime
        _Ubiquitous_buffer(size_t _Num_elems, size_t _Elem_size);
        _Ubiquitous_buffer(_In_ _Buffer* _Master_buffer);

        // Private destructor to force deletion through _Release
        ~_Ubiquitous_buffer();

        // No default constructor, copy constructor, or assignment operator
        _Ubiquitous_buffer();
        _Ubiquitous_buffer(const _Ubiquitous_buffer &rhs);
        _Ubiquitous_buffer &operator=(const _Ubiquitous_buffer &rhs);
        
        // Helper methods

        // Get access to a buffer on a specified accelerator for a specified pre-registered view.
        // If the _Sync_size parameter is not null, the function also calculates the number of
        // bytes that need to be synchronized to obtain the desired access.
        _AMPIMP _Event _Get_access_async(_In_ _View_key _Key, accelerator_view _Av, _Access_mode _Mode,
                                         _Buffer_ptr &_Buf_ptr, _Inout_opt_ ULONGLONG *_Sync_size = NULL);

        // Commit a view to the master buffer if needed. When the _Sync_size parameter is non-null
        // this method just returns the amount of data to be copied as part of the commit, without 
        // actually performing the commit
        _Event _Commit_view_async(_In_ _View_info *_Info, _Inout_ ULONGLONG *_Sync_size = nullptr);

        // Get the _Buffer_ptr corresponding to a specified accelerator_view. When the 
        // _Create parameter is true, it creates a new _Buffer if one does not already exist
        // for that accelerator_view
        _Ret_ _Buffer* _Get_buffer(_In_ _Accelerator_view_impl* _Av, bool _Create = true);

        // Sets a new access mode for the specified view
        void _Set_new_access_mode(_Inout_ _View_info *_Info, _Access_mode _New_mode);

        // Unsets the discard flag from the specified view and all other 
        // overlapping views
        void _Unset_discard_flag(_Inout_ _View_info *_Info);

        // Determines whether the data underlying the specified view has been discarded
        // based on whether a subsuming view has the discard flag set.
        bool _Should_discard(const _View_info *_Info) const;

        // Determines whether this view has exclusive data, i.e. data that has not been
        // discarded, is not present on the master accelerator_view, and is not covered
        // by any other dirty view that subsumes this view

        // Based on the current state of overlapping views in the _Ubiquitous_buffer, determines
        // whether the specified view requires a data update on the target accelerator_view
        // to fulfill an access request
        bool _Requires_update_on_target_accelerator_view(const _View_info *_Info,
                                                         _Access_mode _Requested_mode,
                                                         _In_ _Accelerator_view_impl* _Target_acclerator_view) const;

        // This method iterates over all views in the specified commit list and flags a view
        // as "commit not needed" if it is subsumed by another view present in the commit list
        static void _Flag_redundant_commits(std::vector<std::pair<_View_info*, bool>> &_Commit_list);

        // This method returns the list of accelerator_views where the specified view already has
        // a valid cached copy of the data, so that getting read access there would not incur any
        // data movement. The _Can_access_anywhere output parameter indicates to the caller that
        // the specified view can be accessed on any accelerator_view without incurring any data
        // movement. This is true when there are no modified overlapping views that require
        // synchronization and the specified view has the discard_data flag set.
        // This method is used for determining the source accelerator_view for copy and p_f_e
        // operations involving array_views
        _Accelerator_view_unordered_set _Get_caching_info(_In_ _View_key _Key, _Out_opt_ bool *_Can_access_anywhere = NULL);

        _Accelerator_view_unordered_set _Get_caching_info_impl(_In_ _View_key _Key, _Out_opt_ bool *_Can_access_anywhere);

        _Ret_ _Accelerator_view_impl* _Determine_alternate_target_accelerator_view(_In_ _View_key _Key,
                                                                             _In_ _Accelerator_view_impl* _Original_av,
                                                                             _Access_mode _Mode);

    private:

        // Private data

        // The master accelerator_view for this _Ubiquitous_buffer
        // which is specified at construction time
        _Accelerator_view_impl_ptr _M_master_av;

        // The master _Buffer corresponding to this _Ubiquitous_buffer
        // which is specified at construction time
        _Buffer* _M_master_buffer;

        // The size of each element of the master buffer
        size_t _M_master_buffer_elem_size;

        // The number of elements in the master buffer
        size_t _M_master_buffer_num_elems;

        // Indicates if this ubiquitous buffer has an underlying data source
        bool _M_has_data_source;

        // A map of pre-created _Buffers corresponding to different 
        // accelerator_views where the _Ubiquitous_buffer has already been
        // accessed
        std::map<_Accelerator_view_impl_ptr, _Buffer_ptr> _M_buffer_map;
        
        // A mapping between all registered view keys in this _Ubiquitous_buffer
        // to their corresponding _View_info
        std::unordered_map<_View_key, _View_info*> _M_view_map;

        // Set of distinct views of this buffer. As multiple copies of the same
        // view may have been registered for this _Ubiquitous_buffer, this set
        // maintains the distinct views that actually matter for the caching
        // protocol. Also note that a view_info may no longer have any live registered
        // views, and hence be absent from _M_view_map, yet still exist here because it
        // has uncommitted data which needs to be considered as part of the cache
        // coherence protocol to prevent modifications underlying this view from being lost
        std::unordered_set<_View_info*> _M_view_info_set;

        // Critical section object to protect the cache directory
        Concurrency::critical_section _M_critical_section;
    };
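
    // Illustrative sketch of the intended life-cycle of a view on a _Ubiquitous_buffer
    // (internal runtime usage; _Key, _Cpu_av, _Av and _Shape stand for values produced
    // elsewhere in the runtime and are hypothetical placeholders here):
    //
    //     _Ubiquitous_buffer *_PUBuf = _Ubiquitous_buffer::_Create_ubiquitous_buffer(_Num_elems, _Elem_size);
    //     _PUBuf->_Register_view(_Key, _Cpu_av, _Shape);   // make the view known to the cache directory
    //
    //     _Buffer_ptr _PBuf;
    //     _Event _Ev = _Get_access_async(_Key, _Av, _Read_write_access, _PBuf);
    //     _Ev._Get();                                      // _PBuf now holds valid data on _Av
    //
    //     _PUBuf->_Unregister_view(_Key);                  // release the view when it goes away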

    // Class defines functions for interoperability with D3D
    class _D3D_interop
    {
    public:
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_buffer(_In_ _Buffer *_Buffer_ptr);
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_texture(_In_ _Texture *_Texture_ptr);
        _AMPIMP static _Ret_ void * __cdecl _Get_D3D_sampler_data_ptr(_In_ IUnknown *_D3D_sampler);
        _AMPIMP static void __cdecl _Release_D3D_sampler_data_ptr(_In_ void *_Sampler_data_ptr);
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_sampler(const Concurrency::accelerator_view &_Av, _In_ _Sampler *_Sampler_ptr);
    };

    inline
    _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr)
    {
        return _Key->_Get_buffer_ptr()->_Get_access_async(_Key->_Get_view_key(), _Av, _Mode, _Buf_ptr);
    }

    inline
    _Ret_ _View_shape* _Get_buffer_view_shape(const _Buffer_descriptor& _Descriptor)
    {
        return _Descriptor._Get_buffer_ptr()->_Get_view_shape(_Descriptor._Get_view_key());
    }

    inline
    bool _Is_cpu_accelerator(const accelerator& _Accl) 
    {
        return (_Accl.device_path == accelerator::cpu_accelerator);
    }

} // namespace Concurrency::details

/// <summary>
/// Exception thrown when an underlying DirectX call fails
/// due to the Windows timeout detection and recovery mechanism
/// </summary>
class accelerator_view_removed : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an accelerator_view_removed exception with a message and
    ///     a view removed reason code
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    /// <param name="_View_removed_reason">
    ///     HRESULT error code indicating the cause of removal of the accelerator_view
    /// </param>
    _AMPIMP explicit accelerator_view_removed(const char * _Message, HRESULT _View_removed_reason) throw();

    /// <summary>
    ///     Construct an accelerator_view_removed exception
    /// </summary>
    /// <param name="_View_removed_reason">
    ///     HRESULT error code indicating the cause of removal of the accelerator_view
    /// </param>
    _AMPIMP explicit accelerator_view_removed(HRESULT _View_removed_reason) throw();

    /// <summary>
    ///     Returns an HRESULT error code indicating the cause of the accelerator_view's removal
    /// </summary>
    /// <returns>
    ///     The HRESULT error code that indicates the cause of accelerator_view's removal
    /// </returns>
    _AMPIMP HRESULT get_view_removed_reason() const throw();

private:

    HRESULT _M_view_removed_reason_code;
}; // class accelerator_view_removed
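
// Illustrative usage sketch (hypothetical application code, not part of this header):
// a device removal triggered by Windows timeout detection and recovery surfaces as this
// exception, and get_view_removed_reason() yields the HRESULT describing the removal.
//
//     try {
//         run_my_kernels(av);   // hypothetical helper that launches parallel_for_each on av
//         av.wait();
//     }
//     catch (const Concurrency::accelerator_view_removed& ex) {
//         HRESULT reason = ex.get_view_removed_reason();
//         // Recreate the accelerator_view and retry, or report 'reason' to the user
//     }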

/// <summary>
///     Exception thrown when the runtime fails to launch a kernel
///     using the compute domain specified at the parallel_for_each call site.
/// </summary>
class invalid_compute_domain : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an invalid_compute_domain exception with a message
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    _AMPIMP explicit invalid_compute_domain(const char * _Message) throw();

    /// <summary>
    ///     Construct an invalid_compute_domain exception
    /// </summary>
    _AMPIMP invalid_compute_domain() throw();
}; // class invalid_compute_domain

/// <summary>
///     Exception thrown when an unsupported feature is used
/// </summary>
class unsupported_feature  : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an unsupported_feature exception with a message
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    _AMPIMP explicit unsupported_feature(const char * _Message) throw();

    /// <summary>
    ///     Construct an unsupported_feature exception
    /// </summary>
    _AMPIMP unsupported_feature() throw();
}; // class unsupported_feature

} // namespace Concurrency

// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// Compiler/Runtime Interface
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

#define HELPERAPI __cdecl

using namespace Concurrency::details;

extern "C" {

    // The following types store information about the resources required by the kernel.
    enum _Resource_kind
    {
        RESOURCE_BUFFER      = 0,
        RESOURCE_TEXTURE     = 1,
        RESOURCE_SAMPLER     = 2,
    };

    struct _Device_resource_info
    {
        _Resource_kind _M_resource_kind;  // buffer, texture, or sampler

        void * _M_desc;        // Pointer to the _Buffer_descriptor/_Texture_descriptor/_Sampler_descriptor instance
                               // which underlies the device resource

        _Access_mode _M_formal_access_mode;         // scalar: read-only
                                                    // const scalar ref: read-only
                                                    // scalar ref: ReadWrite
                                                    // array: ReadWrite
                                                    // const array: ReadOnly
        size_t _M_actual_arg_num;

        BOOL _Is_buffer() const
        {
            return (_M_resource_kind == RESOURCE_BUFFER);
        }

        BOOL _Is_texture() const
        {
            return (_M_resource_kind == RESOURCE_TEXTURE);
        }

        BOOL _Is_sampler() const
        {
            return (_M_resource_kind == RESOURCE_SAMPLER);
        }

        _Ret_ _Buffer_descriptor * _Get_buffer_desc() const
        {
            _ASSERTE(_Is_buffer());
            return reinterpret_cast<_Buffer_descriptor *>(_M_desc);
        }

        _Ret_ _Texture_descriptor * _Get_texture_desc() const
        {
            _ASSERTE(_Is_texture());
            return reinterpret_cast<_Texture_descriptor *>(_M_desc);
        }

        _Ret_ _Sampler_descriptor * _Get_sampler_desc() const
        {
            _ASSERTE(_Is_sampler());
            return reinterpret_cast<_Sampler_descriptor *>(_M_desc);
        }

        _Ret_ void * _Get_resource_ptr() const
        {
            if (_Is_buffer()) 
            {
                _Ubiquitous_buffer * _Tmp = _Get_buffer_desc()->_Get_buffer_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
            else if (_Is_texture())
            {
                _Texture * _Tmp = _Get_texture_desc()->_Get_texture_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
            else 
            {
                _ASSERTE(_Is_sampler());
                _Sampler * _Tmp = _Get_sampler_desc()->_Get_sampler_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
        }
    };

    // This structure is used for storing information about the const buffers
    struct _Device_const_buffer_info
    {
        void * _M_data;                             // Pointer to the host data to initialize the
                                                    // constant buffer with

        size_t _M_const_buf_size;                   // Size of the const buffer in bytes

        unsigned int _M_is_debug_data;              // Is this debug data which will be 
                                                    // initialized by the runtime. 0 (false), 1 (true)
    };
}

namespace Concurrency
{
namespace details
{
    enum _DPC_kernel_func_kind
    {
        NON_ALIASED_SHADER  = 0, // slot 0
        ALIASED_SHADER      = 1,  // slot 1
        NUM_SHADER_VERSIONS = 2
    };

    struct _DPC_call_handle
    {
        _Accelerator_view_impl *_M_rv;
        bool _M_is_explicit_target_acclview;

        // Info about the kernel function arguments
        _Device_resource_info * _M_device_resource_info;
        size_t _M_num_resources;
        size_t _M_num_writable_buffers;
        size_t _M_num_samplers;

        // Info about the host buffer created corresponding to the const buffer
        _Device_const_buffer_info * _M_const_buffer_info;
        size_t _M_num_const_buffers;

        bool _M_RW_aliasing;

        // Kernel funcs
        _DPC_shader_blob * _M_shader_blobs[NUM_SHADER_VERSIONS];

        // Compute domain info
        int _M_is_flat_model;
        unsigned int _M_compute_rank;
        unsigned int * _M_grid_extents;

        // Kernel dispatch info
        unsigned int _M_groupCountX;
        unsigned int _M_groupCountY;
        unsigned int _M_groupCountZ;

        // The shape of the group
        unsigned int _M_groupExtentX;
        unsigned int _M_groupExtentY;
        unsigned int _M_groupExtentZ;

        _DPC_call_handle(const accelerator_view &_Accelerator_view)
        {
            if (!_Accelerator_view.is_auto_selection) {
                _M_rv = _Get_accelerator_view_impl_ptr(_Accelerator_view);
            }
            else {
                _M_rv = NULL;
            }

            _M_is_explicit_target_acclview = false;
            if (_M_rv != NULL) {
                _M_is_explicit_target_acclview = true;
            }

            _M_device_resource_info = NULL;        
            _M_num_resources = 0;
            _M_num_writable_buffers = 0;
            _M_num_samplers = 0;

            _M_const_buffer_info = NULL;
            _M_num_const_buffers = 0;

            _M_RW_aliasing = false;

            for (size_t _I = 0; _I < NUM_SHADER_VERSIONS; _I++) 
            {
                _M_shader_blobs[_I] = NULL;
            }

            _M_is_flat_model = 0;
            _M_compute_rank = 0;
            _M_grid_extents = NULL;

            _M_groupCountX = 0;
            _M_groupCountY = 0;
            _M_groupCountZ = 0;

            _M_groupExtentX = 0;
            _M_groupExtentY = 0;
            _M_groupExtentZ = 0;            
        }

        ~_DPC_call_handle()
        {
            if (_M_grid_extents) {
                delete [] _M_grid_extents;
            }
        }

        bool _Is_buffer_aliased(_In_ void *_Buffer_ptr)
        {
            return ((_M_aliased_buffer_set != nullptr) && (_M_aliased_buffer_set->find(_Buffer_ptr) != _M_aliased_buffer_set->end()));
        }

        bool _Is_buffer_unaccessed(size_t _Buffer_idx)
        {
            return ((_M_is_device_buffer_unaccessed != nullptr) && _M_is_device_buffer_unaccessed->operator[](_Buffer_idx));
        }

        void _Set_buffer_unaccessed(size_t _Buffer_idx)
        {
            if (_M_is_device_buffer_unaccessed == nullptr) {
                _M_is_device_buffer_unaccessed = std::unique_ptr<std::vector<bool>>(new std::vector<bool>(_M_num_resources, false));
            }

            _M_is_device_buffer_unaccessed->operator[](_Buffer_idx) = true;
        }

        const int* _Get_redirect_indices() const
        {
            if (!_M_RW_aliasing) {
                return nullptr;
            }

            _ASSERTE(_M_Redirect_indices != nullptr);

            return _M_Redirect_indices->data();
        }

        void _Check_buffer_aliasing();
        void _Update_buffer_rw_property();
        void _Setup_aliasing_redirection_indices();
        void _Select_accelerator_view();
        void _Verify_buffers_against_accelerator_view();

    private:
        std::unique_ptr<std::unordered_set<void*>> _M_aliased_buffer_set;
        std::unique_ptr<std::vector<bool>> _M_is_device_buffer_unaccessed;
        // Info about read-write aliasing 
        std::unique_ptr<std::vector<int>> _M_Redirect_indices;
    };

    // This structure is used for passing the scheduling info to the parallel_for_each;
    // it is handed back to the compiler-runtime interface methods by the front-end
    struct _Host_Scheduling_info
    {
        // The accelerator view to invoke a parallel_for_each on
        accelerator_view _M_accelerator_view; 
    };

} // namespace Concurrency::details


/// <summary>
///     Uninitializes the C++ AMP runtime. It is legal to
///     call this function multiple times during an application's
///     lifetime. Calling any C++ AMP API after calling this function
///     will reinitialize the C++ AMP runtime. Note that it is illegal
///     to use C++ AMP objects across calls to this function; doing
///     so will result in undefined behavior. Also, concurrently calling
///     this function and any other AMP API is illegal and will result
///     in undefined behavior.
/// </summary>
_AMPIMP void __cdecl amp_uninitialize();
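
// Illustrative usage sketch (hypothetical application code): destroy all C++ AMP objects
// first, then uninitialize the runtime, e.g. right before process shutdown.
//
//     void shutdown_amp()   // hypothetical helper
//     {
//         // ... ensure no C++ AMP objects (arrays, array_views, accelerator_views, ...) are still alive ...
//         Concurrency::amp_uninitialize();
//     }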

} // namespace Concurrency

extern "C" {

    // Return a compiler helper handle.
    _AMPIMP _Ret_ _DPC_call_handle * HELPERAPI __dpc_create_call_handle(_In_ _Host_Scheduling_info *_Sch_info) throw(...);

    // Destroy the call handle
    _AMPIMP void HELPERAPI __dpc_release_call_handle(_In_ _DPC_call_handle * _Handle) throw(...);

    _AMPIMP void HELPERAPI __dpc_set_device_resource_info(_In_ _DPC_call_handle * _Handle, _In_ _Device_resource_info * _DeviceResourceInfo, size_t _NumResources) throw(...);

    // Set const buffer info.
    _AMPIMP void HELPERAPI __dpc_set_const_buffer_info(_In_ _DPC_call_handle * _Handle, _In_ _Device_const_buffer_info * _DeviceConstBufferInfo, size_t _NumConstBuffers) throw(...);

    // Set the kernel shader info
    _AMPIMP void HELPERAPI __dpc_set_kernel_shader_info(_In_ _DPC_call_handle * _Handle,
                                                        _Inout_ void ** _ShaderBlobs) throw(...);
    // Set kernel dispatch info
    _AMPIMP void HELPERAPI __dpc_set_kernel_dispatch_info(_In_ _DPC_call_handle * _Handle,
                                                         unsigned int _ComputeRank,
                                                         _In_ int * _Extents,
                                                         unsigned int _GroupRank,
                                                         const unsigned int * _GroupExtents,
                                                         unsigned int & _GroupCountX,
                                                         unsigned int & _GroupCountY,
                                                         unsigned int & _GroupCountZ) throw(...);

    // Dispatch the kernel
    _AMPIMP void HELPERAPI __dpc_dispatch_kernel(_In_ _DPC_call_handle * _Handle) throw(...);
    
#ifdef _DEBUG
    // Dispatch the kernel passed as a HLSL source level shader 
    // This function is to be used only for testing and debugging purposes
    _AMPIMP void HELPERAPI __dpc_dispatch_kernel_test(_In_ _DPC_call_handle * _Handle, _In_ WCHAR* szFileName, LPCSTR szEntryPoint) throw(...);
#endif 
}
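
// Illustrative sketch of the call sequence the compiler-generated stub is expected to follow
// when lowering a parallel_for_each (the local variable names are hypothetical placeholders;
// this sequence is emitted by the compiler, not written by user code):
//
//     _Host_Scheduling_info _Sch_info = { _Accl_view };
//     _DPC_call_handle *_Handle = __dpc_create_call_handle(&_Sch_info);
//     __dpc_set_device_resource_info(_Handle, _Resource_info, _Num_resources);
//     __dpc_set_const_buffer_info(_Handle, _Const_buffer_info, _Num_const_buffers);
//     __dpc_set_kernel_shader_info(_Handle, _Shader_blobs);
//     __dpc_set_kernel_dispatch_info(_Handle, _Compute_rank, _Extents,
//                                    _Group_rank, _Group_extents,
//                                    _GroupCountX, _GroupCountY, _GroupCountZ);
//     __dpc_dispatch_kernel(_Handle);
//     __dpc_release_call_handle(_Handle);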

// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// C++ AMP ETW Provider
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

namespace Concurrency
{
namespace details
{

// Thread-safe factory method for _Amp_runtime_trace object
_AMPIMP _Ret_ _Amp_runtime_trace* __cdecl _Get_amp_trace();

// Class that gathers C++ AMP diagnostic information and triggers events
class _Amp_runtime_trace
{

// Called by factory to create single instance of _Amp_runtime_trace type
friend BOOL CALLBACK _Init_amp_runtime_trace(PINIT_ONCE _Init_once, PVOID _Param, _Inout_ PVOID *_Context);

public:
    // Destructor for _Amp_runtime_trace, called at program termination
    _AMPIMP ~_Amp_runtime_trace();

    // The end event is triggered by multiple other events, such as StartComputeEvent, to show exactly when a given activity completed
    _AMPIMP void _Write_end_event(ULONG _Span_id);

    // Add accelerator configuration information
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    void _Add_accelerator_config_event(PVOID _Accelerator_id, LPCWSTR _Device_path, LPCWSTR _Device_description);

    // Used by callback function, to write all configuration data when new session is detected
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    void _Write_all_accelerator_config_events();

    // Started accelerator_view::wait operation
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    ULONG _Start_accelerator_view_wait_event(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

    // Launched accelerator_view::flush operation
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    void _Launch_flush_event(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

    // Launched accelerator_view::create_marker operation
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    ULONG _Launch_marker(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

    // Below is a set of helpers that take the various types available at the event injection point and extract all necessary data
    _AMPIMP ULONG _Start_parallel_for_each_event_helper(_In_ _DPC_call_handle *_Handle);

    // This helper wraps the functor with wait start and wait end events
    inline concurrency::completion_future _Start_async_op_wait_event_helper(ULONG _Async_op_id, _Event _Ev)
    {
        std::shared_future<void> retFuture;
        concurrency::task_completion_event<void> retTaskCompletionEvent;

        // Create a std::shared_future by creating a deferred task through std::async that waits for
        // the event _Ev to finish, wrapping the functor with wait start and wait end events
        retFuture = std::async(std::launch::deferred, [=]() mutable {
            try 
            {
                if (_Async_op_id == _Amp_runtime_trace::_M_event_disabled)
                {
                    _Ev._Get();
                }
                else
                {
                    auto _Span_id = details::_Get_amp_trace()->_Start_async_op_wait_event(_Async_op_id);
                    _Ev._Get();
                    details::_Get_amp_trace()->_Write_end_event(_Span_id);
                }
            }
            catch(...) 
            {
                // If an exception is encountered when executing the asynchronous operation
                // we should set the exception on the retTaskCompletionEvent so that it is
                // appropriately cancelled and the exception is propagated to continuations
                retTaskCompletionEvent.set_exception(std::current_exception());
                throw;
            }

            retTaskCompletionEvent.set();
        });

        // Register the async event with the runtime asynchronous events manager
        _Register_async_event(_Ev, retFuture);

        // Issue a continuation just to swallow any exceptions that are encountered during the
        // async operation and are never observed by the user, or are observed only through the
        // shared_future and not through the task
        concurrency::task<void> retTask(retTaskCompletionEvent);
        retTask.then([](concurrency::task<void> _Task) {
            try {
                _Task.get();
            }
            catch(...) {
            }
        });

        return Concurrency::completion_future(retFuture, retTask);
    }

    _AMPIMP ULONG _Start_array_view_synchronize_event_helper(const _Buffer_descriptor &_Buff_desc);
    _AMPIMP ULONG _Launch_array_view_synchronize_event_helper(const _Buffer_descriptor &_Buff_desc);
    
    // Helpers for buffers (array, array_view)
    _AMPIMP ULONG _Start_copy_event_helper(const _Buffer_descriptor &_Src, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(nullptr_t, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(const _Buffer_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Buffer_descriptor &_Src, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(nullptr_t, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Buffer_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    
    // Helper for textures
    _AMPIMP ULONG _Start_copy_event_helper(const _Texture_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(nullptr_t, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(const _Texture_descriptor &_Src, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Texture_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(nullptr_t, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Texture_descriptor &_Src, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);

    void _Enable_provider(bool _Enable = true);

private:
    // Private constructor. This type is created by factory method
    _Amp_runtime_trace(PVOID _Callback_function, _In_ _Trace *_Trace);

    // Disallow copy construction 
    _Amp_runtime_trace(const _Amp_runtime_trace&);

    // Disallow assignment operator
    _Amp_runtime_trace& operator=(const _Amp_runtime_trace&);
    
    // Used internally to write configuration events
    void _Write_accelerator_config_event(const std::tuple<PVOID, LPCWSTR, LPCWSTR> &_ConfigTuple);

    // Event triggered when computation is scheduled
    ULONG _Start_parallel_for_each_event(
        PVOID _Accelerator_id, 
        PVOID _Accelerator_view_id, 
        BOOL _Is_tiled_explicitly, 
        ULONGLONG _Num_of_tiles, 
        ULONG _Num_of_threads_per_tile, 
        BOOL _Is_aliased, 
        ULONG _Num_read_only_resources, 
        ULONG _Num_read_write_resources, 
        ULONGLONG _Size_of_all_resources, 
        ULONG _Size_of_const_data, 
        ULONGLONG _Size_of_data_for_copy);

    // Synchronous copy operation has started
    ULONG _Start_copy_event(
        PVOID _Src_accelerator_id, 
        PVOID _Src_accelerator_view_id,
        PVOID _Dst_accelerator_id, 
        PVOID _Dst_accelerator_view_id,
        ULONGLONG _Num_bytes_for_copy,
        BOOL _Is_src_staging,
        BOOL _Is_dst_staging);

    // Asynchronous copy operation has been launched
    ULONG _Launch_async_copy_event(
        PVOID _Src_accelerator_id, 
        PVOID _Src_accelerator_view_id,
        PVOID _Dst_accelerator_id, 
        PVOID _Dst_accelerator_view_id,
        ULONGLONG _Num_bytes_for_copy,
        BOOL _Is_src_staging,
        BOOL _Is_dst_staging);

    // Started waiting for asynchronous operation to complete
    _AMPIMP ULONG _Start_async_op_wait_event(ULONG _Async_op_id);

    // Started array_view::synchronize operation
    ULONG _Start_array_view_synchronize_event(ULONGLONG _Num_bytes_to_synchronize);

    // Async array_view::synchronize operation has been launched
    ULONG _Launch_array_view_synchronize_event(ULONGLONG _Num_bytes_to_synchronize);

    // Helper function that extracts information from buffer descriptor
    std::tuple<PVOID, PVOID, BOOL> _Get_resource_diagnostic_info(const _Buffer_descriptor &_Buff_desc, accelerator_view _Accl_view) const;

    // Helper function that extracts information from texture descriptor
    std::tuple<PVOID, PVOID, BOOL> _Get_resource_diagnostic_info(const _Texture_descriptor &_Tex_desc) const;

    // Generates unique identifiers for span_id and async_op_id
    ULONG _Get_unique_identifier();

    // Critical section object used by the callback function to synchronize the following situations:
    // a) multiple sessions have started at the same time
    // b) the C++ AMP runtime factory adds a new accelerator config event to the collection
    Concurrency::critical_section _M_critical_section;

    // Collection of all configuration events at the time of C++ AMP Runtime initialization
    std::vector<std::tuple<PVOID, LPCWSTR, LPCWSTR>> _M_accelerator_configs;

    // Unique counter for span id and async operation id
    volatile ULONG _M_counter;

    // Type that implements the ITrace interface and writes events, e.g. ETW events
    _Trace* _M_trace_ptr;

    // Special value that we return to chain events if provider is disabled
    static const ULONG _M_event_disabled = 0;
};

// Helper function to query the number of mipmap levels from texture object
inline unsigned int _Get_mipmap_levels(const _Texture *_Tex)
{
    _ASSERTE(_Tex);
    return _Tex->_Get_mip_levels();
}

} // namespace Concurrency::details
} // namespace Concurrency

namespace concurrency = Concurrency;

#pragma pack(pop)