/***
* ==++==
*
* Copyright (c) Microsoft Corporation.  All rights reserved.
*
* ==--==
* =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
*
* amprt.h
*
* Define the C++ interfaces exported by the C++ AMP runtime
*
* =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
****/
#pragma once

#if !(defined (_M_X64) || defined (_M_IX86) || defined (_M_ARM) || defined (_M_ARM64) )
    #error ERROR: C++ AMP runtime is supported only on X64, X86, ARM, and ARM64 architectures.
#endif  

#if defined (_M_CEE)
    #error ERROR: C++ AMP runtime is not supported when compiling /clr.
#endif  

#ifndef __cplusplus
    #error ERROR: C++ AMP runtime is supported only for C++.
#endif  

#if !defined(_CXXAMP)

#if defined(_DEBUG)
    #pragma comment(lib, "vcampd")
#else   // _DEBUG
    #pragma comment(lib, "vcamp")
#endif  // _DEBUG

#endif // _CXXAMP

#if !defined(_CXXAMP)

#define __GPU      restrict(amp,cpu)
#define __GPU_ONLY restrict(amp)
#define __CPU_ONLY

#else

#define __GPU
#define __GPU_ONLY
#define __CPU_ONLY

#endif // _CXXAMP
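
// Illustrative note (not part of the interface): when _CXXAMP is not defined, the macros
// above expand to C++ AMP restriction specifiers. For example, a hypothetical declaration
//
//     void _My_helper(int _X) __GPU;
//
// expands to
//
//     void _My_helper(int _X) restrict(amp,cpu);
//
// while __GPU_ONLY yields restrict(amp) and __CPU_ONLY adds no restriction specifier.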

#include <exception>
#include <unknwn.h>
#include <crtdbg.h>
#include <string>
#include <vector>

#if defined(_CXXAMP)
#include <strsafe.h>
#endif // _CXXAMP

#include <future>
#include <functional>
#include <map>
#include <unordered_map>
#include <set>
#include <unordered_set>
#include <concrt.h>
#include <type_traits>

#if !defined(_AMPIMP)
#define _AMPIMP     __declspec(dllimport)
#endif

#pragma pack(push,8)

// Part of runtime-compiler interface
extern "C" 
{
    // Access mode of fields
    enum _Access_mode
    {
        _No_access = 0,
        _Read_access = (1 << 0),
        _Write_access = (1 << 1),
        _Is_array_mode = (1 << 30),
        _Read_write_access = _Read_access | _Write_access,
    };
}

namespace Concurrency
{
    /// <summary>
    ///     Enumeration type used to denote the various types of access to data.
    /// </summary>
    enum access_type
    {
        access_type_none = 0,
        access_type_read = (1 << 0),
        access_type_write = (1 << 1),
        access_type_read_write = access_type_read | access_type_write,
        access_type_auto = (1 << 31),
    };

// Forward declarations
class accelerator_view;
class accelerator;

namespace details
{
    const size_t ERROR_MSG_BUFFER_SIZE = 1024;

    //  A reference counter to be used as the base class for all reference counted types.
    class _Reference_counter
    {
    public:

        //  Constructor.
        _Reference_counter()  : _M_rc(0) {}

        //  Destructor.
        virtual ~_Reference_counter() {}

        // Add a reference. 
        // Thread-safe.
        size_t _Add_reference()
        {
            return InterlockedIncrement(reinterpret_cast<LONG volatile*>(&_M_rc));
        }

        // Remove a reference. 
        // Thread-safe.
        size_t _Remove_reference()
        {
            _ASSERTE(_M_rc > 0);

            size_t refCount = InterlockedDecrement(reinterpret_cast<LONG volatile*>(&_M_rc));

            if (refCount == 0)
                this->_Release();

            return refCount;
        }

        // Release the counter
        _AMPIMP void _Release();

        // Return the reference count value
        size_t _Get_reference_count()
        {
            return _M_rc;
        }

    private:
        size_t _M_rc;
    };

    // A smart pointer to a reference counted object
    // T must be a type derived from _Reference_counter
    template <class T>
    class _Reference_counted_obj_ptr
    {
    public:
    
        // Constructor
        _Reference_counted_obj_ptr(T* _Ptr = NULL) :  _M_obj_ptr(_Ptr)
        {
            _Init();
        }

        // Copy constructor
        _Reference_counted_obj_ptr(const _Reference_counted_obj_ptr &_Other) : _M_obj_ptr(_Other._M_obj_ptr)
        {
            _Init();
        }

        // Move constructor
        _Reference_counted_obj_ptr(_Reference_counted_obj_ptr &&_Other) : _M_obj_ptr(_Other._M_obj_ptr)
        {
            _Other._M_obj_ptr = nullptr;
            // No change to ref-count
        }

        // Destructor
        ~_Reference_counted_obj_ptr()
        {
            if (_M_obj_ptr != NULL) {
                _UnInitialize(_M_obj_ptr);
            }
        }

        // Assignment operator
        _Reference_counted_obj_ptr& operator=(const _Reference_counted_obj_ptr &_Other)
        {
            if (_M_obj_ptr != _Other._M_obj_ptr)
            {
                T *oldPtr = _M_obj_ptr;
                _M_obj_ptr = _Other._M_obj_ptr;
                _Init();
        
                if (oldPtr != NULL) {
                    _UnInitialize(oldPtr);
                }
            }
            return *this;
        }

        // Move-assignment operator
        _Reference_counted_obj_ptr& operator=(_Reference_counted_obj_ptr &&_Other)
        {
            if (_M_obj_ptr != _Other._M_obj_ptr)
            {
                T *oldPtr = _M_obj_ptr;
                _M_obj_ptr = _Other._M_obj_ptr;
                _Other._M_obj_ptr = nullptr;
                // No change to ref-count of the adopted pointer.

                if (oldPtr != nullptr)
                {
                    _UnInitialize(oldPtr);
                }
            }
            return *this;
        }

        _Ret_ T* operator->() const
        {
            return _M_obj_ptr;
        }

        T& operator*() const
        {
            return *_M_obj_ptr;
        }

        operator T*() const
        {
            return _M_obj_ptr;
        }

        _Ret_ T* _Get_ptr() const
        {
            return _M_obj_ptr;
        }

    private:
        T *_M_obj_ptr;

        void _Init()
        {
            if (_M_obj_ptr == NULL)
                return;

            reinterpret_cast<_Reference_counter*>(_M_obj_ptr)->_Add_reference();
        }

        static void _UnInitialize(_In_ T *_Obj_ptr)
        {
            reinterpret_cast<_Reference_counter*>(_Obj_ptr)->_Remove_reference();
        }
    };
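
    // Illustrative sketch (not part of this header): any type derived from _Reference_counter
    // can be managed by a _Reference_counted_obj_ptr, which adjusts the reference count on
    // construction, assignment, and destruction. _My_counted_type below is hypothetical.
    //
    //     class _My_counted_type : public _Reference_counter
    //     {
    //         // ...
    //     };
    //
    //     {
    //         _Reference_counted_obj_ptr<_My_counted_type> _P1(new _My_counted_type()); // ref count 1
    //         _Reference_counted_obj_ptr<_My_counted_type> _P2 = _P1;                   // ref count 2
    //     } // both smart pointers are destroyed; the count drops to 0 and _Release() is invoked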

    // Forward declarations
    class _Trace;
    class _Amp_runtime_trace;
    class _Buffer;
    class _Texture;
    class _Sampler;
    class _Ubiquitous_buffer;
    class _D3D_interop;
    class _Accelerator_view_impl;
    class _CPU_accelerator_view_impl;
    class _D3D_accelerator_view_impl;
    class _Accelerator_impl;
    class _Event_impl;
    class _DPC_runtime_factory;
    class _View_shape;
    struct _Buffer_descriptor;
    class _Accelerator_view_hasher;
    struct _DPC_shader_blob;
    struct _View_info;

    // The enum specifies the base type for short vector type.
    enum _Short_vector_base_type_id : unsigned int
    {
        _Uint_type = 0,
        _Int_type = 1,
        _Float_type = 2,
        _Unorm_type = 3,
        _Norm_type = 4,
        _Double_type = 5,
        _Invalid_type = 0xFFFFFFFF
    };

    typedef enum _Short_vector_base_type_id _Texture_base_type_id;

} // namespace Concurrency::details

typedef details::_Reference_counted_obj_ptr<details::_Accelerator_view_impl> _Accelerator_view_impl_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Accelerator_impl> _Accelerator_impl_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Buffer> _Buffer_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Texture> _Texture_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Sampler> _Sampler_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Ubiquitous_buffer> _Ubiquitous_buffer_ptr;
typedef details::_Reference_counted_obj_ptr<details::_Event_impl> _Event_impl_ptr;
typedef details::_Reference_counted_obj_ptr<details::_View_shape> _View_shape_ptr;

namespace details
{
    // The _Event class.
    class _Event
    {
        friend class _Buffer;
        friend class _Texture;
        friend class accelerator_view;
        friend class _D3D_accelerator_view_impl;

    public:
        /// <summary>
        ///     Constructor of the _Event.
        /// </summary>
        _AMPIMP _Event();

        /// <summary>
        ///     Destructor of the _Event.
        /// </summary>
        _AMPIMP ~_Event();

        /// <summary>
        ///     Copy constructor
        /// </summary>
        _AMPIMP _Event(const _Event & _Other);

        /// <summary>
        ///     Assignment operator
        /// </summary>
        _AMPIMP _Event & operator=(const _Event & _Other);

        /// <summary>
        ///     Poll whether the _Event has completed or not. Swallows any exceptions
        /// </summary>
        /// <returns>
        ///     true, if the _Event has completed, false otherwise
        /// </returns>
        _AMPIMP bool _Is_finished_nothrow(); 

        /// <summary>
        ///     Poll whether the _Event has completed or not, and throw any exceptions that occur
        /// </summary>
        /// <returns>
        ///     true, if the _Event has completed, false otherwise
        /// </returns>
        _AMPIMP bool _Is_finished(); 

        /// <summary>
        ///     Wait until the _Event completes and throw any exceptions that occur.
        /// </summary>
        _AMPIMP void _Get();

        /// <summary>
        ///     Tells if this is an empty event
        /// </summary>
        /// <returns>
        ///     true, if the _Event is empty
        ///     false, otherwise
        /// </returns>
        _AMPIMP bool _Is_empty() const;

        /// <summary>
        ///     Creates an event which is an ordered collection of this and _Ev
        /// </summary>
        /// <returns>
        ///     The composite event
        /// </returns>
        _AMPIMP _Event _Add_event(_Event _Ev);

        /// <summary>
        ///     Creates an event which is an ordered collection of this and a continuation task
        /// </summary>
        /// <returns>
        ///     The composite event
        /// </returns>
        _AMPIMP _Event _Add_continuation(const std::function<_Event __cdecl ()> &_Continuation_task);

        /// <summary>
        ///     Return true if the other _Event is same as this _Event; false otherwise
        /// </summary>
        _AMPIMP bool operator==(const _Event &_Other) const;

        /// <summary>
        ///     Return false if the other _Event is same as this _Event; true otherwise
        /// </summary>
        _AMPIMP bool operator!=(const _Event &_Other) const;

    private:

        // Private constructor
        _Event(_In_ _Event_impl* _Impl);

        _Event_impl_ptr _M_ptr_event_impl;
    };
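
    // Illustrative sketch (not part of this header): _Event objects compose into ordered
    // collections and can chain continuations. The events _Ev1 and _Ev2 below are hypothetical.
    //
    //     _Event _Combined = _Ev1._Add_event(_Ev2);
    //     _Event _Chained = _Combined._Add_continuation(
    //         std::function<_Event __cdecl ()>([]() { return _Event(); }));
    //     _Chained._Get();    // blocks until both events and the continuation complete,
    //                         // rethrowing any exception that occurred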

    typedef _Buffer_descriptor *_View_key;

    _Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view);
    _Ret_ _Accelerator_impl* _Get_accelerator_impl_ptr(const accelerator& _Accl);
    _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);
    unsigned int _Get_mipmap_levels(const _Texture *_Tex);

    inline bool _Is_valid_access_mode(_Access_mode _Mode)
    {
        if ((_Mode != _Read_access) && 
            (_Mode != _Write_access) &&
            (_Mode != _Read_write_access))
        {
            return false;
        }

        return true;
    }

    // Caution: Do not change this structure definition.
    // This struct is special and is processed by the FE to identify the buffers
    // used in a parallel_for_each and to set up the _M_data_ptr with the appropriate
    // buffer ptr value in the device code.
    typedef struct _Buffer_descriptor
    {
        friend _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);

        // _M_data_ptr points to the raw data underlying the buffer for accessing on host
        mutable void *_M_data_ptr;

    private:
        // _M_buffer_ptr points to a _Ubiquitous_buffer that holds the data in a 1D array.
        // This is private to ensure that all assignments to this data member
        // only happen through public functions which properly manage the
        // ref count of the underlying buffer
        _Ubiquitous_buffer *_M_buffer_ptr;

    public:
        // _M_curr_cpu_access_mode specifies the current access mode of the data on the
        // cpu accelerator_view specified at the time of registration of this view
        _Access_mode _M_curr_cpu_access_mode;

        // _M_type_access_mode specifies the access mode of the overlay type.
        // array_views set it to the appropriate access mode, and for arrays it is
        // always _Is_array_mode.
        _Access_mode _M_type_access_mode;

    public:
        // Public functions

        // Default constructor
        _Buffer_descriptor() __GPU
            : _M_data_ptr(NULL), _M_buffer_ptr(NULL),
            _M_curr_cpu_access_mode(_No_access), _M_type_access_mode(_Is_array_mode)
        {
        }

        _Buffer_descriptor(_In_ void *_Data_ptr, _In_ _Ubiquitous_buffer *_Buffer_ptr,
                           _Access_mode _Curr_cpu_access_mode, _Access_mode _Type_mode) __GPU
            : _M_data_ptr(_Data_ptr), _M_buffer_ptr(NULL),
            _M_curr_cpu_access_mode(_Curr_cpu_access_mode), _M_type_access_mode(_Type_mode)
        {
            _Set_buffer_ptr(_Buffer_ptr);
        }

        // Destructor
        ~_Buffer_descriptor() __GPU
        {
            _Set_buffer_ptr(NULL);
        }

        // Copy constructor
        _Buffer_descriptor(const _Buffer_descriptor &_Other) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_buffer_ptr(NULL),
            _M_curr_cpu_access_mode(_Other._M_curr_cpu_access_mode), _M_type_access_mode(_Other._M_type_access_mode)
        {
            _Set_buffer_ptr(_Other._M_buffer_ptr);
        }

        // Assignment operator
        _Buffer_descriptor& operator=(const _Buffer_descriptor &_Other) __GPU
        {
            if (this != &_Other)
            {
                _M_data_ptr = _Other._M_data_ptr;
                _M_curr_cpu_access_mode = _Other._M_curr_cpu_access_mode;
                _M_type_access_mode = _Other._M_type_access_mode;
                _Set_buffer_ptr(_Other._M_buffer_ptr);
            }

            return *this;
        }

        _Ret_ _Ubiquitous_buffer* _Get_buffer_ptr() const __CPU_ONLY
        {
            return _M_buffer_ptr;
        }

        void _Set_buffer_ptr(_In_opt_ _Ubiquitous_buffer *_Buffer_ptr) __CPU_ONLY
        {
            if (_M_buffer_ptr != _Buffer_ptr)
            {
                if (_M_buffer_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_buffer_ptr)->_Remove_reference();
                }

                _M_buffer_ptr = _Buffer_ptr;

                if (_M_buffer_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_buffer_ptr)->_Add_reference();
                }
            }
        }

#if !defined(_CXXAMP)
        void _Set_buffer_ptr(_In_opt_ _Ubiquitous_buffer *_Buffer_ptr) __GPU_ONLY
        {
            // No need to set the buffer ptr on the GPU
            UNREFERENCED_PARAMETER(_Buffer_ptr);
            _M_buffer_ptr = NULL;
        }
#endif // _CXXAMP

        bool _Is_array() const
        {
            return (_M_type_access_mode == _Is_array_mode);
        }

        _Ret_ _View_key _Get_view_key()
        {
            return this;
        }

        const _View_key _Get_view_key() const
        {
            return ((const _View_key)(this));
        }

        _AMPIMP void _Get_CPU_access(_Access_mode _Requested_mode) const;

    } _Buffer_descriptor;

    // Caution: Do not change this structure definition.
    // This struct is special and is processed by the FE to identify the textures
    // used in a parallel_for_each and to set up the _M_data_ptr with the appropriate
    // texture ptr value in the device code.
    typedef struct _Texture_descriptor
    {
        // _M_data_ptr points to the raw data underlying the texture
        mutable IUnknown *_M_data_ptr;

    private:
        // _M_texture_ptr points to a _Texture that holds the data
        // This is private to ensure that all assignments to this data member
        // only happen through public functions which properly manage the
        // ref count of the underlying texture
        _Texture *_M_texture_ptr;

        // The index of the most detailed (largest in size) mipmap level for the texture (or texture view).
        // This value is always zero for a texture and may be non-zero for texture views.
        unsigned int _M_most_detailed_mipmap_level;

        // Number of accessible mipmap levels for the texture (or texture view).
        // For example, if the texture has 3 mipmap levels ([0, 1, 2]), then a read-only texture view
        // whose most detailed mipmap level is 1 can have 1 or 2 mipmap levels ([1] or [1, 2]).
        // Further texture_views created on top of such a texture view can only narrow down the range of accessible mipmap levels.
        unsigned int _M_view_mipmap_levels;

    public:
        // Public functions

        // Default constructor
        _Texture_descriptor() __GPU
            : _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(0), _M_view_mipmap_levels(0)
        {
            // Enables move constructor
        }

        // Constructor for the texture
        _Texture_descriptor(unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels) __GPU
            : _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(_Most_detailed_mipmap_level), _M_view_mipmap_levels(_View_mipmap_levels)
        {
        }

        // Constructor for the interop texture
        _Texture_descriptor(_In_ _Texture * _Texture_ptr) __CPU_ONLY : _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(0)
        {
            _Set_texture_ptr(_Texture_ptr);

            // Adopt number of mipmap levels from underlying texture object
            _M_view_mipmap_levels = _Get_mipmap_levels(_M_texture_ptr);
        }

        // Destructor
        ~_Texture_descriptor() __GPU
        {
            _Set_texture_ptr(NULL);
        }

        // Copy constructor
        _Texture_descriptor(const _Texture_descriptor &_Other) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_texture_ptr(NULL),
              _M_most_detailed_mipmap_level(_Other._M_most_detailed_mipmap_level), _M_view_mipmap_levels(_Other._M_view_mipmap_levels)
        {
            _Set_texture_ptr(_Other._M_texture_ptr);
        }

        // Copy constructor with ability to redefine mipmap information
        _Texture_descriptor(const _Texture_descriptor &_Other, unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_texture_ptr(NULL),
              _M_most_detailed_mipmap_level(_Most_detailed_mipmap_level), _M_view_mipmap_levels(_View_mipmap_levels)
        {
            _Set_texture_ptr(_Other._M_texture_ptr);
        }

        // Assignment operator
        _Texture_descriptor& operator=(const _Texture_descriptor &_Other) __GPU
        {
            if (this != &_Other)
            {
                _M_data_ptr = _Other._M_data_ptr;
                _Set_texture_ptr(_Other._M_texture_ptr);
                _M_most_detailed_mipmap_level = _Other._M_most_detailed_mipmap_level;
                _M_view_mipmap_levels = _Other._M_view_mipmap_levels;
            }

            return *this;
        }

        // Move constructor
        _Texture_descriptor(_Texture_descriptor &&_Other) __CPU_ONLY
        {
            *this = std::move(_Other);
        }

        bool operator==(const _Texture_descriptor &_Other) const __GPU
        {
            return _M_texture_ptr == _Other._M_texture_ptr 
                && _M_data_ptr == _Other._M_data_ptr
                && _M_most_detailed_mipmap_level == _Other._M_most_detailed_mipmap_level
                && _M_view_mipmap_levels == _Other._M_view_mipmap_levels;
        }

        _Ret_ _Texture* _Get_texture_ptr() const __CPU_ONLY
        {
            _ASSERTE(_M_texture_ptr);
            return _M_texture_ptr;
        }

        unsigned int _Get_most_detailed_mipmap_level() const __GPU
        {
            return _M_most_detailed_mipmap_level;
        }

        unsigned int _Get_view_mipmap_levels() const __GPU
        {
            return _M_view_mipmap_levels;
        }

        void _Set_view_mipmap_levels(unsigned int _View_mipmap_levels) __CPU_ONLY
        {
            _M_view_mipmap_levels = _View_mipmap_levels;
        }

        void _Set_texture_ptr(_In_opt_ _Texture *_Texture_ptr) __CPU_ONLY
        {
            if (_M_texture_ptr != _Texture_ptr)
            {
                if (_M_texture_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_texture_ptr)->_Remove_reference();
                }

                _M_texture_ptr = _Texture_ptr;

                if (_M_texture_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_texture_ptr)->_Add_reference();
                }
            }
        }

#if !defined(_CXXAMP)
        void _Set_texture_ptr(_In_opt_ _Texture *_Texture_ptr) __GPU_ONLY
        {
            // No need to set the texture ptr on the GPU
            UNREFERENCED_PARAMETER(_Texture_ptr);
            _M_texture_ptr = NULL;
        }
#endif // _CXXAMP

        // This helper function is used to determine aliasing and copy violations
        bool _Are_mipmap_levels_overlapping(const _Texture_descriptor *_Other) const __CPU_ONLY
        {
            _ASSERTE(_Other);

            if (this->_Get_texture_ptr() != _Other->_Get_texture_ptr())
            {
                return false;
            }

            return !((_M_most_detailed_mipmap_level < _Other->_M_most_detailed_mipmap_level) ? ((_M_most_detailed_mipmap_level + _M_view_mipmap_levels - 1) < _Other->_M_most_detailed_mipmap_level)
                                                                                             : ((_Other->_M_most_detailed_mipmap_level + _Other->_M_view_mipmap_levels - 1) < _M_most_detailed_mipmap_level));
        }
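
        // Worked example (illustrative): for two views of the same texture with 3 mipmap
        // levels [0, 1, 2], a view covering level [0] and a view covering levels [1, 2]
        // do not overlap (the expression above yields false), while views covering
        // [0, 1] and [1, 2] share level 1 and therefore overlap (it yields true).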

    } _Texture_descriptor;

    // Caution: Do not change this structure definition.
    // This struct is special and is processed by the FE to identify the samplers
    // used in a parallel_for_each.
    typedef struct _Sampler_descriptor
    {
        // _M_data_ptr points to the sampler on accelerator
        mutable void *_M_data_ptr;

    private:
        // _M_sampler_ptr points to a _Sampler that holds the underlying sampler
        // representation. This is private to ensure that all assignments to this data member
        // only happen through public functions which properly manage the
        // ref count of the underlying _Sampler object.
        _Sampler *_M_sampler_ptr;

    public:
        // Public functions

        // Default constructor
        _Sampler_descriptor() __GPU
            : _M_data_ptr(NULL), _M_sampler_ptr(NULL)
        {
        }

        _Sampler_descriptor(_In_ _Sampler * _Sampler_ptr) __GPU
            : _M_data_ptr(NULL), _M_sampler_ptr(NULL)
        {
            _Set_sampler_ptr(_Sampler_ptr);
        }

        // Destructor
        ~_Sampler_descriptor() __GPU
        {
            _Set_sampler_ptr(NULL);
        }

        // Copy constructor
        _Sampler_descriptor(const _Sampler_descriptor &_Other) __GPU
            : _M_data_ptr(_Other._M_data_ptr), _M_sampler_ptr(NULL)
        {
            _Set_sampler_ptr(_Other._M_sampler_ptr);
        }

        // Assignment operator
        _Sampler_descriptor& operator=(const _Sampler_descriptor &_Other) __GPU
        {
            if (this != &_Other)
            {
                _M_data_ptr = _Other._M_data_ptr;
                _Set_sampler_ptr(_Other._M_sampler_ptr);
            }

            return *this;
        }

        // Move constructor
        _Sampler_descriptor(_Sampler_descriptor &&_Other) __CPU_ONLY
        {
            *this = std::move(_Other);
        }

        bool operator==(const _Sampler_descriptor &_Other) const __GPU
        {
            return _M_sampler_ptr == _Other._M_sampler_ptr && _M_data_ptr == _Other._M_data_ptr;
        }

        _Ret_ _Sampler* _Get_sampler_ptr() const __CPU_ONLY
        {
            return _M_sampler_ptr;
        }

        void _Set_sampler_ptr(_In_opt_ _Sampler *_Sampler_ptr) __CPU_ONLY
        {
            if (_M_sampler_ptr != _Sampler_ptr)
            {
                if (_M_sampler_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_sampler_ptr)->_Remove_reference();
                }

                _M_sampler_ptr = _Sampler_ptr;

                if (_M_sampler_ptr != NULL) {
                    reinterpret_cast<_Reference_counter*>(_M_sampler_ptr)->_Add_reference();
                }
            }
        }

#if !defined(_CXXAMP)
        void _Set_sampler_ptr(_In_opt_ _Sampler *_Sampler_ptr) __GPU_ONLY
        {
            // No need to set the sampler ptr on the GPU
            UNREFERENCED_PARAMETER(_Sampler_ptr);
            _M_sampler_ptr = NULL;
        }
#endif // _CXXAMP

    } _Sampler_descriptor;

} // namespace Concurrency::details

// Forward declaration
class accelerator;

namespace details
{
    _AMPIMP size_t __cdecl _Get_num_devices();
    _AMPIMP _Ret_ _Accelerator_impl_ptr * __cdecl _Get_devices();
    _AMPIMP accelerator __cdecl _Select_default_accelerator();
    _AMPIMP bool __cdecl _Set_default_accelerator(_Accelerator_impl_ptr _Accl);
    _AMPIMP bool __cdecl _Is_D3D_accelerator_view(const accelerator_view& _Av);
    _AMPIMP void __cdecl _Register_async_event(const _Event &_Ev, const std::shared_future<void> &_Shared_future);
    _AMPIMP _Access_mode __cdecl _Get_recommended_buffer_host_access_mode(const accelerator_view &_Av);
}

/// <summary>
///    Queuing modes supported for accelerator views
/// </summary>
enum queuing_mode {
    queuing_mode_immediate,
    queuing_mode_automatic
}; 

/// <summary>
///     Exception thrown when the C++ AMP runtime encounters an error.
///     This is the base type for all C++ AMP exception types.
/// </summary>
class runtime_exception : public std::exception
{
public:
    /// <summary>
    ///     Construct a runtime_exception exception with a message and an error code
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    /// <param name="_Hresult">
    ///     HRESULT of error that caused this exception
    /// </param>
    _AMPIMP runtime_exception(const char * _Message, HRESULT _Hresult) throw();

    /// <summary>
    ///     Construct a runtime_exception exception with an error code
    /// </summary>
    /// <param name="_Hresult">
    ///     HRESULT of error that caused this exception
    /// </param>
    _AMPIMP explicit runtime_exception(HRESULT _Hresult) throw();

    /// <summary>
    ///     Copy construct a runtime_exception exception
    /// </summary>
    /// <param name="_Other">
    ///     The runtime_exception object to be copied from
    /// </param>
    _AMPIMP runtime_exception(const runtime_exception &_Other) throw();

    /// <summary>
    ///     Assignment operator
    /// </summary>
    /// <param name="_Other">
    ///     The runtime_exception object to be assigned from
    /// </param>
    _AMPIMP runtime_exception &operator=(const runtime_exception &_Other) throw();

    /// <summary>
    ///     Destruct a runtime_exception exception object instance
    /// </summary>
    _AMPIMP virtual ~runtime_exception() throw();

    /// <summary>
    ///     Get the error code that caused this exception
    /// </summary>
    /// <returns>
    ///     HRESULT of error that caused the exception
    /// </returns>
    _AMPIMP HRESULT get_error_code() const throw();

private:
    HRESULT _M_error_code;
}; // class runtime_exception

/// <summary>
///     Exception thrown when an underlying OS/DirectX call fails
///     due to lack of system or device memory 
/// </summary>
class out_of_memory : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an out_of_memory exception with a message
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    _AMPIMP explicit out_of_memory(const char * _Message) throw();

    /// <summary>
    ///     Construct an out_of_memory exception
    /// </summary>
    _AMPIMP out_of_memory () throw();
}; // class out_of_memory
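
// Illustrative sketch (not part of this header): C++ AMP runtime failures surface as
// runtime_exception (or a derived type such as out_of_memory) and carry an HRESULT.
//
//     try {
//         // ... some C++ AMP operation ...
//     }
//     catch (const Concurrency::out_of_memory &) {
//         // insufficient system or device memory
//     }
//     catch (const Concurrency::runtime_exception &_Ex) {
//         HRESULT _Hr = _Ex.get_error_code();   // HRESULT describing the failure
//         // ...
//     }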

namespace direct3d
{
    /// <summary>
    ///     Get the D3D device interface underlying an accelerator_view.
    /// </summary>
    /// <param name="_Av">
    ///     The D3D accelerator_view for which the underlying D3D device interface is returned.
    /// </param>
    /// <returns>
    ///     The IUnknown interface pointer of the D3D device underlying the accelerator_view.
    /// </returns>
    _AMPIMP _Ret_ IUnknown * __cdecl get_device(const accelerator_view &_Av);

    /// <summary>
    ///     Create an accelerator_view from a D3D device interface pointer.
    /// </summary>
    /// <param name="_D3D_device">
    ///     The D3D device interface pointer to create the accelerator_view from.
    /// </param>
    /// <param name="_Qmode">
    ///     The queuing_mode to be used for the newly created accelerator_view.
    ///     This parameter has a default value of queuing_mode_automatic.
    /// </param>
    /// <returns>
    ///     The accelerator_view created from the passed D3D device interface.
    /// </returns>
    _AMPIMP accelerator_view __cdecl create_accelerator_view(_In_ IUnknown *_D3D_device, queuing_mode _Qmode = queuing_mode_automatic);

    /// <summary>
    ///     Create and return a new accelerator view on the specified accelerator.
    /// </summary>
    /// <param name="_Accelerator">
    ///     The accelerator on which the new accelerator_view is to be created.
    /// </param>
    /// <param name="_Disable_timeout">
    ///     A boolean parameter that specifies whether timeout should be disabled
    ///     for the newly created accelerator_view. This corresponds to the
    ///     D3D11_CREATE_DEVICE_DISABLE_GPU_TIMEOUT flag for Direct3D device creation
    ///     and is used to indicate if the operating system should allow workloads
    ///     that take more than 2 seconds to execute, without resetting the device 
    ///     per the Windows timeout detection and recovery mechanism. Use of this flag
    ///     is recommended if you need to perform time consuming tasks on the accelerator_view.
    /// </param>
    /// <param name="_Qmode">
    ///     The queuing_mode to be used for the newly created accelerator_view.
    ///     This parameter has a default value of queuing_mode_automatic.
    /// </param>
    /// <returns>
    ///     The newly created accelerator_view.
    /// </returns>
    _AMPIMP accelerator_view __cdecl create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode = queuing_mode_automatic);

    /// <summary>
    ///     Returns a boolean flag indicating if timeout is disabled
    ///     for the specified accelerator_view. This corresponds to the
    ///     D3D11_CREATE_DEVICE_DISABLE_GPU_TIMEOUT flag for Direct3D device creation.
    /// </summary>
    /// <param name="_Accelerator_view">
    ///     The accelerator_view for which the timeout disabled setting is to be queried.
    /// </param>
    /// <returns>
    ///     A boolean flag indicating if timeout is disabled for the specified accelerator_view.
    /// </returns>
    _AMPIMP bool __cdecl is_timeout_disabled(const accelerator_view& _Accelerator_view);
    
    /// <summary>
    ///     Acquire a lock on an accelerator_view for the purpose of safely performing D3D operations on resources shared 
    ///     with the accelerator_view.  The accelerator_view and all C++ AMP resources associated with this accelerator_view 
    ///     internally take this lock when performing operations and will block while another thread holds the D3D access lock.  
    ///
    ///     This lock is non-recursive: It is undefined behavior to call this function from a thread that already holds the lock.  
    ///     It is undefined behavior to perform operations on the accelerator_view or any data container associated with the 
    ///     accelerator_view from the thread that holds the D3D access lock.
    ///      
    ///     See also scoped_d3d_access_lock, a RAII-style class for a scope-based D3D access lock.
    /// </summary>
    /// <param name="_Av">
    ///     The accelerator_view to lock.
    /// </param>
    _AMPIMP void __cdecl d3d_access_lock(accelerator_view &_Av);

    /// <summary>
    ///     Attempt to acquire the D3D access lock on an accelerator_view without blocking.
    /// </summary>
    /// <param name="_Av">
    ///     The accelerator_view to lock.
    /// </param>
    /// <returns>
    ///     true if the lock was acquired, or false if it is currently held by another thread.
    /// </returns>
    _AMPIMP bool __cdecl d3d_access_try_lock(accelerator_view &_Av);

    /// <summary>
    ///     Release the D3D access lock on the given accelerator_view.  If the calling thread does 
    ///     not hold the lock on the accelerator_view the results are undefined.
    /// </summary>
    /// <param name="_Av">
    ///     The accelerator_view for which the lock is to be released.
    /// </param>
    _AMPIMP void __cdecl d3d_access_unlock(accelerator_view &_Av);

    /// <summary>
    ///     Tag type to indicate the D3D access lock should be adopted rather than
    ///     acquired.
    /// </summary>
    struct adopt_d3d_access_lock_t {};

    /// <summary>
    ///  RAII wrapper for a D3D access lock on an accelerator_view.
    /// </summary>
    class scoped_d3d_access_lock
    {
    public:
        /// <summary> 
        ///     Acquire a D3D access lock on the given accelerator_view.  The lock is released 
        ///     when this object goes out of scope.  Construction will block until the lock
        ///     is acquired.
        /// </summary>
        /// <param name="_Av">
        ///     The accelerator_view to lock.
        /// </param>
        _AMPIMP explicit scoped_d3d_access_lock(accelerator_view &_Av);

        /// <summary>
        ///     Construct a scoped_d3d_access_lock on an accelerator_view for which the lock
        ///     is already held (e.g. was acquired by d3d_access_try_lock).  The D3D access
        ///     lock must already be held by the calling thread and not controlled by any other
        ///     scoped_d3d_access_lock.
        /// </summary>
        /// <param name="_Av">
        ///     The accelerator_view for the lock to adopt.
        /// </param>
        /// <param name="_T">
        ///     The adopt_d3d_access_lock object.
        /// </param>
        _AMPIMP explicit scoped_d3d_access_lock(accelerator_view &_Av, adopt_d3d_access_lock_t _T);

        /// <summary>
        ///     Destructor for scoped_d3d_access_lock: unlock the accelerator_view.
        /// </summary>
        _AMPIMP ~scoped_d3d_access_lock();

        /// <summary>
        ///     Move constructor for scoped_d3d_access_lock:  Take ownership of
        ///     a lock from another scoped_d3d_access_lock.
        /// </summary>
        /// <param name="_Other">
        ///     The scoped_d3d_access_lock from which to move.
        /// </param>
        _AMPIMP scoped_d3d_access_lock(scoped_d3d_access_lock &&_Other);

        /// <summary>
        ///     Move assignment operator for scoped_d3d_access_lock:  Take ownership 
        ///     of a lock from another scoped_d3d_access_lock, releasing the previous
        ///     lock.
        /// </summary>
        /// <param name="_Other">
        ///     The scoped_d3d_access_lock from which to move.
        /// </param>
        /// <returns>
        ///     A reference to this scoped_d3d_access_lock.
        /// </returns>
        _AMPIMP scoped_d3d_access_lock& operator=(scoped_d3d_access_lock &&_Other);

    private:
        // No copy constructor
        scoped_d3d_access_lock(const scoped_d3d_access_lock &_Other);

        // No assignment operator
        scoped_d3d_access_lock & operator=(const scoped_d3d_access_lock &_Other);

        _Accelerator_view_impl_ptr _M_impl;
    };
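
    // Illustrative sketch (not part of this header): the D3D access lock can be held for
    // the duration of a scope, or adopted after a successful try-lock (_Av is a
    // hypothetical accelerator_view).
    //
    //     {
    //         scoped_d3d_access_lock _Lock(_Av);   // blocks until the lock is acquired
    //         // ... perform D3D operations on resources shared with _Av ...
    //     }                                        // lock released here
    //
    //     if (d3d_access_try_lock(_Av))
    //     {
    //         scoped_d3d_access_lock _Adopted(_Av, adopt_d3d_access_lock_t());
    //         // ... released when _Adopted goes out of scope ...
    //     }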
} // namespace direct3d

/// <summary>
///  Class represents an accelerator abstraction for C++ AMP data-parallel devices 
/// </summary>
class accelerator
{
    friend class accelerator_view;

    friend class details::_Ubiquitous_buffer;

    friend _AMPIMP accelerator details::_Select_default_accelerator();

    _AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode /* = queuing_mode_automatic */);

    friend _Ret_ details::_Accelerator_impl* details::_Get_accelerator_impl_ptr(const accelerator& _Accl);

public:

    /// <summary>
    ///     String constant for default accelerator
    /// </summary>
    _AMPIMP static const wchar_t default_accelerator[];

    /// <summary>
    ///     String constant for cpu accelerator
    /// </summary>
    _AMPIMP static const wchar_t cpu_accelerator[];

    /// <summary>
    ///     String constant for direct3d WARP accelerator
    /// </summary>
    _AMPIMP static const wchar_t direct3d_warp[];

    /// <summary>
    ///     String constant for direct3d reference accelerator
    /// </summary>
    _AMPIMP static const wchar_t direct3d_ref[];

    /// <summary>
    ///     Construct an accelerator representing the default accelerator
    /// </summary>
    _AMPIMP accelerator();

    /// <summary>
    ///     Construct an accelerator representing the accelerator with the 
    ///     specified device instance path
    /// </summary>
    explicit accelerator(const std::wstring &_Device_path) : _M_impl(NULL)
    {
        _Init(_Device_path.c_str());
    }

    /// <summary>
    ///     Destructor
    /// </summary>
    _AMPIMP ~accelerator();

    /// <summary>
    ///     Copy constructor
    /// </summary>
    _AMPIMP accelerator(const accelerator &_Other);

    /// <summary>
    ///     Assignment operator
    /// </summary>
    _AMPIMP accelerator &operator=(const accelerator &_Other);

    /// <summary>
    ///     Returns the vector of accelerator objects representing all available accelerators
    /// </summary>
    /// <returns>
    ///     The vector of available accelerators
    /// </returns> 
    static inline std::vector<accelerator> get_all()
    {
        std::vector<accelerator> _AcceleratorVector;
        size_t _NumDevices = details::_Get_num_devices();
        for (size_t _I = 0; (_I < _NumDevices); ++_I)
        {
            _AcceleratorVector.push_back(details::_Get_devices()[_I]);
        }

        return _AcceleratorVector;
    }

    /// <summary>
    ///     Sets the default accelerator to be used for any operation
    ///     that implicitly uses the default accelerator. This method
    ///     only succeeds if the runtime-selected default accelerator
    ///     has not already been used in an operation that implicitly
    ///     uses the default accelerator.
    /// </summary>
    /// <returns>
    ///     A boolean value indicating if the call succeeds in setting 
    ///     the default accelerator
    /// </returns> 
    static inline bool set_default(const std::wstring& _Path)
    {
        accelerator _Accl(_Path);
        return details::_Set_default_accelerator(_Accl._M_impl);
    }

    /// <summary>
    ///     Returns the auto-selection accelerator_view which, when specified as the
    ///     target of a parallel_for_each, causes the runtime to automatically select
    ///     the accelerator_view on which to execute the kernel. For all other purposes,
    ///     the accelerator_view returned by this method is the same as the default
    ///     accelerator_view of the default accelerator.
    /// </summary>
    _AMPIMP static accelerator_view __cdecl get_auto_selection_view();

    /// <summary>
    ///     Returns the system-wide unique device instance path as a std::wstring
    /// </summary>
    std::wstring get_device_path() const 
    {
        return _Get_device_path();
    }

    __declspec(property(get=get_device_path)) std::wstring device_path;

    /// <summary>
    ///     Get the version for this accelerator
    /// </summary>
    _AMPIMP unsigned int get_version() const;
    __declspec(property(get=get_version)) unsigned int version; // hiword=major, loword=minor

    /// <summary>
    ///     Returns the device description as a std::wstring
    /// </summary>
    std::wstring get_description() const
    {
        return _Get_description();
    }

    __declspec(property(get=get_description)) std::wstring description;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     was created with DEBUG layer enabled for extensive error reporting
    /// </summary>
    _AMPIMP bool get_is_debug() const;
    __declspec(property(get=get_is_debug)) bool is_debug;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator is emulated. 
    ///     This is true, for example, with the direct3d reference and WARP accelerators.
    /// </summary>
    _AMPIMP bool get_is_emulated() const;
    __declspec(property(get=get_is_emulated)) bool is_emulated;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     is attached to a display
    /// </summary>
    _AMPIMP bool get_has_display() const;
    __declspec(property(get=get_has_display)) bool has_display;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     supports full double precision (including double division,
    ///     precise_math functions, int to double, double to int conversions)
    ///     in a parallel_for_each kernel.
    /// </summary>
    _AMPIMP bool get_supports_double_precision() const;
    __declspec(property(get=get_supports_double_precision)) bool supports_double_precision;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     has limited double precision support (excludes double division,
    ///     precise_math functions, int to double, double to int conversions)
    ///     for a parallel_for_each kernel.
    /// </summary>
    _AMPIMP bool get_supports_limited_double_precision() const;
    __declspec(property(get=get_supports_limited_double_precision)) bool supports_limited_double_precision;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator
    ///     supports memory accessible both by the accelerator and the CPU.
    /// </summary>
    _AMPIMP bool get_supports_cpu_shared_memory() const;
    __declspec(property(get=get_supports_cpu_shared_memory)) bool supports_cpu_shared_memory;

    /// <summary>
    ///     Return the default accelerator view associated with this accelerator
    /// </summary>
    _AMPIMP accelerator_view get_default_view() const;
    __declspec(property(get=get_default_view)) accelerator_view default_view;

    /// <summary>
    ///     Get the dedicated memory for this accelerator in KB
    /// </summary>
    _AMPIMP size_t get_dedicated_memory() const;
    __declspec(property(get=get_dedicated_memory)) size_t dedicated_memory;

    /// <summary>
    ///     Get the default cpu access_type for buffers created on this accelerator
    /// </summary>
    _AMPIMP access_type get_default_cpu_access_type() const;
    __declspec(property(get=get_default_cpu_access_type)) access_type default_cpu_access_type;

    /// <summary>
    ///     Set the default cpu access_type for arrays created on this accelerator
    ///     or for implicit memory allocations as part of array_views accessed
    ///     on this accelerator. This method only succeeds if the default_cpu_access_type
    ///     for the accelerator has not already been overridden by a previous call to this method
    ///     and the runtime-selected default_cpu_access_type for this accelerator has not yet
    ///     been used for allocating an array or for an implicit memory allocation backing an
    ///     array_view accessed on this accelerator.
    /// </summary>
    /// <param name="_Default_cpu_access_type">
    ///     The default cpu access_type to be used for array/array_view memory allocations
    ///     on this accelerator.
    /// </param>
    /// <returns>
    ///     A boolean value indicating if the default cpu access_type for the accelerator
    ///     was successfully set.
    /// </returns>
    _AMPIMP bool set_default_cpu_access_type(access_type _Default_cpu_access_type);

    /// <summary>
    ///     Create and return a new accelerator view on this accelerator
    ///     with the specified queuing mode. When unspecified the accelerator_view 
    ///     is created with queuing_mode_automatic queuing mode.
    /// </summary>
    _AMPIMP accelerator_view create_view(queuing_mode qmode = queuing_mode_automatic);

    /// <summary>
    ///     Return true if the other accelerator is same as this accelerator; false otherwise
    /// </summary>
    _AMPIMP bool operator==(const accelerator &_Other) const;

    /// <summary>
    ///     Return false if the other accelerator is same as this accelerator; true otherwise
    /// </summary>
    _AMPIMP bool operator!=(const accelerator &_Other) const;

private:

    // Private constructor
    _AMPIMP accelerator(_Accelerator_impl_ptr _Impl);

    // Private helper methods
    _AMPIMP const wchar_t *_Get_device_path() const;
    _AMPIMP const wchar_t *_Get_description() const;

    _AMPIMP void _Init(const wchar_t *_Path);

private:

    _Accelerator_impl_ptr _M_impl;
};
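
// Illustrative sketch (not part of this header): enumerating the available accelerators
// and selecting a default by device path.
//
//     std::vector<accelerator> _Accls = accelerator::get_all();
//     for (const accelerator &_A : _Accls)
//     {
//         std::wstring _Path = _A.device_path;
//         bool _Emulated = _A.is_emulated;
//         // ...
//     }
//
//     // Succeeds only if the runtime-selected default accelerator has not been used yet.
//     bool _Did_set = accelerator::set_default(accelerator::direct3d_warp);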

/// <summary>
///  Class represents a future corresponding to a C++ AMP asynchronous operation
/// </summary>
class completion_future
{
    friend class details::_Amp_runtime_trace;
public:

    /// <summary>
    ///     Default constructor
    /// </summary>
    completion_future()
    {
    }

    /// <summary>
    ///     Copy constructor
    /// </summary>
    completion_future(const completion_future& _Other)
        : _M_shared_future(_Other._M_shared_future),
        _M_task(_Other._M_task)
    {
    }

    /// <summary>
    ///     Move constructor
    /// </summary>
    completion_future(completion_future&& _Other)
        : _M_shared_future(std::move(_Other._M_shared_future)),
        _M_task(std::move(_Other._M_task))
    {
    }

    /// <summary>
    ///     Destructor
    /// </summary>
    ~completion_future()
    {
    }

    /// <summary>
    ///     Copy assignment operator
    /// </summary>
    completion_future& operator=(const completion_future& _Other)
    {
        if (this != &_Other) {
            _M_shared_future = _Other._M_shared_future;
            _M_task = _Other._M_task;
        }

        return (*this);
    }

    /// <summary>
    ///     Move assignment operator
    /// </summary>
    completion_future& operator=(completion_future&& _Other)
    {
        if (this != &_Other) {
            _M_shared_future = std::move(_Other._M_shared_future);
            _M_task = std::move(_Other._M_task);
        }

        return (*this);
    }
    
    /// <summary>
    ///     Waits until the associated asynchronous operation completes.
    ///     Throws the stored exception if one was encountered during the
    ///     asynchronous operation.
    /// </summary>
    void get() const
    {
        _M_shared_future.get();
    }
    
    /// <summary>
    ///     Returns true if the object is associated with an asynchronous
    ///     operation
    /// </summary>
    /// <returns>
    ///     true if the object is associated with an asynchronous operation
    ///     and false otherwise
    /// </returns>
    bool valid() const
    {
        return _M_shared_future.valid();
    }
    
    /// <summary>
    ///     Blocks until the associated asynchronous operation completes
    /// </summary>
    void wait() const
    {
        _M_shared_future.wait();
    }

    /// <summary>
    ///     Blocks until the associated asynchronous operation completes or
    ///     _Rel_time has elapsed
    /// </summary>
    /// <returns>
    ///     - future_status::deferred if the associated asynchronous operation is not running
    ///     - future_status::ready if the associated asynchronous operation is finished
    ///     - future_status::timeout if the time period specified has elapsed
    /// </returns>
    template <class _Rep, class _Period>
    std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const
    {
        return _M_shared_future.wait_for(_Rel_time);
    }

    /// <summary>
    ///     Blocks until the associated asynchronous operation completes or
    ///     until the current time exceeds _Abs_time
    /// </summary>
    /// <returns>
    ///     - future_status::deferred if the associated asynchronous operation is not running
    ///     - future_status::ready if the associated asynchronous operation is finished
    ///     - future_status::timeout if the time point specified has been reached
    /// </returns>
    template <class _Clock, class _Duration>
    std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const
    {
        return _M_shared_future.wait_until(_Abs_time);
    }

    /// <summary>
    ///     Returns a std::shared_future&lt;void&gt; object corresponding to the 
    ///     associated asynchronous operation
    /// </summary>
    /// <returns>
    ///     A std::shared_future&lt;void&gt; object corresponding to the associated
    ///     asynchronous operation
    /// </returns>
    operator std::shared_future<void>() const
    {
        return _M_shared_future;
    }
    
    /// <summary>
    ///     Chains a callback Functor to the completion_future to be executed
    ///     when the associated asynchronous operation finishes execution
    /// </summary>
    template <typename _Functor>
    void then(const _Functor &_Func) const
    {
        this->to_task().then(_Func);
    }

    /// <summary>
    ///     Returns a concurrency::task&lt;void&gt; object corresponding to the 
    ///     associated asynchronous operation
    /// </summary>
    /// <returns>
    ///     A concurrency::task&lt;void&gt; object corresponding to the associated
    ///     asynchronous operation
    /// </returns>
    concurrency::task<void> to_task() const
    {
        return _M_task;
    }

private:

    // Private constructor
    completion_future(const std::shared_future<void> &_Shared_future,
                      const concurrency::task<void>& _Task)
                      : _M_shared_future(_Shared_future), _M_task(_Task)
    {
    }

    std::shared_future<void> _M_shared_future;
    concurrency::task<void> _M_task;
}; 
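
// Illustrative sketch (not part of this header): a completion_future returned from an
// asynchronous C++ AMP operation (for example, copy_async) can be waited on, polled,
// or chained to a continuation.
//
//     completion_future _Cf = /* some asynchronous operation */;
//     if (_Cf.wait_for(std::chrono::milliseconds(100)) != std::future_status::ready)
//     {
//         _Cf.then([]() { /* runs when the operation finishes */ });
//         _Cf.wait();
//     }
//     _Cf.get();   // rethrows any exception stored by the asynchronous operation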

/// <summary>
///  Class represents a virtual device abstraction on a C++ AMP data-parallel accelerator
/// </summary>
class accelerator_view
{
    friend class accelerator;
    friend class details::_Buffer;
    friend class details::_Texture;
    friend class details::_Sampler;
    friend class details::_Ubiquitous_buffer;
    friend class details::_D3D_interop;
    friend class details::_D3D_accelerator_view_impl;
    friend class details::_CPU_accelerator_view_impl;
    friend class details::_Accelerator_view_hasher;

    _AMPIMP friend _Ret_ IUnknown * __cdecl direct3d::get_device(const accelerator_view &_Av);
    
    _AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(_In_ IUnknown *_D3D_device, queuing_mode qmode /* = queuing_mode_automatic */);

    _AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode /* = queuing_mode_automatic */);

    _AMPIMP friend bool __cdecl direct3d::is_timeout_disabled(const accelerator_view& _Accelerator_view);

    friend _Ret_ details::_Accelerator_view_impl* details::_Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view);

public:

    /// <summary>
    ///     Destructor
    /// </summary>
    _AMPIMP ~accelerator_view();

    /// <summary>
    ///     Copy constructor
    /// </summary>
    _AMPIMP accelerator_view(const accelerator_view &_Other);

    /// <summary>
    ///     Assignment operator
    /// </summary>
    _AMPIMP accelerator_view &operator=(const accelerator_view &_Other);

    /// <summary>
    ///     Get the accelerator for this accelerator view
    /// </summary>
    _AMPIMP accelerator get_accelerator() const;
    __declspec(property(get=get_accelerator)) Concurrency::accelerator accelerator;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator view
    ///     was created with DEBUG layer enabled for extensive error reporting
    /// </summary>
    _AMPIMP bool get_is_debug() const;
    __declspec(property(get=get_is_debug)) bool is_debug;

    /// <summary>
    ///     Get the version for this accelerator view
    /// </summary>
    _AMPIMP unsigned int get_version() const;
    __declspec(property(get=get_version)) unsigned int version; // hiword=major, loword=minor

    /// <summary>
    ///     Get the queuing mode for this accelerator view
    /// </summary>
    _AMPIMP queuing_mode get_queuing_mode() const;
    __declspec(property(get=get_queuing_mode)) Concurrency::queuing_mode queuing_mode;

    /// <summary>
    ///     Returns a boolean value indicating whether the accelerator view
    ///     when passed to a parallel_for_each would result in automatic
    ///     selection of an appropriate execution target by the runtime
    /// </summary>
    _AMPIMP bool get_is_auto_selection() const;
    __declspec(property(get=get_is_auto_selection)) bool is_auto_selection;

    /// <summary>
    ///     Return true if the other accelerator view is same as this accelerator view; false otherwise
    /// </summary>
    _AMPIMP bool operator==(const accelerator_view &_Other) const;

    /// <summary>
    ///     Return false if the other accelerator view is same as this accelerator view; true otherwise
    /// </summary>
    _AMPIMP bool operator!=(const accelerator_view &_Other) const;

    /// <summary>
    ///     Waits for completion of all commands submitted so far to this accelerator_view
    /// </summary>
    _AMPIMP void wait();

    /// <summary>
    ///     Submit all pending commands queued to this accelerator_view to the accelerator
    ///     for execution.
    /// </summary>
    _AMPIMP void flush();

    /// <summary>
    ///     Return a future to track the completion of all commands submitted so far to this accelerator_view
    /// </summary>
    _AMPIMP concurrency::completion_future create_marker();

private:

    // No default constructor
    accelerator_view();

    // Private constructor
    _AMPIMP accelerator_view(_Accelerator_view_impl_ptr _Impl, bool _Auto_selection = false);

private:

    _Accelerator_view_impl_ptr _M_impl;
    bool _M_auto_selection;
};
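
// Illustrative sketch (not part of this header): flushing and synchronizing with the
// commands queued to an accelerator_view (the work submitted to _Av is hypothetical).
//
//     accelerator_view _Av = accelerator().create_view(queuing_mode_automatic);
//     // ... submit work to _Av, e.g. via parallel_for_each(_Av, ...) ...
//     _Av.flush();                               // submit pending commands without blocking
//     completion_future _Marker = _Av.create_marker();
//     _Marker.wait();                            // or _Av.wait() to block for all commands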

namespace details
{
    inline _Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view)
    {
        return _Accl_view._M_impl;
    }

    inline _Ret_ _Accelerator_impl* _Get_accelerator_impl_ptr(const accelerator& _Accl)
    {
         return _Accl._M_impl;
    }

    // Type defining a hasher for accelerator_view objects
    // for use with std::unordered_set and std::unordered_map
    class _Accelerator_view_hasher
    {
    public:
        size_t operator()(const accelerator_view &_Accl_view) const
        {
            std::hash<_Accelerator_view_impl*> _HashFunctor;
            return _HashFunctor(_Accl_view._M_impl._Get_ptr());
        }
    };

    typedef std::unordered_set<accelerator_view, _Accelerator_view_hasher> _Accelerator_view_unordered_set;

    // Describes the N dimensional shape of a view in a buffer
    class _View_shape : public _Reference_counter
    {
    public:

        _AMPIMP static _Ret_ _View_shape* __cdecl _Create_view_shape(unsigned int _Rank, unsigned int _Linear_offset,
                                                                     const unsigned int *_Base_extent, const unsigned int *_View_offset,
                                                                     const unsigned int *_View_extent, const bool *_Projection_info = NULL);

        _AMPIMP _Ret_ _View_shape* _Get_reduced_shape_for_copy();

        inline unsigned int _Get_rank() const
        {
            return _M_rank;
        }

        inline unsigned int _Get_linear_offset() const
        {
            return _M_linear_offset;
        }

        inline const unsigned int *_Get_base_extent() const
        {
            return _M_base_extent;
        }

        inline const unsigned int *_Get_view_offset() const
        {
            return _M_view_offset;
        }
        inline const unsigned int *_Get_view_extent() const
        {
            return _M_view_extent;
        }

        inline const bool *_Get_projection_info() const
        {
            return _M_projection_info;
        }
        
        inline bool _Is_projection() const
        {
            return _M_projection_info[0];
        }

        inline bool _Is_valid(size_t _Buffer_size) const
        {
            // The end point of the base shape should not be greater than the size of the buffer
            size_t endLinearOffset = _M_linear_offset + _Get_extent_size(_M_rank, _M_base_extent);
            if (endLinearOffset > _Buffer_size) {
                return false;
            }

            return _Is_valid();
        }

        inline unsigned int _Get_view_size() const
        {
            return _Get_extent_size(_M_rank, _M_view_extent);
        }

        inline unsigned int _Get_view_linear_offset() const
        {
            return _Get_linear_offset(_M_view_offset);
        }

        static inline bool
        _Compare_extent_with_elem_size(unsigned int _Rank, const unsigned int *_Extent1, size_t _Elem_size1, const unsigned int *_Extent2, size_t _Elem_size2)
        {
            _ASSERTE((_Rank >= 1) && (_Extent1 != NULL) && (_Extent2 != NULL));

            // The extents should match accounting for the element sizes of the respective buffers
            if ((_Extent1[_Rank - 1] * _Elem_size1) != (_Extent2[_Rank - 1] * _Elem_size2)) 
            {
                return false;
            }

            // Now compare the extent in all but the least significant dimension
            if ((_Rank > 1) && !_Compare_extent(_Rank - 1, _Extent1, _Extent2))
            {
                return false;
            }

            return true;
        }


        static inline bool
        _Compare_extent(unsigned int _Rank, const unsigned int *_Extent1, const unsigned int *_Extent2)
        {
            for (size_t _I = 0; _I < _Rank; ++_I) {
                if (_Extent1[_I] != _Extent2[_I]) {
                    return false;
                }
            }

            return true;
        }

        inline bool _Is_view_linear(unsigned int &_Linear_offset, unsigned int &_Linear_size) const
        {
            // The effective rank for the purpose of determining linearity
            // depends on the highest dimension in which the extent is not 1
            unsigned int _First_dim_with_non_unit_extent = 0;
            while ((_First_dim_with_non_unit_extent < _M_rank) && (_M_view_extent[_First_dim_with_non_unit_extent] == 1)) {
                _First_dim_with_non_unit_extent++;
            }

            unsigned int _Effective_rank = (_M_rank - _First_dim_with_non_unit_extent);

            // It is linear if the effective rank is <= 1 or the base extent
            // and view extent are same in all but the highest dimension with
            // non-unit extent
            if ((_Effective_rank <= 1) ||
                (_Compare_extent(_Effective_rank - 1, &_M_base_extent[_First_dim_with_non_unit_extent + 1], &_M_view_extent[_First_dim_with_non_unit_extent + 1]))) 
            {
                _Linear_offset = _Get_view_linear_offset();
                _Linear_size = _Get_view_size();
                return true;
            }

            return false;
        }
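        // Example for _Is_view_linear (illustrative): for a 2D base extent of {4, 8},
        // a view with offset {1, 0} and extent {2, 8} is linear (two full rows form one
        // contiguous run of 16 elements), whereas a view with offset {1, 2} and extent
        // {2, 4} is not, since each row contributes a separate 4-element run.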

        inline bool _Overlaps(const _View_shape* _Other) const
        {
            if (_Compare_base_shape(_Other))
            {
                // If the base shapes are identical we will do the N-dimensional
                // bounding box overlap test
                
                for (size_t _I = 0; _I < _M_rank; ++_I)
                {
                    if (!_Intervals_overlap(_M_view_offset[_I], _M_view_offset[_I] + _M_view_extent[_I] - 1,
                                            _Other->_M_view_offset[_I], _Other->_M_view_offset[_I] + _Other->_M_view_extent[_I] - 1)) 
                    {
                        return false;
                    }
                }

                return true;
            }
            else
            {
                // The base shapes are different. Check based on linear intervals
                size_t firstStart = _Get_view_linear_offset();
                size_t firstEnd = firstStart + _Get_view_size() - 1;

                size_t secondStart = _Other->_Get_view_linear_offset();
                size_t secondEnd = secondStart + _Other->_Get_view_size() - 1;

                return _Intervals_overlap(firstStart, firstEnd, secondStart, secondEnd);
            }
        }

        inline bool _Subsumes(const _View_shape* _Other) const
        {
            // Subsumption test can only be done for shapes that have the same base shape or 
            // when both have a rank of 1
            if ((_M_rank == 1) && (_Other->_Get_rank() == 1)) 
            {
                size_t thisStart = _Get_view_linear_offset();
                size_t thisEnd = thisStart + _Get_view_size() - 1;

                size_t otherStart = _Other->_Get_view_linear_offset();
                size_t otherEnd = otherStart + _Other->_Get_view_size() - 1;

                return ((otherStart >= thisStart) && (otherEnd <= thisEnd));
            }

            if (!_Compare_base_shape(_Other)) {
                return false;
            }

            if (!_Contains(_Other->_Get_view_offset())) {
                return false;
            }

            std::vector<unsigned int> otherEndPointIndex(_M_rank);
            for (size_t _I = 0; _I < _M_rank; ++_I) {
                otherEndPointIndex[_I] = _Other->_Get_view_offset()[_I] + _Other->_Get_view_extent()[_I] - 1;
            }

            return _Contains(otherEndPointIndex.data());
        }
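        // Example for _Overlaps / _Subsumes (illustrative): with a shared rank-1 base
        // extent of {100}, a view covering elements [10, 50) overlaps a view covering
        // [40, 60) but does not subsume it; it does subsume a view covering [20, 30).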

    private:
        // Private constructor to force construction through the _Create_view_shape method
        _View_shape(unsigned int _Rank, unsigned int _Linear_offset,
                    const unsigned int *_Base_extent, const unsigned int *_View_offset,
                    const unsigned int *_View_extent, const bool *_Projection_info);
        
        virtual ~_View_shape();

        // No default constructor or copy/assignment
        _View_shape();
        _View_shape(const _View_shape &_Other);
        _View_shape(_View_shape &&_Other);
        _View_shape& operator=(const _View_shape &_Other);
        _View_shape& operator=(_View_shape &&_Other);

        // Helper methods
        static bool _Intervals_overlap(size_t _First_start, size_t _First_end,
                                       size_t _Second_start, size_t _Second_end)
        {
            // Order the intervals by their start points
            if (_First_start > _Second_start) {
                size_t temp = _First_start;
                _First_start = _Second_start;
                _Second_start = temp;

                temp = _First_end;
                _First_end = _Second_end;
                _Second_end = temp;
            }

            // The start of the second one must be within the bounds of the first one
            return (_Second_start <= _First_end);
        }

        static unsigned int _Get_extent_size(unsigned int _Rank, const unsigned int *_Extent)
        {
            unsigned int totalExtent = 1;
            for (size_t _I = 0; _I < _Rank; ++_I) {
                totalExtent *= _Extent[_I];
            }

            return totalExtent;
        }

        inline bool _Is_valid() const
        {
            if (_M_rank == 0) {
                return false;
            }

            // Ensure the _M_view_offset + _M_view_extent is within the bounds of _M_base_extent
            size_t viewSize = 1;
            
            for (size_t _I = 0; _I < _M_rank; ++_I)
            {
                viewSize *= _M_view_extent[_I];
                if ((_M_view_offset[_I] + _M_view_extent[_I]) > _M_base_extent[_I]) {
                    return false;
                }
            }

            if (viewSize == 0) {
                return false;
            }

            return true;
        }

        inline bool _Compare_base_shape(const _View_shape* _Other) const
        {
            return ((_M_rank == _Other->_M_rank) &&
                    (_M_linear_offset == _Other->_M_linear_offset) &&
                    _Compare_extent(_M_rank, _M_base_extent, _Other->_M_base_extent));
        }

        // Checks if the element at the specified index
        // is contained within this view shape
        // Assumes the rank of the index is same as the 
        // rank of this view's shape
        inline bool _Contains(const unsigned int* _Element_index) const
        {
            for (size_t _I = 0; _I < _M_rank; ++_I)
            {
                if ((_Element_index[_I] < _M_view_offset[_I]) ||
                    (_Element_index[_I] >= (_M_view_offset[_I] + _M_view_extent[_I]))) 
                {
                    return false;
                }
            }

            return true;
        }

        inline unsigned int _Get_linear_offset(const unsigned int* _Element_index) const
        {
            unsigned int currMultiplier = 1;
            unsigned int linearOffset = _M_linear_offset;
            for (int _I = static_cast<int>(_M_rank - 1); _I >= 0; _I--)
            {
                linearOffset += (currMultiplier * _Element_index[_I]);
                currMultiplier *= _M_base_extent[_I];
            }

            return linearOffset;
        }
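        // Worked example for _Get_linear_offset (illustrative): for a base extent of
        // {4, 8} in row-major order and _M_linear_offset == 0, the element index {2, 3}
        // maps to linear offset 3 * 1 + 2 * 8 == 19.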

    private:       

        unsigned int _M_rank;
        unsigned int _M_linear_offset;
        unsigned int *_M_base_extent;
        unsigned int *_M_view_offset;
        unsigned int *_M_view_extent;
        bool *_M_projection_info;
    };

    // This function creates a new _View_shape object from an existing _View_shape object when the data underlying the view
    // needs to be reinterpreted to use a different element size than the one used by the original view.
    inline
    _Ret_ _View_shape *_Create_reinterpreted_shape(const _View_shape* _Source_shape, size_t _Curr_elem_size, size_t _New_elem_size)
    {
        unsigned int _Rank = _Source_shape->_Get_rank();
        size_t _LinearOffsetInBytes = _Source_shape->_Get_linear_offset() * _Curr_elem_size;
        size_t _BaseLSDExtentInBytes = (_Source_shape->_Get_base_extent())[_Rank - 1] * _Curr_elem_size;
        size_t _ViewLSDOffsetInBytes = (_Source_shape->_Get_view_offset())[_Rank - 1] * _Curr_elem_size;
        size_t _ViewLSDExtentInBytes = (_Source_shape->_Get_view_extent())[_Rank - 1] * _Curr_elem_size;

        _ASSERTE((_LinearOffsetInBytes % _New_elem_size) == 0);
        _ASSERTE((_BaseLSDExtentInBytes % _New_elem_size) == 0);
        _ASSERTE((_ViewLSDOffsetInBytes % _New_elem_size) == 0);
        _ASSERTE((_ViewLSDExtentInBytes % _New_elem_size) == 0);

        size_t _Temp_val = _LinearOffsetInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        unsigned int _New_linear_offset = static_cast<unsigned int>(_Temp_val);

        std::vector<unsigned int> _New_base_extent(_Rank);
        std::vector<unsigned int> _New_view_offset(_Rank);
        std::vector<unsigned int> _New_view_extent(_Rank);
        for (unsigned int i = 0; i < _Rank - 1; ++i) {
            _New_base_extent[i] = (_Source_shape->_Get_base_extent())[i];
            _New_view_offset[i] = (_Source_shape->_Get_view_offset())[i];
            _New_view_extent[i] = (_Source_shape->_Get_view_extent())[i];
        }

        // The extent in the least significant dimension needs to be adjusted
        _Temp_val = _BaseLSDExtentInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        _New_base_extent[_Rank - 1] = static_cast<unsigned int>(_Temp_val);

        _Temp_val = _ViewLSDOffsetInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        _New_view_offset[_Rank - 1] = static_cast<unsigned int>(_Temp_val);

        _Temp_val = _ViewLSDExtentInBytes / _New_elem_size;
        _ASSERTE(_Temp_val <= UINT_MAX);
        _New_view_extent[_Rank - 1] = static_cast<unsigned int>(_Temp_val);

        return _View_shape::_Create_view_shape(_Rank, _New_linear_offset, _New_base_extent.data(), _New_view_offset.data(), _New_view_extent.data());
    }
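    // Worked example (illustrative): reinterpreting a rank-1 view over 16 elements of a
    // 4-byte type (_Curr_elem_size == 4) as an 8-byte type (_New_elem_size == 8) halves
    // the linear offset, the view offset and the base/view extents in the least
    // significant dimension, e.g. a base extent of {16} becomes {8}.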

    inline _Access_mode _Get_synchronize_access_mode(access_type cpu_access_type)
    {
        switch(cpu_access_type)
        {
        case access_type_auto:
        case access_type_read:
            return _Read_access;
        case access_type_write:
            return _Write_access;
        case access_type_read_write:
            return _Read_write_access;
        case access_type_none:
        default:
            _ASSERTE(false);
            return _No_access;
        }
    }

    inline access_type _Get_cpu_access_type(_Access_mode _Cpu_access_mode)
    {
        access_type _Cpu_access_type = access_type_none;
        if (_Cpu_access_mode & _Read_access) {
            _Cpu_access_type = static_cast<access_type>(_Cpu_access_type | access_type_read);
        }

        if (_Cpu_access_mode & _Write_access) {
            _Cpu_access_type = static_cast<access_type>(_Cpu_access_type | access_type_write);
        }

        return _Cpu_access_type;
    }

    //  Class manages a raw buffer in an accelerator view
    class _Buffer : public _Reference_counter
    {
        friend class _CPU_accelerator_view_impl;
        friend class _D3D_accelerator_view_impl;
        friend class _D3D_temp_staging_cache;

    public:

        // Force construction through these static public methods to ensure that _Buffer
        // objects are allocated in the runtime

        // Allocate a new buffer on the specified accelerator_view
        _AMPIMP static _Ret_ _Buffer * __cdecl _Create_buffer(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view, size_t _Num_elems,
                                                              size_t _Elem_size, bool _Is_temp = false, access_type _Cpu_access_type = access_type_auto);

        // Create a buffer object from a pre-allocated storage on the specified accelerator_view. This can be thought
        // of as the accelerator_view "adopting" the passed data buffer.
        _AMPIMP static _Ret_ _Buffer * __cdecl _Create_buffer(_In_ void *_Data_ptr, accelerator_view _Accelerator_view, size_t _Num_elems,
                                                              size_t _Elem_size);

        // Create a staging buffer on the specified accelerator_view which can be accessed on the CPU upon mapping.
        _AMPIMP static _Ret_ _Buffer * __cdecl _Create_stage_buffer(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
                                                                    size_t _Num_elems, size_t _Elem_size, bool _Is_temp = false);

        // Creates a temp staging buffer of the requested size. This function may create 
        // a staging buffer smaller than the requested size.
        _AMPIMP static _Ret_ _Buffer * __cdecl _Get_temp_staging_buffer(accelerator_view _Av, size_t _Requested_num_elems, size_t _Elem_size);

        // Map a zero-copy or staging buffer for access on the CPU.
        _AMPIMP void _Map_buffer(_Access_mode _Map_type, bool _Wait);

        // Asynchronously map a zero-copy or staging buffer for access on the CPU.
        _AMPIMP _Event _Map_buffer_async(_Access_mode _Map_type);

        // Unmap a zero-copy or staging buffer denying CPU access
        _AMPIMP void _Unmap_buffer();

        //  Copy data to _Dest asynchronously.
        _AMPIMP _Event _Copy_to_async(_Out_ _Buffer * _Dest, size_t _Num_elems, size_t _Src_offset = 0, size_t _Dest_offset = 0);

        //  Copy data to _Dest asynchronously.
        _AMPIMP _Event _Copy_to_async(_Out_ _Buffer * _Dest, _View_shape_ptr _Src_shape, _View_shape_ptr _Dest_shape);

        _AMPIMP accelerator_view _Get_accelerator_view() const;
        _AMPIMP accelerator_view _Get_access_on_accelerator_view() const;

        _AMPIMP void _Register_view(_In_ _View_key _Key);
        _AMPIMP void _Unregister_view(_In_ _View_key _Key);

        // Return the raw data ptr - only an accelerator view implementation can interpret
        // this raw pointer. This method should usually not be used in the AMP header files.
        // _Get_host_ptr is the right way to access the host accessible ptr for a buffer.
        _Ret_ void * _Get_data_ptr() const
        {
            return _M_data_ptr;
        }

        // Returns the host accessible ptr corresponding to the buffer. This would
        // return NULL when the buffer is inaccessible on the CPU
        _Ret_ void * _Get_host_ptr() const
        {
            return _M_host_ptr;
        }

        size_t _Get_elem_size() const
        {
            return _M_elem_size;
        }

        size_t _Get_num_elems() const
        {
            return _M_num_elems;
        }

        _Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl() const
        {
            return _M_accelerator_view;
        }

        _Ret_ _Accelerator_view_impl* _Get_access_on_accelerator_view_impl() const
        {
            return _M_access_on_accelerator_view;
        }

        bool _Owns_data() const 
        {
            return _M_owns_data;
        }

        _AMPIMP bool _Exclusively_owns_data();

        bool _Is_staging() const
        {
            return _M_is_staging;
        }

        _Access_mode _Get_allowed_host_access_mode() const
        {
            return _M_allowed_host_access_mode;
        }

        access_type _Get_allowed_host_access_type() const
        {
            return _Get_cpu_access_type(_M_allowed_host_access_mode);
        }

        bool _Is_host_accessible(_Access_mode _Requested_access_mode) const
        {
            return ((_Get_allowed_host_access_mode() & _Requested_access_mode) == _Requested_access_mode);
        }

        _Access_mode _Get_current_host_access_mode() const
        {
            return _M_current_host_access_mode;
        }

        bool _Is_temp() const
        {
            return _M_is_temp;
        }

        bool _Is_adopted() const
        {
            // Is it adopted from interop?
            return _M_is_adopted;
        }

        bool _Is_buffer() const
        {
            return _M_is_buffer;
        }

        _AMPIMP bool _Is_mappable() const;

    protected:

        // The _Buffer constructor is protected to force construction through the static 
        // _Create_buffer method to ensure the object is allocated in the runtime
        _Buffer(_In_ _Accelerator_view_impl* _Av, _In_ void *_Buffer_data_ptr, _In_ void * _Host_ptr,
                _Access_mode _Allowed_host_access_mode, _Access_mode _Current_host_access_mode, size_t _Num_elems,
                size_t _Elem_size, bool _Owns_data, bool _Is_staging, bool _Is_temp, bool _Is_adopted);

        // protected destructor to force deletion through _Release
        virtual ~_Buffer();

        // No default constructor, copy constructor, or assignment operator
        _Buffer();
        _Buffer(const _Buffer &rhs);
        _Buffer &operator=(const _Buffer &rhs);

        void _Set_host_ptr(_In_ void *_Host_ptr, _Access_mode _Host_access_mode = _No_access)
        {
            _ASSERTE((_Host_ptr == NULL) || (_Host_access_mode != _No_access));

            _M_host_ptr = _Host_ptr;
            if (_Host_ptr == NULL) {
                _M_current_host_access_mode = _No_access;
            }
            else {
                _M_current_host_access_mode = _Host_access_mode;
            }
        }

        void _Set_data_ptr(_In_ IUnknown *_Data_ptr)
        {
            _M_data_ptr = _Data_ptr;
        }

    protected:
        _Accelerator_view_impl_ptr _M_accelerator_view;
        _Accelerator_view_impl_ptr _M_access_on_accelerator_view;
        void * _M_data_ptr;
        void * _M_host_ptr;
        _Access_mode   _M_allowed_host_access_mode;
        _Access_mode _M_current_host_access_mode;
        size_t _M_elem_size;
        size_t _M_num_elems;
        bool   _M_owns_data;
        bool   _M_is_staging;

        // Used to determine how to map the staging buffer after it is involved in a copy
        bool   _M_is_temp;

        bool   _M_is_adopted;
        bool   _M_is_buffer;
    private:
        // A set of view_keys to invalidate whenever the host ptr of a staging buffer is invalidated
        std::unique_ptr<std::unordered_set<_View_key>> _M_view_keys;
        Concurrency::critical_section _M_critical_section;
    };

    //  Class manages a texture in an accelerator view
    class _Texture : public _Buffer
    {
        friend class _CPU_accelerator_view_impl;
        friend class _D3D_accelerator_view_impl;
        friend class _D3D_temp_staging_cache;

    public:

        // Allocate a new texture on the specified accelerator_view
        _AMPIMP static _Ret_ _Texture * __cdecl _Create_texture(accelerator_view _Accelerator_view,
                                                                unsigned int _Rank,
                                                                size_t _Width, size_t _Height, size_t _Depth,
                                                                unsigned int _Mip_levels,
                                                                _Short_vector_base_type_id _Type_id,
                                                                unsigned int _Num_channels,
                                                                unsigned int _Bits_per_channel,
                                                                bool _Is_temp = false);

        // Create a texture object from a pre-allocated storage on the specified accelerator_view. This can be thought
        // of as the accelerator_view "adopting" the passed data buffer.  
        _AMPIMP static _Ret_ _Texture * __cdecl _Adopt_texture(unsigned int _Rank, _Texture_base_type_id _Id,
                                                                _In_ IUnknown *_Data_ptr, accelerator_view _Accelerator_view,
                                                                unsigned int _View_format);

        // Create a staging texture on the specified accelerator_view which can be accessed on the CPU upon mapping.
        _AMPIMP static _Ret_ _Texture * __cdecl _Create_stage_texture(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
                                                                      unsigned int _Rank,
                                                                      size_t _Width, size_t _Height, size_t _Depth,
                                                                      unsigned int _Mip_levels,
                                                                      unsigned int _Format,
                                                                      bool _Is_temp = false);

        // Create a staging texture on the specified accelerator_view which can be accessed on the CPU upon mapping.
        _AMPIMP static _Ret_ _Texture * __cdecl _Create_stage_texture(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
                                                                      unsigned int _Rank,
                                                                      size_t _Width, size_t _Height, size_t _Depth,
                                                                      unsigned int _Mip_levels,
                                                                      _Short_vector_base_type_id _Type_id,
                                                                      unsigned int _Num_channels,
                                                                      unsigned int _Bits_per_channel);

        // Creates a temp staging texture. This function may create 
        // a staging texture smaller than the requested size.
        _AMPIMP static _Ret_ _Texture * __cdecl _Get_temp_staging_texture(accelerator_view _Accelerator_view,
                                                                          unsigned int _Rank,
                                                                          size_t _Width, size_t _Height, size_t _Depth,
                                                                          unsigned int _Mip_levels,
                                                                          unsigned int _Format);

        // Constructs a new texture with the same properties as the given texture.
        _AMPIMP static _Ret_ _Texture * __cdecl _Clone_texture(const _Texture *_Src, const accelerator_view &_Accelerator_view, const accelerator_view &_Associated_av);

        //  Copy data to _Dest asynchronously for textures. The two textures must have been created with 
        //  compatible physical formats.
        _AMPIMP _Event _Copy_to_async(_Out_ _Texture * _Dest, const size_t *_Copy_extent,
                                      const size_t *_Src_offset, const size_t *_Dst_offset,
                                      unsigned int _Src_mipmap_level, unsigned int _Dst_mipmap_level);

        size_t _Get_width(unsigned int _Mip_offset = 0) const
        {
            return (_M_width >> _Mip_offset) ? (_M_width >> _Mip_offset) : 1U;
        }

        size_t _Get_height(unsigned int _Mip_offset = 0) const
        {
            return (_M_height >> _Mip_offset) ? (_M_height >> _Mip_offset) : 1U;
        }

        size_t _Get_depth(unsigned int _Mip_offset = 0) const
        {
            return (_M_depth >> _Mip_offset) ? (_M_depth >> _Mip_offset) : 1U;
        }

        unsigned int _Get_rank() const
        {
            return _M_rank;
        }

        unsigned int _Get_texture_format() const
        {
            return _M_texture_format;
        }

        unsigned int _Get_view_format() const
        {
            return _M_view_format;
        }

        unsigned int _Get_num_channels() const
        {
            return _M_num_channels;
        }

        unsigned int _Get_bits_per_channel() const
        {
            // For texture adopted from interop, return 0.
            return _Is_adopted() ? 0 : _M_bits_per_channel;
        }

        unsigned int _Get_bits_per_element() const
        {
            return _M_bits_per_channel * _M_num_channels;
        }

        unsigned int _Get_data_length(unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels, const size_t *_Extents = nullptr) const  // in bytes
        {
            _ASSERTE(_View_mipmap_levels);
        
            unsigned long long _Bits_per_byte = 8ULL;
            unsigned long long _Total_bytes = 0ULL;
            
            unsigned int _Mip_level = _Most_detailed_mipmap_level;

            // Sum up data length (in bytes) of all mipmap levels in the view
            for (unsigned int _Mip_offset=0; _Mip_offset < _View_mipmap_levels; ++_Mip_offset)
            {
                unsigned long long _Width = 1ULL;
                unsigned long long _Height = 1ULL;
                unsigned long long _Depth = 1ULL;

                if (_Extents)
                {
                    switch (_M_rank)
                    {
                    case 3:
                        _Depth = (_Extents[2] >> _Mip_level) ? (_Extents[2] >> _Mip_level) : 1U;
                        // deliberately fall thru
                    case 2:
                        _Height = (_Extents[1] >> _Mip_level) ? (_Extents[1] >> _Mip_level) : 1U;
                        // deliberately fall thru
                    case 1:
                        _Width = (_Extents[0] >> _Mip_level) ? (_Extents[0] >> _Mip_level) : 1U;
                        break;
                    default:
                        _ASSERTE(false); // textures are only rank 1-3
                    }
                }
                else
                {
                    _Width = _Get_width(_Mip_level);
                    _Height = _Get_height(_Mip_level);
                    _Depth = _Get_depth(_Mip_level);
                }

                // Note _Get_bits_per_element() can be smaller than 8
                // Use unsigned long long to avoid integer overflow
                _Total_bytes += ((_Width * _Height * _Depth * static_cast<unsigned long long>(_Get_bits_per_element())) + _Bits_per_byte - 1) / _Bits_per_byte;

                _Mip_level++;
            }

            return static_cast<unsigned int>(_Total_bytes);
        }

        unsigned int _Get_mip_levels() const
        {
            return _M_mip_levels;
        }

        size_t _Get_row_pitch() const
        {
            return _M_row_pitch;
        }

        void _Set_row_pitch(size_t _Val)
        {
            _M_row_pitch = _Val;
        }

        size_t _Get_depth_pitch() const
        {
            return _M_depth_pitch;
        }

        void _Set_depth_pitch(size_t _Val)
        {
            _M_depth_pitch = _Val;
        }

    private:

        // The _Texture constructor is private to force construction through the static 
        // _Create_texture method to ensure the object is allocated in the runtime
        _Texture(_In_ _Accelerator_view_impl* _Av, _In_ void *_Texture_data_ptr, _In_ void * _Host_ptr,
                 _Access_mode _Allowed_host_access_mode, _Access_mode _Current_host_access_mode,
                 unsigned int _Rank,
                 size_t _Width, size_t _Height, size_t _Depth,
                 unsigned int _Mip_levels,
                 unsigned int _Texture_format,
                 unsigned int _View_format,
                 unsigned int _Num_channels,
                 unsigned int _Bits_per_channel,
                 bool _Owns_data, bool _Is_staging, bool _Is_temp, bool _Is_adopted);

        // Private destructor to force deletion through _Release
        ~_Texture();

        // No default constructor, copy constructor, or assignment operator
        _Texture();
        _Texture(const _Texture &rhs);
        _Texture &operator=(const _Texture &rhs);

        // Texture only
        unsigned int _M_rank;
        size_t _M_width;
        size_t _M_height;
        size_t _M_depth;
        unsigned int _M_texture_format;
        unsigned int _M_view_format;
        unsigned int _M_bits_per_channel;
        unsigned int _M_num_channels;
        unsigned int _M_mip_levels;

        size_t _M_row_pitch;
        size_t _M_depth_pitch;
    };

    class _Sampler : public _Reference_counter
    {
    public:
        // Create a new sampler with configurations exposed by C++ AMP.
        _AMPIMP static _Ret_ _Sampler * __cdecl _Create(
            unsigned int _Filter_mode,
            unsigned int _Address_mode,
            float _Border_r,
            float _Border_g,
            float _Border_b,
            float _Border_a);

        // Create a sampler object given an adopted opaque data pointer
        _AMPIMP static _Ret_ _Sampler * __cdecl _Create(_In_ void *_Data_ptr);

        // Return the raw data ptr - only an accelerator view implementation can interpret
        // this raw pointer. This method should usually not be used in the AMP header files
        _Ret_ void * _Get_data_ptr() const
        {
            return _M_data_ptr;
        }

        bool _Is_adopted() const
        {
            // Is it adopted from interop?
            return _M_is_adopted;
        }

        unsigned int _Get_filter_mode() const
        {
            return _M_filter_mode;
        }

        unsigned int _Get_address_mode() const
        {
            return _M_address_mode;
        }

        const float* _Get_border_color() const
        {
            return &_M_border_color[0];
        }

    private:
        // The _Sampler constructor is private to force construction through the static 
        // _Create method to ensure the object is allocated in the runtime
        _Sampler(unsigned int _Filter_mode, unsigned int _Address_mode, float _Border_r, float _Border_g, float _Border_b, float _Border_a);

        _Sampler(_In_ void *_Data_ptr);

        // Private destructor to force deletion through _Release
        ~_Sampler();

        // No default constructor, copy constructor, or assignment operator
        _Sampler();
        _Sampler(const _Sampler &rhs);
        _Sampler &operator=(const _Sampler &rhs);

        void * _M_data_ptr;
        bool _M_is_adopted;
        unsigned int _M_filter_mode;
        unsigned int _M_address_mode;
        float _M_border_color[4];
    };

    // Forward declaration for copy helper functions
    _AMPIMP _Event __cdecl _Copy_impl(_In_ _Buffer *_Src, size_t _Src_offset,
                                      _Out_ _Buffer * _Dst, size_t _Dest_offset,
                                      size_t _Num_elems, size_t _Preferred_copy_chunk_num_elems = 0);

    _AMPIMP _Event __cdecl _Copy_async_impl(_In_ _Texture *_Src_tex, const size_t *_Src_offset, unsigned int _Src_mipmap_level,
                                            _Out_ _Texture *_Dst_tex, const size_t *_Dst_offset, unsigned int _Dst_mipmap_level,
                                            const size_t *_Copy_extent, const size_t *_Preferred_copy_chunk_extent = NULL);

    inline bool _Get_chunked_staging_texture(_In_ _Texture* _Tex, const size_t *_Copy_chunk_extent, _Inout_ size_t *_Remaining_copy_extent, _Out_ size_t *_Curr_copy_extent, _Out_ _Texture_ptr *_Staging_texture)
    {
        bool _Truncated_copy = false;
        size_t _Allocation_extent[3] = { _Copy_chunk_extent[0], _Copy_chunk_extent[1], _Copy_chunk_extent[2] };
        
        unsigned int _Most_sig_idx = _Tex->_Get_rank() - 1;

        if (_Allocation_extent[_Most_sig_idx] > _Remaining_copy_extent[_Most_sig_idx]) {
            _Allocation_extent[_Most_sig_idx] = _Remaining_copy_extent[_Most_sig_idx];
        }

        _Texture_ptr _Stage = _Texture::_Get_temp_staging_texture(_Tex->_Get_accelerator_view(), _Tex->_Get_rank(),
            _Allocation_extent[0], _Allocation_extent[1], _Allocation_extent[2],
            /*_Mip_levels=*/1, _Tex->_Get_texture_format());

        std::copy(&_Allocation_extent[0], &_Allocation_extent[3], stdext::make_unchecked_array_iterator(&_Curr_copy_extent[0]));
        size_t _Staging_tex_extent[3] = {_Stage->_Get_width(), _Stage->_Get_height(), _Stage->_Get_depth()};
        if (_Curr_copy_extent[_Most_sig_idx] > _Staging_tex_extent[_Most_sig_idx]) {
            _Curr_copy_extent[_Most_sig_idx] = _Staging_tex_extent[_Most_sig_idx];
        }

        // The truncation can however happen only in the most significant dimension and lower
        // dimensions should not get truncated
        if (_Curr_copy_extent[_Most_sig_idx] < _Remaining_copy_extent[_Most_sig_idx]) 
        {
            _Remaining_copy_extent[_Most_sig_idx] -= _Curr_copy_extent[_Most_sig_idx];
            _Truncated_copy = true;
        }

        for (unsigned int _I = 0; _I < _Most_sig_idx; _I++)
        {
            _ASSERTE(_Curr_copy_extent[_I] == _Remaining_copy_extent[_I]);
        }

        *_Staging_texture = _Stage;
        return _Truncated_copy;
    }
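    // Worked example (illustrative): for a rank-3 remaining copy extent of {64, 64, 64}
    // (width, height, depth) and a copy chunk extent of {64, 64, 16}, one call produces
    // a staging texture and a current copy extent of {64, 64, 16} (assuming the runtime
    // does not further shrink the staging texture), reduces the remaining depth to 48,
    // and returns true to indicate that further chunks are still required.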

    #pragma warning ( push )
    #pragma warning ( disable : 6101 )
    // Suppress "warning C6101: Returning uninitialized memory '*_Dst': A successful
    // path through the function does not set the named _Out_ parameter."
    // The callers of _Copy_data_on_host all have a static_assert that _Rank must be 1, 2, or 3 for textures.
    //
    template <typename _Input_iterator, typename _Value_type>
    inline void _Copy_data_on_host(int _Rank, _Input_iterator _Src, _Out_ _Value_type *_Dst,
                                   size_t _Width, size_t _Height, size_t _Depth,
                                   size_t _Dst_row_pitch_in_bytes, size_t _Dst_depth_pitch_in_bytes,
                                   size_t _Src_row_pitch, size_t _Src_depth_pitch)
    {
        switch(_Rank)
        {
        case 1:
            {
                _Input_iterator _End = _Src;
                std::advance(_End, _Width);
                std::copy(_Src, _End, stdext::make_unchecked_array_iterator(_Dst));
            }
            break;
        case 2:
            {
                unsigned char *_Dst_ptr = reinterpret_cast<unsigned char *>(_Dst);
                _Input_iterator _Src_start = _Src;
                for (size_t _I = 0; _I < _Height; _I++)
                {
                    _Input_iterator _Src_end = _Src_start;
                    std::advance(_Src_end, _Width);

                    std::copy(_Src_start, _Src_end, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_Dst_ptr)));

                    _Dst_ptr += _Dst_row_pitch_in_bytes;
                    std::advance(_Src_start, _Src_row_pitch);
                }
            }
            break;
        case 3:
            {
                unsigned char *_Dst_ptr_slice_start = reinterpret_cast<unsigned char *>(_Dst);
                _Input_iterator _Src_depth_slice_start = _Src;
                for (size_t _I = 0; _I < _Depth; _I++)
                {
                    _Input_iterator _Src_start = _Src_depth_slice_start;
                    unsigned char *_Dst_ptr = _Dst_ptr_slice_start;

                    for (size_t _J = 0; _J < _Height; _J++)
                    {
                        _Input_iterator _Src_end = _Src_start;
                        std::advance(_Src_end, _Width);

                        std::copy(_Src_start, _Src_end, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_Dst_ptr)));

                        _Dst_ptr += _Dst_row_pitch_in_bytes;
                        std::advance(_Src_start, _Src_row_pitch);
                    }

                    _Dst_ptr_slice_start += _Dst_depth_pitch_in_bytes;
                    std::advance(_Src_depth_slice_start, _Src_depth_pitch);
                }
            }
            break;
        default:
            _ASSERTE(FALSE);
            break;
        }
    }
    #pragma warning ( pop ) // disable : 6101

    template <typename _Output_iterator, typename _Value_type>
    inline void _Copy_data_on_host(int _Rank, const _Value_type * _Src, _Output_iterator _Dst,
                                   size_t _Width, size_t _Height, size_t _Depth,
                                   size_t _Src_row_pitch_in_bytes, size_t _Src_depth_pitch_in_bytes,
                                   size_t _Dst_row_pitch, size_t _Dst_depth_pitch)
    {
        switch(_Rank)
        {
        case 1:
            {
                const _Value_type * _End = _Src + _Width;
                std::copy(stdext::make_unchecked_array_iterator(_Src), stdext::make_unchecked_array_iterator(_End), _Dst);
            }
            break;
        case 2:
            {
                const unsigned char *_Src_ptr = reinterpret_cast<const unsigned char *>(_Src);
                _Output_iterator _Dst_iter = _Dst;
                for (size_t _I = 0; _I < _Height; _I++)
                {
                    const _Value_type * _Src_end = reinterpret_cast<const _Value_type*>(_Src_ptr) + _Width;

                    std::copy(stdext::make_unchecked_array_iterator(reinterpret_cast<const _Value_type*>(_Src_ptr)), stdext::make_unchecked_array_iterator(_Src_end), _Dst_iter);
                    std::advance(_Dst_iter, _Dst_row_pitch);
                    _Src_ptr += _Src_row_pitch_in_bytes;
                }
            }
            break;
        case 3:
            {
                const unsigned char *_Src_ptr_slice_start = reinterpret_cast<const unsigned char *>(_Src);
                _Output_iterator _Dst_depth_slice_start = _Dst;
                for (size_t _I = 0; _I < _Depth; _I++)
                {
                    _Output_iterator _Dst_iter = _Dst_depth_slice_start;
                    const unsigned char *_Src_ptr = _Src_ptr_slice_start;

                    for (size_t _J = 0; _J < _Height; _J++)
                    {
                        const _Value_type * _Src_end = reinterpret_cast<const _Value_type *>(_Src_ptr) + _Width;
                        
                        std::copy(stdext::make_unchecked_array_iterator(reinterpret_cast<const _Value_type*>(_Src_ptr)), stdext::make_unchecked_array_iterator(_Src_end), _Dst_iter);

                        std::advance(_Dst_iter, _Dst_row_pitch);
                        _Src_ptr += _Src_row_pitch_in_bytes;
                    }

                    _Src_ptr_slice_start += _Src_depth_pitch_in_bytes;
                    std::advance(_Dst_depth_slice_start, _Dst_depth_pitch);
                }
            }
            break;
        default:
            _ASSERTE(FALSE);
            break;
        }
    }
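    // Example for _Copy_data_on_host (illustrative): in both overloads the iterator side
    // advances by a row pitch measured in elements, while the raw-pointer side advances by
    // a row pitch measured in bytes. For a rank-2 copy of 4 x 3 floats that is, e.g.,
    // 4 elements per row vs. 32 bytes per row when the pitched rows are padded, of which
    // only 4 * sizeof(float) == 16 bytes carry element data.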

    _AMPIMP size_t __cdecl _Get_preferred_copy_chunk_size(size_t _Total_copy_size_in_bytes);

    inline size_t _Get_preferred_copy_chunk_num_elems(size_t _Total_num_elems, size_t _Elem_size)
    {
        size_t preferredChunkSize = _Get_preferred_copy_chunk_size(_Total_num_elems * _Elem_size);

        return (preferredChunkSize / _Elem_size);
    }

    inline void _Get_preferred_copy_chunk_extent(unsigned int _Rank, size_t _Width, size_t _Height,
                                                 size_t _Depth, size_t _Bits_per_element, _Out_writes_(3) size_t *_Preferred_copy_chunk_extent)
    {
        _ASSERTE(_Preferred_copy_chunk_extent != nullptr);

        size_t requestedByteSize = static_cast<size_t>((static_cast<unsigned long long>(_Width) *
                                                        static_cast<unsigned long long>(_Height) *
                                                        static_cast<unsigned long long>(_Depth) *
                                                        static_cast<unsigned long long>(_Bits_per_element)) >> 3);

        size_t preferredChunkSize = _Get_preferred_copy_chunk_size(requestedByteSize);

        // Let's align the allocation size to the element size of the texture
        size_t preferredCopyChunkNumElems = static_cast<size_t>((static_cast<unsigned long long>(preferredChunkSize) * 8U) / _Bits_per_element);

        // Let's truncate the dimensions of the requested staging texture.
        // We only truncate in the most significant dimension
        switch (_Rank)
        {
        case 1:
            _Width = preferredCopyChunkNumElems;
            break;
        case 2:
            _Height = (preferredCopyChunkNumElems + _Width - 1) / _Width;
            break;
        case 3:
            _Depth = (preferredCopyChunkNumElems + (_Height * _Width) - 1) / (_Height * _Width);
            break;
        default:
            _ASSERTE(false);
        }

        _Preferred_copy_chunk_extent[0] = _Width;
        _Preferred_copy_chunk_extent[1] = _Height;
        _Preferred_copy_chunk_extent[2] = _Depth;
    }
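    // Worked example (illustrative): for a rank-2 request of 1024 x 1024 elements at
    // 32 bits per element (with _Depth passed as 1), if the runtime's preferred chunk
    // size corresponds to 262144 elements, the extent is truncated in the most
    // significant dimension and the function writes {1024, 256, 1} (width, height,
    // depth) to the output array.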

    // Finds the greatest common divisor of 2 unsigned integral numbers using Euclid's algorithm
    template <typename _T>
    inline _T _Greatest_common_divisor(_T _M, _T _N)
    {
        static_assert(std::is_unsigned<_T>::value, "This GCD function only supports unsigned integral types");

        _ASSERTE((_M > 0) && (_N > 0));

        if (_N > _M) {
            std::swap(_N , _M);
        }

        _T _Temp;
        while (_N > 0)
        {
            _Temp = _N;
            _N = _M % _N;
            _M = _Temp;
        }

        return _M;
    }

    // Finds the least common multiple of 2 unsigned integral numbers using their greatest_common_divisor
    template <typename _T>
    inline _T _Least_common_multiple(_T _M, _T _N)
    {
        static_assert(std::is_unsigned<_T>::value, "This LCM function only supports unsigned integral types");

        _ASSERTE((_M > 0) && (_N > 0));

        _T _Gcd = _Greatest_common_divisor(_M, _N);
        return ((_M / _Gcd) * _N);
    }
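    // Example (illustrative): _Least_common_multiple(6u, 4u) == 12. The copy helpers
    // below use the LCM of the buffer's element size and sizeof(_Value_type) so that
    // every chunk boundary falls on a whole element in both representations.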

    template <typename InputIterator, typename _Value_type>
    inline _Event _Copy_impl(InputIterator _SrcFirst, InputIterator _SrcLast, size_t _NumElemsToCopy,
                             _Out_ _Buffer * _Dst, size_t _Dest_offset, size_t _Preferred_copy_chunk_num_elems = 0)
    {
        if (_NumElemsToCopy == 0) {
            return _Event();
        }

        if (_Dst == NULL) {
            throw runtime_exception("Failed to copy to buffer.", E_INVALIDARG);
        }

#pragma warning ( push )
#pragma warning ( disable : 6001 ) // Using uninitialized memory '*_Dst'
        if (((_NumElemsToCopy * sizeof(_Value_type)) + (_Dest_offset * _Dst->_Get_elem_size())) > (_Dst->_Get_num_elems() * _Dst->_Get_elem_size()))
        {
            throw runtime_exception("Invalid _Src argument(s). _Src size exceeds total size of the _Dest.", E_INVALIDARG);
        }
#pragma warning ( pop )

        _ASSERTE(_NumElemsToCopy == (size_t)(std::distance(_SrcFirst, _SrcLast)));

        // If the dest is host accessible for write then we do the copy on
        // accelerator(accelerator::cpu_accelerator).default_view
        if (_Dst->_Is_host_accessible(_Write_access)) 
        {
            // Let's first map the _Dst buffer
            _Event _Ev = _Dst->_Map_buffer_async(_Write_access);

            // The _Dest is accessible on host. We just need to do a std::copy using a raw pointer as OutputIterator
            _Buffer_ptr _PDestBuf = _Dst;
            _Ev = _Ev._Add_continuation(std::function<_Event()>([_PDestBuf,_Dest_offset, _SrcFirst, _SrcLast]() mutable -> _Event 
            {
                _Value_type *_DestPtr = reinterpret_cast<_Value_type*>(reinterpret_cast<char*>(_PDestBuf->_Get_host_ptr()) + (_Dest_offset * _PDestBuf->_Get_elem_size()));
                std::copy(_SrcFirst, _SrcLast, stdext::make_unchecked_array_iterator(_DestPtr));

                return _Event();
            }));
            
            return _Ev;
        }
        else
        {
            // _Dest is on a device. Let's create a temp staging buffer on the _Dest accelerator_view and copy the input over
            // We may create a staging buffer of size smaller than the copy size and in that case we will perform the copy
            // as a series of smaller copies
            _Buffer_ptr _PDestBuf = _Dst;
            size_t _NumElemsToCopyRemaining = _NumElemsToCopy;
            size_t _PreferredNumElemsToCopyPerChunk = _Preferred_copy_chunk_num_elems;
            if (_PreferredNumElemsToCopyPerChunk == 0) {
                // If a preferred copy chunk size was not specified, let's pick one based on the
                // size of the copy
                _PreferredNumElemsToCopyPerChunk = _Get_preferred_copy_chunk_num_elems(_NumElemsToCopy, sizeof(_Value_type));
            }
            size_t _CurrDstOffset = _Dest_offset;
            InputIterator _CurrStartIter = _SrcFirst;
            _Event _Ev;

            size_t _Lcm = _Least_common_multiple(_Dst->_Get_elem_size(), sizeof(_Value_type));
            size_t _AdjustmentRatio = _Lcm / sizeof(_Value_type);

            do
            {
                size_t _AllocationNumElems = _PreferredNumElemsToCopyPerChunk;
                if (_NumElemsToCopyRemaining < _AllocationNumElems) {
                    _AllocationNumElems = _NumElemsToCopyRemaining;
                }

                _Buffer_ptr _PDestStagingBuf = _Buffer::_Get_temp_staging_buffer(_Dst->_Get_accelerator_view(),
                                                                                 _AllocationNumElems, sizeof(_Value_type));

                _ASSERTE(_PDestStagingBuf != NULL);
                _ASSERTE(_PDestStagingBuf->_Get_elem_size() == sizeof(_Value_type));

                InputIterator _CurrEndIter = _CurrStartIter;
                size_t _CurrNumElemsToCopy = _AllocationNumElems;
                if (_CurrNumElemsToCopy > _PDestStagingBuf->_Get_num_elems()) {
                    _CurrNumElemsToCopy = _PDestStagingBuf->_Get_num_elems();
                }

                if (_NumElemsToCopyRemaining <= _CurrNumElemsToCopy) {
                    _CurrNumElemsToCopy = _NumElemsToCopyRemaining;
                    _CurrEndIter = _SrcLast;
                }
                else
                {
                    // We need to adjust the _CurrNumElemsToCopy to be a multiple of the 
                    // least common multiple of the destination buffer's element size and sizeof(_Value_type).
                    _CurrNumElemsToCopy = (_CurrNumElemsToCopy / _AdjustmentRatio) * _AdjustmentRatio;
                    std::advance(_CurrEndIter, _CurrNumElemsToCopy);
                }

                _ASSERTE((_CurrNumElemsToCopy % _AdjustmentRatio) == 0);

                // This would not actually block since we just created this staging buffer or are using
                // a cached one that is not in use
                _PDestStagingBuf->_Map_buffer(_Write_access, true /* _Wait */);

                // Copy from input to the staging using a raw pointer as OutputIterator
                std::copy(_CurrStartIter, _CurrEndIter, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_PDestStagingBuf->_Get_host_ptr())));

                _Ev = _Ev._Add_event(_PDestStagingBuf->_Copy_to_async(_PDestBuf, _CurrNumElemsToCopy, 0, _CurrDstOffset));

                // Adjust the iterators and offsets
                _NumElemsToCopyRemaining -= _CurrNumElemsToCopy;
                _CurrDstOffset += (_CurrNumElemsToCopy * sizeof(_Value_type)) / _Dst->_Get_elem_size();
                _CurrStartIter = _CurrEndIter;

            } while (_NumElemsToCopyRemaining != 0);

            return _Ev;
        }
    }
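    // Note on the chunked path above (illustrative): assuming the runtime grants staging
    // buffers of 262,144 elements and the destination element size equals sizeof(float),
    // copying 1,000,000 floats proceeds as four staging copies of 262,144, 262,144,
    // 262,144, and 213,568 elements; the returned _Event aggregates completion of all
    // the chunked device-side copies.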

    // std::advance is only guaranteed to work for iterators whose category derives from
    // input_iterator_tag. Hence we have a custom helper that forwards to std::advance for
    // such iterators and falls back to a loop-based advance for pure output iterators.
    template<typename _InputIterator, typename _Distance>
    typename std::enable_if<std::is_base_of<std::input_iterator_tag, typename std::iterator_traits<_InputIterator>::iterator_category>::value>::type
    _Advance_output_iterator(_InputIterator &_Iter, _Distance _N)
    {
        std::advance(_Iter, _N);
    }

    template<typename _OutputIterator, typename _Distance>
    typename std::enable_if<!std::is_base_of<std::input_iterator_tag, typename std::iterator_traits<_OutputIterator>::iterator_category>::value>::type
    _Advance_output_iterator(_OutputIterator &_Iter, size_t _N)
    {
        for (size_t i = 0; i < _N; ++i)
        {
            _Iter++;
        }
    }
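    // Example (illustrative): a std::back_insert_iterator reports output_iterator_tag,
    // which is not derived from input_iterator_tag, so it takes the loop-based overload;
    // a raw pointer or std::vector iterator forwards to std::advance instead.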

    template <typename OutputIterator, typename _Value_type>
    inline _Event _Copy_impl(_In_ _Buffer *_Src, size_t _Src_offset, size_t _Num_elems,
                             OutputIterator _DestIter, size_t _Preferred_copy_chunk_num_elems = 0)
    {
        if ((_Src == NULL) || ((_Src_offset + _Num_elems) > _Src->_Get_num_elems())) {
            throw runtime_exception("Failed to copy to buffer.", E_INVALIDARG);
        }

        if (_Num_elems == 0) {
            return _Event();
        }

        size_t _NumElemsToCopy = (_Num_elems * _Src->_Get_elem_size()) / sizeof(_Value_type);

        // If the src is host accessible for read, then we do the copy on
        // accelerator(accelerator::cpu_accelerator).default_view
        if (_Src->_Is_host_accessible(_Read_access)) 
        {
            // Map the _Src buffer
            _Event _Ev = _Src->_Map_buffer_async(_Read_access);

            // The _Src is accessible on host. We just need to do a std::copy using a raw pointer into the source as the input iterator
            _Buffer_ptr _PSrcBuf = _Src;
            _Ev = _Ev._Add_continuation(std::function<_Event()>([_PSrcBuf, _Src_offset, _DestIter, _NumElemsToCopy]() mutable -> _Event 
            {
                // The _Src is accessible on host. We just need to do a std::copy
                const _Value_type *_PFirst = reinterpret_cast<const _Value_type*>(reinterpret_cast<char*>(_PSrcBuf->_Get_host_ptr()) + (_Src_offset * _PSrcBuf->_Get_elem_size()));
                std::copy(_PFirst, _PFirst + _NumElemsToCopy, _DestIter);

                return _Event();
            }));
            
            return _Ev;
        }
        else
        {
            // The _Src is on the device. We need to copy it out to a temporary staging array
            // We may create a staging buffer of size smaller than the copy size and in that case we will
            // perform the copy as a series of smaller copies

            _Event _Ev;

            _Buffer_ptr _PSrcBuf = _Src;
            size_t _PreferredNumElemsToCopyPerChunk = _Preferred_copy_chunk_num_elems;
            if (_PreferredNumElemsToCopyPerChunk == 0) {
                // If a preferred copy chunk size was not specified, let's pick one based on the
                // size of the copy
                _PreferredNumElemsToCopyPerChunk = _Get_preferred_copy_chunk_num_elems(_NumElemsToCopy, sizeof(_Value_type));
            }

            size_t _AllocationNumElems = _PreferredNumElemsToCopyPerChunk;
            if (_NumElemsToCopy < _AllocationNumElems) {
                _AllocationNumElems = _NumElemsToCopy;
            }

            _Buffer_ptr _PSrcStagingBuf = _Buffer::_Get_temp_staging_buffer(_Src->_Get_accelerator_view(),
                                                                            _AllocationNumElems, sizeof(_Value_type));

            _ASSERTE(_PSrcStagingBuf != NULL);
            _ASSERTE(_PSrcStagingBuf->_Get_elem_size() == sizeof(_Value_type));

            // The total byte size of a copy chunk must be an integral multiple of both the
            // source buffer's element size and sizeof(_Value_type).
            size_t _Lcm = _Least_common_multiple(_Src->_Get_elem_size(), sizeof(_Value_type));
            size_t _AdjustmentRatio = _Lcm / sizeof(_Value_type);

            size_t _CurrNumElemsToCopy = _AllocationNumElems;
            if (_CurrNumElemsToCopy > _PSrcStagingBuf->_Get_num_elems()) {
                _CurrNumElemsToCopy = _PSrcStagingBuf->_Get_num_elems();
            }
            if (_NumElemsToCopy <= _CurrNumElemsToCopy)
            {
                _CurrNumElemsToCopy = _NumElemsToCopy;
            }
            else
            {
                // We need to adjust _CurrNumElemsToCopy to be a multiple of the
                // least common multiple of the source buffer's element size and sizeof(_Value_type).
                _CurrNumElemsToCopy = (_CurrNumElemsToCopy / _AdjustmentRatio) * _AdjustmentRatio;
            }

            _ASSERTE((_CurrNumElemsToCopy % _AdjustmentRatio) == 0);

            size_t _NumElemsToCopyRemaining = _NumElemsToCopy - _CurrNumElemsToCopy;

            _Ev = _PSrcBuf->_Copy_to_async(_PSrcStagingBuf, (_CurrNumElemsToCopy * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size(), _Src_offset, 0);

            if (_NumElemsToCopyRemaining != 0)
            {
                _Ev = _Ev._Add_continuation(std::function<_Event()>([_DestIter, _PSrcBuf, _PSrcStagingBuf,
                                                                     _CurrNumElemsToCopy, _NumElemsToCopyRemaining,
                                                                     _Src_offset, _PreferredNumElemsToCopyPerChunk]() mutable -> _Event 
                {
                    // Initiate an asynchronous copy of the remaining part so that it makes progress
                    // while we complete the copying of the first part
                    size_t _CurrSrcOffset = _Src_offset + ((_CurrNumElemsToCopy * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size());
                    OutputIterator _CurrDestIter = _DestIter;
                    _Advance_output_iterator<decltype(_CurrDestIter), size_t>(_CurrDestIter, _CurrNumElemsToCopy);
                    _Event _Ret_ev = _Copy_impl<OutputIterator, _Value_type>(_PSrcBuf._Get_ptr(), _CurrSrcOffset,
                                                                             (_NumElemsToCopyRemaining * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size(),
                                                                             _CurrDestIter, _PreferredNumElemsToCopyPerChunk);

                    // Now copy the data from staging buffer to the destination
                    _Value_type *_PFirst = reinterpret_cast<_Value_type*>(_PSrcStagingBuf->_Get_host_ptr());
                    std::copy(_PFirst, _PFirst + _CurrNumElemsToCopy, _DestIter);
                    return _Ret_ev;
                }));
            }
            else
            {
                _Ev = _Ev._Add_continuation(std::function<_Event()>([_DestIter, _PSrcStagingBuf, _CurrNumElemsToCopy]() mutable -> _Event 
                {
                    _Value_type *_PFirst = reinterpret_cast<_Value_type*>(_PSrcStagingBuf->_Get_host_ptr());
                    std::copy(_PFirst, _PFirst + _CurrNumElemsToCopy, _DestIter);
                    return _Event();
                }));
            }

            return _Ev;
        }
    }

    // Structured copy between buffers across accelerator_views
    _AMPIMP _Event __cdecl _Copy_impl(_In_ _Buffer *_Src, _View_shape_ptr _Src_shape, _Out_ _Buffer * _Dst, _View_shape_ptr _Dst_shape);

    struct _Array_copy_desc
    {
        _Array_copy_desc(
            const unsigned int _Rank,
            const unsigned int _Src_linear_offset,
            const unsigned int * _Src_extents, 
            const unsigned int * _Src_copy_offset,
            const unsigned int _Dst_linear_offset,
            const unsigned int * _Dst_extents, 
            const unsigned int * _Dst_copy_offset, 
            const unsigned int * _Copy_extents)
        {
            this->_Rank = _Rank;

            this->_Src_linear_offset = _Src_linear_offset;
            this->_Src_extents.assign( _Src_extents, _Src_extents + _Rank);
            this->_Src_copy_offset.assign( _Src_copy_offset, _Src_copy_offset + _Rank);
    
            this->_Dst_linear_offset = _Dst_linear_offset;
            this->_Dst_extents.assign( _Dst_extents, _Dst_extents + _Rank);
            this->_Dst_copy_offset.assign( _Dst_copy_offset, _Dst_copy_offset + _Rank);
    
            this->_Copy_extents.assign( _Copy_extents, _Copy_extents + _Rank);
        }

        _Array_copy_desc() {}
    
        unsigned int _Rank;

        // Shape of source
        unsigned int  _Src_linear_offset;
        std::vector<unsigned int> _Src_extents;
        std::vector<unsigned int> _Src_copy_offset;
        
        // Shape of destination
        unsigned int  _Dst_linear_offset;
        std::vector<unsigned int> _Dst_extents;
        std::vector<unsigned int> _Dst_copy_offset;

        // Shape of copy region
        std::vector<unsigned int> _Copy_extents;
    };
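
    // Illustrative sketch (hypothetical values, not part of the runtime interface): describing
    // a copy of a 2 x 3 sub-rectangle starting at offset (1, 2) of a 4 x 8 source into offset
    // (0, 0) of a 2 x 4 destination. The arrays below exist only to show how the constructor
    // arguments line up with the fields.
    //
    //     const unsigned int _Src_extents[]     = { 4, 8 };
    //     const unsigned int _Src_copy_offset[] = { 1, 2 };
    //     const unsigned int _Dst_extents[]     = { 2, 4 };
    //     const unsigned int _Dst_copy_offset[] = { 0, 0 };
    //     const unsigned int _Copy_extents[]    = { 2, 3 };
    //
    //     _Array_copy_desc _Desc(2 /* _Rank */,
    //                            0 /* _Src_linear_offset */, _Src_extents, _Src_copy_offset,
    //                            0 /* _Dst_linear_offset */, _Dst_extents, _Dst_copy_offset,
    //                            _Copy_extents);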

    // Declaration
    _AMPIMP HRESULT __cdecl _Recursive_array_copy(const _Array_copy_desc& _Desc, 
                                                  unsigned int _Native_copy_rank,
                                                  std::function<HRESULT(const _Array_copy_desc &_Reduced)> _Native_copy_func);
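
    // Illustrative sketch (hypothetical callback, internal use only): _Recursive_array_copy
    // reduces the copy described by _Desc to chunks of rank _Native_copy_rank and invokes
    // _Native_copy_func once per chunk, e.g. once per row for the 2-D descriptor sketched above.
    //
    //     HRESULT _Hr = _Recursive_array_copy(_Desc, 1,
    //         [](const _Array_copy_desc &_Reduced) -> HRESULT {
    //             // _Reduced describes a rank-1 span; copy _Reduced._Copy_extents[0] elements here
    //             return S_OK;
    //         });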

    _AMPIMP std::pair<accelerator_view, accelerator_view> __cdecl _Get_src_dest_accelerator_view(_In_opt_ const _Buffer_descriptor *_SrcBuffDescPtr,
                                                                                                 _In_opt_ const _Buffer_descriptor *_DestBuffDescPtr);

    // Iterator based copy function
    template<typename _InputIterator, typename _OutputIterator>
    inline _Event _Copy_impl_iter(_InputIterator _SrcFirst, _InputIterator _SrcLast, _OutputIterator _DstFirst)
    {
        std::copy(_SrcFirst, _SrcLast, _DstFirst);
        return _Event();
    }

    // Iterator based copy function
    template <typename InputIterator, typename _Value_type>
    inline _Event _Copy_impl(InputIterator _SrcFirst, _View_shape_ptr _Src_shape, _Inout_ _Buffer * _Dst, _View_shape_ptr _Dst_shape)
    {
        _ASSERTE(_Dst != NULL);
        _ASSERTE(_Src_shape != NULL);
        _ASSERTE(_Dst_shape != NULL);

        if (_Src_shape->_Is_projection()) {
            _Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
        }

        if (_Dst_shape->_Is_projection()) {
            _Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
        }

        _ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());

        _ASSERTE(_View_shape::_Compare_extent_with_elem_size(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(),
                                                             sizeof(_Value_type), _Dst_shape->_Get_view_extent(), _Dst->_Get_elem_size()));

        if (_Dst->_Is_host_accessible(_Write_access))
        {
            // The destination buffer is accessible on the host. Map the _Dst buffer
            _Event _Ev = _Dst->_Map_buffer_async(_Write_access);
            _Buffer_ptr _PDestBuf = _Dst;
            return _Ev._Add_continuation(std::function<_Event()>([_SrcFirst, _Src_shape, _PDestBuf, _Dst_shape]() mutable -> _Event {
                return _Copy_impl_iter(_SrcFirst, _Src_shape, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_PDestBuf->_Get_host_ptr())),
                                       _Create_reinterpreted_shape(_Dst_shape, _PDestBuf->_Get_elem_size(), sizeof(_Value_type)));
            }));
        }
        else
        {
            // The destination buffer is not accessible on the host. Create a temporary
            // staging buffer on the destination buffer's accelerator_view
            _Buffer_ptr _PTempStagingBuf = _Buffer::_Create_stage_buffer(_Dst->_Get_accelerator_view(), accelerator(accelerator::cpu_accelerator).default_view,
                                                                         _Src_shape->_Get_view_size(), sizeof(_Value_type), true /* _Is_temp */);

            _PTempStagingBuf->_Map_buffer(_Write_access, true /* _Wait */);
            _Value_type *_Dst_ptr = reinterpret_cast<_Value_type*>(_PTempStagingBuf->_Get_host_ptr());
            _Event _Ev = _Copy_impl_iter(_SrcFirst, _Src_shape, stdext::make_unchecked_array_iterator(_Dst_ptr), _Src_shape);

            // Now copy from the staging buffer to the destination buffer
            _Buffer_ptr _PDestBuf = _Dst;
            return _Ev._Add_continuation(std::function<_Event()>([_PTempStagingBuf, _Src_shape, _PDestBuf, _Dst_shape]() mutable -> _Event {
                return _Copy_impl(_PTempStagingBuf, _Src_shape, _PDestBuf, _Dst_shape);
            }));
        }
    }

    template <typename OutputIterator, typename _Value_type>
    inline _Event _Copy_impl(_In_ _Buffer *_Src, _View_shape_ptr _Src_shape, OutputIterator _DestIter, _View_shape_ptr _Dst_shape)
    {
        _ASSERTE(_Src != NULL);
        _ASSERTE(_Src_shape != NULL);
        _ASSERTE(_Dst_shape != NULL);

        if (_Src_shape->_Is_projection()) {
            _Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
        }

        if (_Dst_shape->_Is_projection()) {
            _Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
        }

        _ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());

        _ASSERTE(_View_shape::_Compare_extent_with_elem_size(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(),
                                                             _Src->_Get_elem_size(), _Dst_shape->_Get_view_extent(), sizeof(_Value_type)));

        if (_Src->_Is_host_accessible(_Read_access))
        {
            // The source buffer is accessible on the host. Map the _Src buffer
            _Event _Ev = _Src->_Map_buffer_async(_Read_access);

            _Buffer_ptr _PSrcBuf = _Src;
            return _Ev._Add_continuation(std::function<_Event()>([_PSrcBuf, _Src_shape, _DestIter, _Dst_shape]() mutable -> _Event {
                return _Copy_impl_iter(reinterpret_cast<_Value_type*>(_PSrcBuf->_Get_host_ptr()),
                                       _Create_reinterpreted_shape(_Src_shape, _PSrcBuf->_Get_elem_size(), sizeof(_Value_type)),
                                       _DestIter, _Dst_shape);
            }));
        }
        else
        {
            // The source buffer is not accessible on the host. Create a temporary
            // staging buffer on the source buffer's accelerator_view and initiate a copy
            // from the source buffer to the temporary staging buffer
            _Buffer_ptr _PTempStagingBuf = _Buffer::_Create_stage_buffer(_Src->_Get_accelerator_view(), accelerator(accelerator::cpu_accelerator).default_view,
                                                                         _Dst_shape->_Get_view_size(), sizeof(_Value_type), true);

            _Event _Ev = _Src->_Copy_to_async(_PTempStagingBuf, _Src_shape, _Dst_shape);
            return _Ev._Add_continuation(std::function<_Event()>([_PTempStagingBuf, _Dst_shape, _DestIter]() mutable -> _Event {
                return _Copy_impl_iter(reinterpret_cast<_Value_type*>(_PTempStagingBuf->_Get_host_ptr()),
                                       _Dst_shape, _DestIter, _Dst_shape);
            }));
        }
    }

    // Iterator based structured copy function
    template<typename _InputIterator, typename _OutputIterator>
    inline _Event _Copy_impl_iter(_InputIterator _SrcIter, _View_shape_ptr _Src_shape,
                                  _OutputIterator _DstIter, _View_shape_ptr _Dst_shape)
    {
        if (_Src_shape->_Is_projection()) {
            _Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
        }

        if (_Dst_shape->_Is_projection()) {
            _Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
        }

        _ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());
        _ASSERTE(_View_shape::_Compare_extent(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(), _Dst_shape->_Get_view_extent()));

        // If both the _Src_shape and _Dst_shape are linear we can be more efficient
        unsigned int _Src_linear_offset, _Src_linear_size, _Dst_linear_offset, _Dst_linear_size;
        if (_Src_shape->_Is_view_linear(_Src_linear_offset, _Src_linear_size) &&
            _Dst_shape->_Is_view_linear(_Dst_linear_offset, _Dst_linear_size))
        {
            _ASSERTE(_Src_linear_size == _Dst_linear_size);

            // These iterators might not be contiguous, therefore we use std::advance
            std::advance(_SrcIter, _Src_linear_offset);
            auto _SrcLast = _SrcIter;
            std::advance(_SrcLast, _Src_linear_size);
            std::advance(_DstIter, _Dst_linear_offset);

            return _Copy_impl_iter(_SrcIter, _SrcLast, _DstIter);
        }

        std::vector<unsigned int> _Src_extent(_Src_shape->_Get_rank());
        std::vector<unsigned int> _Src_offset(_Src_shape->_Get_rank());
        std::vector<unsigned int> _Dst_extent(_Dst_shape->_Get_rank());
        std::vector<unsigned int> _Dst_offset(_Dst_shape->_Get_rank());
        std::vector<unsigned int> _Copy_extent(_Src_shape->_Get_rank());

        for (size_t i = 0; i < _Src_shape->_Get_rank(); ++i) {
            _Src_extent[i] = _Src_shape->_Get_base_extent()[i];
            _Src_offset[i] = _Src_shape->_Get_view_offset()[i];
            _Dst_extent[i] = _Dst_shape->_Get_base_extent()[i];
            _Dst_offset[i] = _Dst_shape->_Get_view_offset()[i];
            _Copy_extent[i] = _Src_shape->_Get_view_extent()[i];
        }

        _Array_copy_desc _Desc(
            _Src_shape->_Get_rank(),
            _Src_shape->_Get_linear_offset(),
            _Src_extent.data(),
            _Src_offset.data(),
            _Dst_shape->_Get_linear_offset(),
            _Dst_extent.data(),
            _Dst_offset.data(),
            _Copy_extent.data());

        // Note: Capturing the shape pointers in the lambda would be incorrect; they are only
        // valid for setting up this call and might be deleted right after the call completes.
        HRESULT hr = _Recursive_array_copy(_Desc, 1, [_SrcIter, _DstIter](const _Array_copy_desc &_Reduced) -> HRESULT {

            auto _SrcFirst = _SrcIter;
            auto _DstFirst = _DstIter;
    
            std::advance(_DstFirst, _Reduced._Dst_linear_offset + _Reduced._Dst_copy_offset[0]);
            std::advance(_SrcFirst, _Reduced._Src_linear_offset + _Reduced._Src_copy_offset[0]);
            auto _SrcLast = _SrcFirst;
            std::advance(_SrcLast, _Reduced._Copy_extents[0]);

            std::copy(_SrcFirst, _SrcLast, _DstFirst);

            return S_OK;
        });

        if (FAILED(hr)) {
            throw Concurrency::runtime_exception("Failed to copy between buffers", E_FAIL);
        }

        return _Event();
    }

    // A ubiquitous buffer that provides access to the underlying data 
    // on any accelerator_view
    class _Ubiquitous_buffer : public _Reference_counter
    {
        friend _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);
        friend _AMPIMP accelerator_view __cdecl _Select_copy_src_accelerator_view(_In_ _View_key _Src_view_key, const accelerator_view &_Dest_accelerator_view);
        friend struct _DPC_call_handle;

    public:

        _AMPIMP static _Ret_ _Ubiquitous_buffer * __cdecl _Create_ubiquitous_buffer(size_t _Num_elems, size_t _Elem_size);

        _AMPIMP static _Ret_ _Ubiquitous_buffer * __cdecl _Create_ubiquitous_buffer(_Buffer_ptr _Master_buffer);

        // Register a new view on top of this _Ubiquitous_buffer
        _AMPIMP void _Register_view(_In_ _View_key _Key, accelerator_view _Cpu_av, _View_shape_ptr _Shape);

        // Register a copy of an existing view registered with this _Ubiquitous_buffer
        _AMPIMP void _Register_view_copy(_In_ _View_key _New_view_key, _In_ _View_key _Existing_view_key);

        // Unregister a view currently registered with this _Ubiquitous_buffer
        _AMPIMP void _Unregister_view(_In_ _View_key _Key);

        // Obtain a specified mode of access to the specified view on the specified target
        // accelerator_view. This method can also determine the amount of data copying expected
        // to happen as part of this _Get_access request without actually performing the copies
        // or state updates in the _Ubiquitous_buffer. This is used to report, through the
        // C++ AMP ETW events, the implicit data copies that happen when accessing array_views
        _AMPIMP _Event _Get_access_async(_In_ _View_key _Key, _Accelerator_view_impl_ptr _Av_view_impl_ptr,
                                         _Access_mode _Mode, _Buffer_ptr &_Buf_ptr,
                                         _Inout_opt_ ULONGLONG *_Sync_size = nullptr);

        // Discard the content underlying this view
        _AMPIMP void _Discard(_In_ _View_key _Key);

        // This method does not synchronize the copies. It should not be used for getting
        // data access, but only to query the underlying buffer's properties
        _AMPIMP _Buffer_ptr _Get_master_buffer() const;

        _AMPIMP accelerator_view _Get_master_accelerator_view() const;

        _AMPIMP _View_shape_ptr _Get_view_shape(_In_ _View_key _Key);

        _Ret_ _Accelerator_view_impl* _Get_master_accelerator_view_impl() const
        {
            return _M_master_av;
        }

        size_t _Get_master_buffer_elem_size() const
        {
            return _M_master_buffer_elem_size;
        }

        size_t _Get_master_buffer_num_elems() const
        {
            return _M_master_buffer_num_elems;
        }

        bool _Has_data_source() const
        {
            return _M_has_data_source;
        }

    private:

        // The _Ubiquitous_buffer constructors are private to force construction through the static 
        // _Create_ubiquitous_buffer method to ensure the object is allocated in the runtime
        _Ubiquitous_buffer(size_t _Num_elems, size_t _Elem_size);
        _Ubiquitous_buffer(_In_ _Buffer* _Master_buffer);

        // Private destructor to force deletion through _Release
        ~_Ubiquitous_buffer();

        // No default constructor, copy constructor, or assignment operator
        _Ubiquitous_buffer();
        _Ubiquitous_buffer(const _Ubiquitous_buffer &rhs);
        _Ubiquitous_buffer &operator=(const _Ubiquitous_buffer &rhs);
        
        // Helper methods

        // Get access to a buffer on a specified accelerator for a specified pre-registered view.
        // If the _Sync_size parameter is not null, the function also calculates the number of
        // bytes that need to be synchronized to obtain the desired access.
        _AMPIMP _Event _Get_access_async(_In_ _View_key _Key, accelerator_view _Av, _Access_mode _Mode,
                                         _Buffer_ptr &_Buf_ptr, _Inout_opt_ ULONGLONG *_Sync_size = NULL);

        // Commit a view to the master buffer if needed. When the _Sync_size parameter is non-null
        // this method just returns the amount of data to be copied as part of the commit, without 
        // actually performing the commit
        _Event _Commit_view_async(_In_ _View_info *_Info, _Inout_ ULONGLONG *_Sync_size = nullptr);

        // Get the _Buffer_ptr corresponding to a specified accelerator_view. When the 
        // _Create parameter is true, it creates a new _Buffer if one does not already exist
        // for that accelerator_view
        _Ret_ _Buffer* _Get_buffer(_In_ _Accelerator_view_impl* _Av, bool _Create = true);

        // Sets a new access mode for the specified view
        void _Set_new_access_mode(_Inout_ _View_info *_Info, _Access_mode _New_mode);

        // Unsets the discard flag from the specified view and all other 
        // overlapping views
        void _Unset_discard_flag(_Inout_ _View_info *_Info);

        // Determines whether the data underlying the specified view has been discarded
        // based on whether a subsuming view has the discard flag set.
        bool _Should_discard(const _View_info *_Info) const;

        // Determines whether this view has exclusive data, i.e. data that has not been
        // discarded, is not present on the master accelerator_view, and is not covered
        // by any other dirty view that subsumes this view

        // Based on the current state of overlapping views in the _Ubiquitous_buffer, determines
        // whether the specified view requires a data update on the target accelerator_view
        // to fulfill an access request
        bool _Requires_update_on_target_accelerator_view(const _View_info *_Info,
                                                         _Access_mode _Requested_mode,
                                                         _In_ _Accelerator_view_impl* _Target_acclerator_view) const;

        // This method iterates over all views in the specified commit list and flags a view
        // as "commit not needed" if it is subsumed by another view present in the commit list
        static void _Flag_redundant_commits(std::vector<std::pair<_View_info*, bool>> &_Commit_list);

        // This method returns the list of accelerator_views where the specified view already has
        // a valid cached copy of the data, so that getting read access there would not incur any
        // data movement. The _Can_access_anywhere output parameter indicates to the caller that
        // the specified view can be accessed on any accelerator_view without incurring any data
        // movement. This is true when there are no modified overlapping views that require
        // synchronization and the specified view has the discard_data flag set.
        // This method is used for determining the source accelerator_view for copy and p_f_e
        // operations involving array_views
        _Accelerator_view_unordered_set _Get_caching_info(_In_ _View_key _Key, _Out_opt_ bool *_Can_access_anywhere = NULL);

        _Accelerator_view_unordered_set _Get_caching_info_impl(_In_ _View_key _Key, _Out_opt_ bool *_Can_access_anywhere);

        _Ret_ _Accelerator_view_impl* _Determine_alternate_target_accelerator_view(_In_ _View_key _Key,
                                                                             _In_ _Accelerator_view_impl* _Original_av,
                                                                             _Access_mode _Mode);

    private:

        // Private data

        // The master accelerator_view for this _Ubiquitous_buffer
        // which is specified at construction time
        _Accelerator_view_impl_ptr _M_master_av;

        // The master _Buffer corresponding to this _Ubiquitous_buffer
        // which is specified at construction time
        _Buffer* _M_master_buffer;

        // The size of each element of the master buffer
        size_t _M_master_buffer_elem_size;

        // The number of elements in the master buffer
        size_t _M_master_buffer_num_elems;

        // Indicates if this ubiquitous buffer has an underlying data source
        bool _M_has_data_source;

        // A map of pre-created _Buffers corresponding to different 
        // accelerator_views where the _Ubiquitous_buffer has already been
        // accessed
        std::map<_Accelerator_view_impl_ptr, _Buffer_ptr> _M_buffer_map;
        
        // A mapping between all registered view keys in this _Ubiquitous_buffer
        // to their corresponding _View_info
        std::unordered_map<_View_key, _View_info*> _M_view_map;

        // Set of distinct views of this buffer. As multiple copies of the same
        // view may have been registered for this _Ubiquitous_buffer, this set
        // maintains the distinct views that actually matter for the caching
        // protocol. Also note that a view_info may no longer have any live registered
        // views, and hence be absent from _M_view_map, yet still exist here because it
        // has uncommitted data which needs to be considered as part of the cache
        // coherence protocol to prevent modifications underlying this view from being lost
        std::unordered_set<_View_info*> _M_view_info_set;

        // Critical section object to protect the cache directory
        Concurrency::critical_section _M_critical_section;
    };
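
    // Illustrative sketch of the intended life-cycle of a view on a _Ubiquitous_buffer
    // (internal runtime usage; _Key, _Cpu_av, _Av and _Shape stand for values produced
    // elsewhere in the runtime and are hypothetical placeholders here):
    //
    //     _Ubiquitous_buffer *_PUBuf = _Ubiquitous_buffer::_Create_ubiquitous_buffer(_Num_elems, _Elem_size);
    //     _PUBuf->_Register_view(_Key, _Cpu_av, _Shape);   // make the view known to the cache directory
    //
    //     _Buffer_ptr _PBuf;
    //     _Event _Ev = _Get_access_async(_Key, _Av, _Read_write_access, _PBuf);
    //     _Ev._Get();                                      // _PBuf now holds valid data on _Av
    //
    //     _PUBuf->_Unregister_view(_Key);                  // release the view when it goes away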

    // Class defines functions for interoperability with D3D
    class _D3D_interop
    {
    public:
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_buffer(_In_ _Buffer *_Buffer_ptr);
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_texture(_In_ _Texture *_Texture_ptr);
        _AMPIMP static _Ret_ void * __cdecl _Get_D3D_sampler_data_ptr(_In_ IUnknown *_D3D_sampler);
        _AMPIMP static void __cdecl _Release_D3D_sampler_data_ptr(_In_ void *_Sampler_data_ptr);
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_sampler(const Concurrency::accelerator_view &_Av, _In_ _Sampler *_Sampler_ptr);
    };

    inline
    _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr)
    {
        return _Key->_Get_buffer_ptr()->_Get_access_async(_Key->_Get_view_key(), _Av, _Mode, _Buf_ptr);
    }

    inline
    _Ret_ _View_shape* _Get_buffer_view_shape(const _Buffer_descriptor& _Descriptor)
    {
        return _Descriptor._Get_buffer_ptr()->_Get_view_shape(_Descriptor._Get_view_key());
    }

    inline
    bool _Is_cpu_accelerator(const accelerator& _Accl) 
    {
        return (_Accl.device_path == accelerator::cpu_accelerator);
    }

} // namespace Concurrency::details

/// <summary>
/// Exception thrown when an underlying DirectX call fails
/// due to the Windows timeout detection and recovery mechanism
/// </summary>
class accelerator_view_removed : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an accelerator_view_removed exception with a message and
    ///     a view removed reason code
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    /// <param name="_View_removed_reason">
    ///     HRESULT error code indicating the cause of removal of the accelerator_view
    /// </param>
    _AMPIMP explicit accelerator_view_removed(const char * _Message, HRESULT _View_removed_reason) throw();

    /// <summary>
    ///     Construct an accelerator_view_removed exception
    /// </summary>
    /// <param name="_View_removed_reason">
    ///     HRESULT error code indicating the cause of removal of the accelerator_view
    /// </param>
    _AMPIMP explicit accelerator_view_removed(HRESULT _View_removed_reason) throw();

    /// <summary>
    ///     Returns an HRESULT error code indicating the cause of the accelerator_view's removal
    /// </summary>
    /// <returns>
    ///     The HRESULT error code that indicates the cause of accelerator_view's removal
    /// </returns>
    _AMPIMP HRESULT get_view_removed_reason() const throw();

private:

    HRESULT _M_view_removed_reason_code;
}; // class accelerator_view_removed
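
// Illustrative usage sketch (hypothetical application code, not part of this header):
// a device removal triggered by Windows timeout detection and recovery surfaces as this
// exception, and get_view_removed_reason() yields the HRESULT describing the removal.
//
//     try {
//         run_my_kernels(av);   // hypothetical helper that launches parallel_for_each on av
//         av.wait();
//     }
//     catch (const Concurrency::accelerator_view_removed& ex) {
//         HRESULT reason = ex.get_view_removed_reason();
//         // Recreate the accelerator_view and retry, or report 'reason' to the user
//     }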

/// <summary>
///     Exception thrown when the runtime fails to launch a kernel
///     using the compute domain specified at the parallel_for_each call site.
/// </summary>
class invalid_compute_domain : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an invalid_compute_domain exception with a message
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    _AMPIMP explicit invalid_compute_domain(const char * _Message) throw();

    /// <summary>
    ///     Construct an invalid_compute_domain exception
    /// </summary>
    _AMPIMP invalid_compute_domain() throw();
}; // class invalid_compute_domain

/// <summary>
///     Exception thrown when an unsupported feature is used
/// </summary>
class unsupported_feature  : public runtime_exception
{
public:
    /// <summary>
    ///     Construct an unsupported_feature exception with a message
    /// </summary>
    /// <param name="_Message">
    ///     Descriptive message of error
    /// </param>
    _AMPIMP explicit unsupported_feature(const char * _Message) throw();

    /// <summary>
    ///     Construct an unsupported_feature exception
    /// </summary>
    _AMPIMP unsupported_feature() throw();
}; // class unsupported_feature

} // namespace Concurrency

// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// Compiler/Runtime Interface
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

#define HELPERAPI __cdecl

using namespace Concurrency::details;

extern "C" {

    // The following types store information about the resources required by the kernel.
    enum _Resource_kind
    {
        RESOURCE_BUFFER      = 0,
        RESOURCE_TEXTURE     = 1,
        RESOURCE_SAMPLER     = 2,
    };

    struct _Device_resource_info
    {
        _Resource_kind _M_resource_kind;  // buffer, texture, or sampler

        void * _M_desc;        // Pointer to the _Buffer_descriptor/_Texture_descriptor/_Sampler_descriptor instance
                               // which underlies the device resource

        _Access_mode _M_formal_access_mode;         // scalar: read-only
                                                    // const scalar ref: read-only
                                                    // scalar ref: ReadWrite
                                                    // array: ReadWrite
                                                    // const array: ReadOnly
        size_t _M_actual_arg_num;

        BOOL _Is_buffer() const
        {
            return (_M_resource_kind == RESOURCE_BUFFER);
        }

        BOOL _Is_texture() const
        {
            return (_M_resource_kind == RESOURCE_TEXTURE);
        }

        BOOL _Is_sampler() const
        {
            return (_M_resource_kind == RESOURCE_SAMPLER);
        }

        _Ret_ _Buffer_descriptor * _Get_buffer_desc() const
        {
            _ASSERTE(_Is_buffer());
            return reinterpret_cast<_Buffer_descriptor *>(_M_desc);
        }

        _Ret_ _Texture_descriptor * _Get_texture_desc() const
        {
            _ASSERTE(_Is_texture());
            return reinterpret_cast<_Texture_descriptor *>(_M_desc);
        }

        _Ret_ _Sampler_descriptor * _Get_sampler_desc() const
        {
            _ASSERTE(_Is_sampler());
            return reinterpret_cast<_Sampler_descriptor *>(_M_desc);
        }

        _Ret_ void * _Get_resource_ptr() const
        {
            if (_Is_buffer()) 
            {
                _Ubiquitous_buffer * _Tmp = _Get_buffer_desc()->_Get_buffer_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
            else if (_Is_texture())
            {
                _Texture * _Tmp = _Get_texture_desc()->_Get_texture_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
            else 
            {
                _ASSERTE(_Is_sampler());
                _Sampler * _Tmp = _Get_sampler_desc()->_Get_sampler_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
        }
    };

    // This structure is used for storing information about the const buffers
    struct _Device_const_buffer_info
    {
        void * _M_data;                             // Pointer to the host data to initialize the
                                                    // constant buffer with

        size_t _M_const_buf_size;                   // Size of the const buffer in bytes

        unsigned int _M_is_debug_data;              // Is this debug data which will be 
                                                    // initialized by the runtime. 0 (false), 1 (true)
    };
}

namespace Concurrency
{
namespace details
{
    enum _DPC_kernel_func_kind
    {
        NON_ALIASED_SHADER  = 0, // slot 0
        ALIASED_SHADER      = 1,  // slot 1
        NUM_SHADER_VERSIONS = 2
    };

    struct _DPC_call_handle
    {
        _Accelerator_view_impl *_M_rv;
        bool _M_is_explicit_target_acclview;

        // Info about the kernel function arguments
        _Device_resource_info * _M_device_resource_info;
        size_t _M_num_resources;
        size_t _M_num_writable_buffers;
        size_t _M_num_samplers;

        // Info about the host buffer created corresponding to the const buffer
        _Device_const_buffer_info * _M_const_buffer_info;
        size_t _M_num_const_buffers;

        bool _M_RW_aliasing;

        // Kernel funcs
        _DPC_shader_blob * _M_shader_blobs[NUM_SHADER_VERSIONS];

        // Compute domain info
        int _M_is_flat_model;
        unsigned int _M_compute_rank;
        unsigned int * _M_grid_extents;

        // Kernel dispatch info
        unsigned int _M_groupCountX;
        unsigned int _M_groupCountY;
        unsigned int _M_groupCountZ;

        // The shape of the group
        unsigned int _M_groupExtentX;
        unsigned int _M_groupExtentY;
        unsigned int _M_groupExtentZ;

        _DPC_call_handle(const accelerator_view &_Accelerator_view)
        {
            if (!_Accelerator_view.is_auto_selection) {
                _M_rv = _Get_accelerator_view_impl_ptr(_Accelerator_view);
            }
            else {
                _M_rv = NULL;
            }

            _M_is_explicit_target_acclview = false;
            if (_M_rv != NULL) {
                _M_is_explicit_target_acclview = true;
            }

            _M_device_resource_info = NULL;        
            _M_num_resources = 0;
            _M_num_writable_buffers = 0;
            _M_num_samplers = 0;

            _M_const_buffer_info = NULL;
            _M_num_const_buffers = 0;

            _M_RW_aliasing = false;

            for (size_t _I = 0; _I < NUM_SHADER_VERSIONS; _I++) 
            {
                _M_shader_blobs[_I] = NULL;
            }

            _M_is_flat_model = 0;
            _M_compute_rank = 0;
            _M_grid_extents = NULL;

            _M_groupCountX = 0;
            _M_groupCountY = 0;
            _M_groupCountZ = 0;

            _M_groupExtentX = 0;
            _M_groupExtentY = 0;
            _M_groupExtentZ = 0;            
        }

        ~_DPC_call_handle()
        {
            if (_M_grid_extents) {
                delete [] _M_grid_extents;
            }
        }

        bool _Is_buffer_aliased(_In_ void *_Buffer_ptr)
        {
            return ((_M_aliased_buffer_set != nullptr) && (_M_aliased_buffer_set->find(_Buffer_ptr) != _M_aliased_buffer_set->end()));
        }

        bool _Is_buffer_unaccessed(size_t _Buffer_idx)
        {
            return ((_M_is_device_buffer_unaccessed != nullptr) && _M_is_device_buffer_unaccessed->operator[](_Buffer_idx));
        }

        void _Set_buffer_unaccessed(size_t _Buffer_idx)
        {
            if (_M_is_device_buffer_unaccessed == nullptr) {
                _M_is_device_buffer_unaccessed = std::unique_ptr<std::vector<bool>>(new std::vector<bool>(_M_num_resources, false));
            }

            _M_is_device_buffer_unaccessed->operator[](_Buffer_idx) = true;
        }

        const int* _Get_redirect_indices() const
        {
            if (!_M_RW_aliasing) {
                return nullptr;
            }

            _ASSERTE(_M_Redirect_indices != nullptr);

            return _M_Redirect_indices->data();
        }

        void _Check_buffer_aliasing();
        void _Update_buffer_rw_property();
        void _Setup_aliasing_redirection_indices();
        void _Select_accelerator_view();
        void _Verify_buffers_against_accelerator_view();

    private:
        std::unique_ptr<std::unordered_set<void*>> _M_aliased_buffer_set;
        std::unique_ptr<std::vector<bool>> _M_is_device_buffer_unaccessed;
        // Info about read-write aliasing 
        std::unique_ptr<std::vector<int>> _M_Redirect_indices;
    };

    // This structure is used for passing the scheduling info to the parallel_for_each;
    // it is handed back to the compiler-runtime interface methods by the front-end
    struct _Host_Scheduling_info
    {
        // The accelerator view to invoke a parallel_for_each on
        accelerator_view _M_accelerator_view; 
    };

} // namespace Concurrency::details


/// <summary>
///     Uninitializes the C++ AMP runtime. It is legal to
///     call this function multiple times during an application's
///     lifetime. Calling any C++ AMP API after calling this function
///     will reinitialize the C++ AMP runtime. Note that it is illegal
///     to use C++ AMP objects across calls to this function; doing
///     so will result in undefined behavior. Also, concurrently calling
///     this function and any other AMP API is illegal and will result
///     in undefined behavior.
/// </summary>
_AMPIMP void __cdecl amp_uninitialize();
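
// Illustrative usage sketch (hypothetical application code): destroy all C++ AMP objects
// first, then uninitialize the runtime, e.g. right before process shutdown.
//
//     void shutdown_amp()   // hypothetical helper
//     {
//         // ... ensure no C++ AMP objects (arrays, array_views, accelerator_views, ...) are still alive ...
//         Concurrency::amp_uninitialize();
//     }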

} // namespace Concurrency

extern "C" {

    // Return a compiler helper handle.
    _AMPIMP _Ret_ _DPC_call_handle * HELPERAPI __dpc_create_call_handle(_In_ _Host_Scheduling_info *_Sch_info) throw(...);

    // Destroy the call handle
    _AMPIMP void HELPERAPI __dpc_release_call_handle(_In_ _DPC_call_handle * _Handle) throw(...);

    _AMPIMP void HELPERAPI __dpc_set_device_resource_info(_In_ _DPC_call_handle * _Handle, _In_ _Device_resource_info * _DeviceResourceInfo, size_t _NumResources) throw(...);

    // Set const buffer info.
    _AMPIMP void HELPERAPI __dpc_set_const_buffer_info(_In_ _DPC_call_handle * _Handle, _In_ _Device_const_buffer_info * _DeviceConstBufferInfo, size_t _NumConstBuffers) throw(...);

    // Set the kernel shader info
    _AMPIMP void HELPERAPI __dpc_set_kernel_shader_info(_In_ _DPC_call_handle * _Handle,
                                                        _Inout_ void ** _ShaderBlobs) throw(...);
    // Set kernel dispatch info
    _AMPIMP void HELPERAPI __dpc_set_kernel_dispatch_info(_In_ _DPC_call_handle * _Handle,
                                                         unsigned int _ComputeRank,
                                                         _In_ int * _Extents,
                                                         unsigned int _GroupRank,
                                                         const unsigned int * _GroupExtents,
                                                         unsigned int & _GroupCountX,
                                                         unsigned int & _GroupCountY,
                                                         unsigned int & _GroupCountZ) throw(...);

    // Dispatch the kernel
    _AMPIMP void HELPERAPI __dpc_dispatch_kernel(_In_ _DPC_call_handle * _Handle) throw(...);
    
#ifdef _DEBUG
    // Dispatch the kernel passed as a HLSL source level shader 
    // This function is to be used only for testing and debugging purposes
    _AMPIMP void HELPERAPI __dpc_dispatch_kernel_test(_In_ _DPC_call_handle * _Handle, _In_ WCHAR* szFileName, LPCSTR szEntryPoint) throw(...);
#endif 
}
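
// Illustrative sketch of the call sequence the compiler-generated stub is expected to follow
// when lowering a parallel_for_each (the local variable names are hypothetical placeholders;
// this sequence is emitted by the compiler, not written by user code):
//
//     _Host_Scheduling_info _Sch_info = { _Accl_view };
//     _DPC_call_handle *_Handle = __dpc_create_call_handle(&_Sch_info);
//     __dpc_set_device_resource_info(_Handle, _Resource_info, _Num_resources);
//     __dpc_set_const_buffer_info(_Handle, _Const_buffer_info, _Num_const_buffers);
//     __dpc_set_kernel_shader_info(_Handle, _Shader_blobs);
//     __dpc_set_kernel_dispatch_info(_Handle, _Compute_rank, _Extents,
//                                    _Group_rank, _Group_extents,
//                                    _GroupCountX, _GroupCountY, _GroupCountZ);
//     __dpc_dispatch_kernel(_Handle);
//     __dpc_release_call_handle(_Handle);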

// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// C++ AMP ETW Provider
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

namespace Concurrency
{
namespace details
{

// Thread-safe factory method for _Amp_runtime_trace object
_AMPIMP _Ret_ _Amp_runtime_trace* __cdecl _Get_amp_trace();

// Class that gathers C++ AMP diagnostic information and triggers events
class _Amp_runtime_trace
{

// Called by factory to create single instance of _Amp_runtime_trace type
friend BOOL CALLBACK _Init_amp_runtime_trace(PINIT_ONCE _Init_once, PVOID _Param, _Inout_ PVOID *_Context);

public:
    // Destructor for _Amp_runtime_trace, called at program termination
    _AMPIMP ~_Amp_runtime_trace();

    // The end event is triggered by multiple other events, such as StartComputeEvent, to show exactly when a given activity completed
    _AMPIMP void _Write_end_event(ULONG _Span_id);

    // Add accelerator configuration information
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    void _Add_accelerator_config_event(PVOID _Accelerator_id, LPCWSTR _Device_path, LPCWSTR _Device_description);

    // Used by callback function, to write all configuration data when new session is detected
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    void _Write_all_accelerator_config_events();

    // Started accelerator_view::wait operation
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    ULONG _Start_accelerator_view_wait_event(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

    // Launched accelerator_view::flush operation
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    void _Launch_flush_event(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

    // Launched accelerator_view::create_marker operation
    // Note: This member function does not have to be exported; it is used by the C++ AMP runtime factory
    ULONG _Launch_marker(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

    // Below is a set of helpers that take the various types available at the event injection point and extract all necessary data
    _AMPIMP ULONG _Start_parallel_for_each_event_helper(_In_ _DPC_call_handle *_Handle);

    // This helper wraps the functor with wait start and wait end events
    inline concurrency::completion_future _Start_async_op_wait_event_helper(ULONG _Async_op_id, _Event _Ev)
    {
        std::shared_future<void> retFuture;
        concurrency::task_completion_event<void> retTaskCompletionEvent;

        // Create a std::shared_future by creating a deferred task through std::async that waits for
        // the event _Ev to finish, wrapping the functor with wait start and wait end events
        retFuture = std::async(std::launch::deferred, [=]() mutable {
            try 
            {
                if (_Async_op_id == _Amp_runtime_trace::_M_event_disabled)
                {
                    _Ev._Get();
                }
                else
                {
                    auto _Span_id = details::_Get_amp_trace()->_Start_async_op_wait_event(_Async_op_id);
                    _Ev._Get();
                    details::_Get_amp_trace()->_Write_end_event(_Span_id);
                }
            }
            catch(...) 
            {
                // If an exception is encountered when executing the asynchronous operation
                // we should set the exception on the retTaskCompletionEvent so that it is
                // appropriately cancelled and the exception is propagated to continuations
                retTaskCompletionEvent.set_exception(std::current_exception());
                throw;
            }

            retTaskCompletionEvent.set();
        });

        // Register the async event with the runtime asynchronous events manager
        _Register_async_event(_Ev, retFuture);

        // Issue a continuation just to swallow any exceptions that are encountered during the
        // async operation and are never observed by the user, or are observed only through the
        // shared_future and not through the task
        concurrency::task<void> retTask(retTaskCompletionEvent);
        retTask.then([](concurrency::task<void> _Task) {
            try {
                _Task.get();
            }
            catch(...) {
            }
        });

        return Concurrency::completion_future(retFuture, retTask);
    }

    _AMPIMP ULONG _Start_array_view_synchronize_event_helper(const _Buffer_descriptor &_Buff_desc);
    _AMPIMP ULONG _Launch_array_view_synchronize_event_helper(const _Buffer_descriptor &_Buff_desc);
    
    // Helpers for buffers (array, array_view)
    _AMPIMP ULONG _Start_copy_event_helper(const _Buffer_descriptor &_Src, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(nullptr_t, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(const _Buffer_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Buffer_descriptor &_Src, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(nullptr_t, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Buffer_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    
    // Helper for textures
    _AMPIMP ULONG _Start_copy_event_helper(const _Texture_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(nullptr_t, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Start_copy_event_helper(const _Texture_descriptor &_Src, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Texture_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(nullptr_t, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
    _AMPIMP ULONG _Launch_async_copy_event_helper(const _Texture_descriptor &_Src, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);

    void _Enable_provider(bool _Enable = true);

private:
    // Private constructor. This type is created by factory method
    _Amp_runtime_trace(PVOID _Callback_function, _In_ _Trace *_Trace);

    // Disallow copy construction 
    _Amp_runtime_trace(const _Amp_runtime_trace&);

    // Disallow assignment operator
    _Amp_runtime_trace& operator=(const _Amp_runtime_trace&);
    
    // Used internally to write configuration events
    void _Write_accelerator_config_event(const std::tuple<PVOID, LPCWSTR, LPCWSTR> &_ConfigTuple);

    // Event triggered when computation is scheduled
    ULONG _Start_parallel_for_each_event(
        PVOID _Accelerator_id, 
        PVOID _Accelerator_view_id, 
        BOOL _Is_tiled_explicitly, 
        ULONGLONG _Num_of_tiles, 
        ULONG _Num_of_threads_per_tile, 
        BOOL _Is_aliased, 
        ULONG _Num_read_only_resources, 
        ULONG _Num_read_write_resources, 
        ULONGLONG _Size_of_all_resources, 
        ULONG _Size_of_const_data, 
        ULONGLONG _Size_of_data_for_copy);

    // Synchronous copy operation has started
    ULONG _Start_copy_event(
        PVOID _Src_accelerator_id, 
        PVOID _Src_accelerator_view_id,
        PVOID _Dst_accelerator_id, 
        PVOID _Dst_accelerator_view_id,
        ULONGLONG _Num_bytes_for_copy,
        BOOL _Is_src_staging,
        BOOL _Is_dst_staging);

    // Asynchronous copy operation has been launched
    ULONG _Launch_async_copy_event(
        PVOID _Src_accelerator_id, 
        PVOID _Src_accelerator_view_id,
        PVOID _Dst_accelerator_id, 
        PVOID _Dst_accelerator_view_id,
        ULONGLONG _Num_bytes_for_copy,
        BOOL _Is_src_staging,
        BOOL _Is_dst_staging);

    // Started waiting for asynchronous operation to complete
    _AMPIMP ULONG _Start_async_op_wait_event(ULONG _Async_op_id);

    // Started array_view::synchronize operation
    ULONG _Start_array_view_synchronize_event(ULONGLONG _Num_bytes_to_synchronize);

    // Async array_view::synchronize operation has been launched
    ULONG _Launch_array_view_synchronize_event(ULONGLONG _Num_bytes_to_synchronize);

    // Helper function that extracts information from buffer descriptor
    std::tuple<PVOID, PVOID, BOOL> _Get_resource_diagnostic_info(const _Buffer_descriptor &_Buff_desc, accelerator_view _Accl_view) const;

    // Helper function that extracts information from texture descriptor
    std::tuple<PVOID, PVOID, BOOL> _Get_resource_diagnostic_info(const _Texture_descriptor &_Tex_desc) const;

    // Generates unique identifiers for span_id and async_op_id
    ULONG _Get_unique_identifier();

    // Critical section object used by the callback function to synchronize the following situations:
    // a) multiple sessions have started at the same time
    // b) the C++ AMP runtime factory adds a new accelerator config event to the collection
    Concurrency::critical_section _M_critical_section;

    // Collection of all configuration events at the time of C++ AMP Runtime initialization
    std::vector<std::tuple<PVOID, LPCWSTR, LPCWSTR>> _M_accelerator_configs;

    // Unique counter for span id and async operation id
    volatile ULONG _M_counter;

    // Type that implements the ITrace interface and writes events, e.g. ETW events
    _Trace* _M_trace_ptr;

    // Special value that we return to chain events if provider is disabled
    static const ULONG _M_event_disabled = 0;
};

// Helper function to query the number of mipmap levels from texture object
inline unsigned int _Get_mipmap_levels(const _Texture *_Tex)
{
    _ASSERTE(_Tex);
    return _Tex->_Get_mip_levels();
}

} // namespace Concurrency::details
} // namespace Concurrency

namespace concurrency = Concurrency;

#pragma pack(pop)