/***
* ==++==
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
* ==--==
* =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
*
* amprt.h
*
* Define the C++ interfaces exported by the C++ AMP runtime
*
* =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
****/
|
|
#pragma once
|
|
|
|
#if !(defined (_M_X64) || defined (_M_IX86) || defined (_M_ARM) || defined (_M_ARM64) )
|
|
#error ERROR: C++ AMP runtime is supported only on X64, X86, ARM, and ARM64 architectures.
|
|
#endif
|
|
|
|
#if defined (_M_CEE)
|
|
#error ERROR: C++ AMP runtime is not supported when compiling /clr.
|
|
#endif
|
|
|
|
#ifndef __cplusplus
|
|
#error ERROR: C++ AMP runtime is supported only for C++.
|
|
#endif
|
|
|
|
#if !defined(_CXXAMP)
|
|
|
|
#if defined(_DEBUG)
|
|
#pragma comment(lib, "vcampd")
|
|
#else // _DEBUG
|
|
#pragma comment(lib, "vcamp")
|
|
#endif // _DEBUG
|
|
|
|
#endif // _CXXAMP
|
|
|
|
#if !defined(_CXXAMP)
|
|
|
|
#define __GPU restrict(amp,cpu)
|
|
#define __GPU_ONLY restrict(amp)
|
|
#define __CPU_ONLY
|
|
|
|
#else
|
|
|
|
#define __GPU
|
|
#define __GPU_ONLY
|
|
#define __CPU_ONLY
|
|
|
|
#endif // _CXXAMP
|
|
|
|
#include <exception>
|
|
#include <unknwn.h>
|
|
#include <crtdbg.h>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#if defined(_CXXAMP)
|
|
#include <strsafe.h>
|
|
#endif // _CXXAMP
|
|
|
|
#include <future>
|
|
#include <functional>
|
|
#include <map>
|
|
#include <unordered_map>
|
|
#include <set>
|
|
#include <unordered_set>
|
|
#include <concrt.h>
|
|
#include <type_traits>
|
|
|
|
#if !defined(_AMPIMP)
|
|
#define _AMPIMP __declspec(dllimport)
|
|
#endif
|
|
|
|
#pragma pack(push,8)
|
|
|
|
// Part of runtime-compiler interface
|
|
extern "C"
|
|
{
|
|
// Access mode of fields
|
|
enum _Access_mode
|
|
{
|
|
_No_access = 0,
|
|
_Read_access = (1 << 0),
|
|
_Write_access = (1 << 1),
|
|
_Is_array_mode = (1 << 30),
|
|
_Read_write_access = _Read_access | _Write_access,
|
|
};
|
|
}
|
|
|
|
namespace Concurrency
|
|
{
|
|
/// <summary>
|
|
/// Enumeration type used to denote the various types of access to data.
|
|
/// </summary>
|
|
enum access_type
|
|
{
|
|
access_type_none = 0,
|
|
access_type_read = (1 << 0),
|
|
access_type_write = (1 << 1),
|
|
access_type_read_write = access_type_read | access_type_write,
|
|
access_type_auto = (1 << 31),
|
|
};
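
// Illustrative note (not part of the original header): the access_type flags compose
// bitwise, e.g. (access_type_read | access_type_write) == access_type_read_write; this
// is how combined CPU read/write access is expressed, while access_type_auto leaves
// the choice to the runtime.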
|
|
|
|
// Forward declarations
|
|
class accelerator_view;
|
|
class accelerator;
|
|
|
|
namespace details
|
|
{
|
|
const size_t ERROR_MSG_BUFFER_SIZE = 1024;
|
|
|
|
// A reference counter to be used as the base class for all reference counted types.
|
|
class _Reference_counter
|
|
{
|
|
public:
|
|
|
|
// Constructor.
|
|
_Reference_counter() : _M_rc(0) {}
|
|
|
|
// Destructor.
|
|
virtual ~_Reference_counter() {}
|
|
|
|
// Add a reference.
|
|
// Thread-safe.
|
|
size_t _Add_reference()
|
|
{
|
|
return InterlockedIncrement(reinterpret_cast<LONG volatile*>(&_M_rc));
|
|
}
|
|
|
|
// Remove a reference.
|
|
// Thread-safe.
|
|
size_t _Remove_reference()
|
|
{
|
|
_ASSERTE(_M_rc > 0);
|
|
|
|
size_t refCount = InterlockedDecrement(reinterpret_cast<LONG volatile*>(&_M_rc));
|
|
|
|
if (refCount == 0)
|
|
this->_Release();
|
|
|
|
return refCount;
|
|
}
|
|
|
|
// Release the counter
|
|
_AMPIMP void _Release();
|
|
|
|
// Return the reference count value
|
|
size_t _Get_reference_count()
|
|
{
|
|
return _M_rc;
|
|
}
|
|
|
|
private:
|
|
size_t _M_rc;
|
|
};
|
|
|
|
// A smart pointer to a reference counted object
|
|
// T must be a type derived from _Reference_counter
|
|
template <class T>
|
|
class _Reference_counted_obj_ptr
|
|
{
|
|
public:
|
|
|
|
// Constructor
|
|
_Reference_counted_obj_ptr(T* _Ptr = NULL) : _M_obj_ptr(_Ptr)
|
|
{
|
|
_Init();
|
|
}
|
|
|
|
// Copy constructor
|
|
_Reference_counted_obj_ptr(const _Reference_counted_obj_ptr &_Other) : _M_obj_ptr(_Other._M_obj_ptr)
|
|
{
|
|
_Init();
|
|
}
|
|
|
|
// Move constructor
|
|
_Reference_counted_obj_ptr(_Reference_counted_obj_ptr &&_Other) : _M_obj_ptr(_Other._M_obj_ptr)
|
|
{
|
|
_Other._M_obj_ptr = nullptr;
|
|
// No change to ref-count
|
|
}
|
|
|
|
// Destructor
|
|
~_Reference_counted_obj_ptr()
|
|
{
|
|
if (_M_obj_ptr != NULL) {
|
|
_UnInitialize(_M_obj_ptr);
|
|
}
|
|
}
|
|
|
|
// Assignment operator
|
|
_Reference_counted_obj_ptr& operator=(const _Reference_counted_obj_ptr &_Other)
|
|
{
|
|
if (_M_obj_ptr != _Other._M_obj_ptr)
|
|
{
|
|
T *oldPtr = _M_obj_ptr;
|
|
_M_obj_ptr = _Other._M_obj_ptr;
|
|
_Init();
|
|
|
|
if (oldPtr != NULL) {
|
|
_UnInitialize(oldPtr);
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
// Move-assignment operator
|
|
_Reference_counted_obj_ptr& operator=(_Reference_counted_obj_ptr &&_Other)
|
|
{
|
|
if (_M_obj_ptr != _Other._M_obj_ptr)
|
|
{
|
|
T *oldPtr = _M_obj_ptr;
|
|
_M_obj_ptr = _Other._M_obj_ptr;
|
|
_Other._M_obj_ptr = nullptr;
|
|
// No change to ref-count of the adopted pointer.
|
|
|
|
if (oldPtr != nullptr)
|
|
{
|
|
_UnInitialize(oldPtr);
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
_Ret_ T* operator->() const
|
|
{
|
|
return _M_obj_ptr;
|
|
}
|
|
|
|
T& operator*() const
|
|
{
|
|
return *_M_obj_ptr;
|
|
}
|
|
|
|
operator T*() const
|
|
{
|
|
return _M_obj_ptr;
|
|
}
|
|
|
|
_Ret_ T* _Get_ptr() const
|
|
{
|
|
return _M_obj_ptr;
|
|
}
|
|
|
|
private:
|
|
T *_M_obj_ptr;
|
|
|
|
void _Init()
|
|
{
|
|
if (_M_obj_ptr == NULL)
|
|
return;
|
|
|
|
reinterpret_cast<_Reference_counter*>(_M_obj_ptr)->_Add_reference();
|
|
}
|
|
|
|
static void _UnInitialize(_In_ T *_Obj_ptr)
|
|
{
|
|
reinterpret_cast<_Reference_counter*>(_Obj_ptr)->_Remove_reference();
|
|
}
|
|
};
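
// Illustrative sketch (not part of the original header): how this intrusive smart
// pointer is meant to be used. _My_impl is a hypothetical type derived from
// _Reference_counter.
//
//     _Reference_counted_obj_ptr<_My_impl> _Ptr(new _My_impl()); // ref count -> 1
//     _Reference_counted_obj_ptr<_My_impl> _Copy = _Ptr;         // ref count -> 2
//     _Copy = _Reference_counted_obj_ptr<_My_impl>();            // ref count -> 1
//     // When _Ptr is destroyed the count drops to 0 and _Release() disposes of the object.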
|
|
|
|
// Forward declarations
|
|
class _Trace;
|
|
class _Amp_runtime_trace;
|
|
class _Buffer;
|
|
class _Texture;
|
|
class _Sampler;
|
|
class _Ubiquitous_buffer;
|
|
class _D3D_interop;
|
|
class _Accelerator_view_impl;
|
|
class _CPU_accelerator_view_impl;
|
|
class _D3D_accelerator_view_impl;
|
|
class _Accelerator_impl;
|
|
class _Event_impl;
|
|
class _DPC_runtime_factory;
|
|
class _View_shape;
|
|
struct _Buffer_descriptor;
|
|
class _Accelerator_view_hasher;
|
|
struct _DPC_shader_blob;
|
|
struct _View_info;
|
|
|
|
// The enum specifies the base type for short vector type.
|
|
enum _Short_vector_base_type_id : unsigned int
|
|
{
|
|
_Uint_type = 0,
|
|
_Int_type = 1,
|
|
_Float_type = 2,
|
|
_Unorm_type = 3,
|
|
_Norm_type = 4,
|
|
_Double_type = 5,
|
|
_Invalid_type = 0xFFFFFFFF
|
|
};
|
|
|
|
typedef enum _Short_vector_base_type_id _Texture_base_type_id;
|
|
|
|
} // namespace Concurrency::details
|
|
|
|
typedef details::_Reference_counted_obj_ptr<details::_Accelerator_view_impl> _Accelerator_view_impl_ptr;
|
|
typedef details::_Reference_counted_obj_ptr<details::_Accelerator_impl> _Accelerator_impl_ptr;
|
|
typedef details::_Reference_counted_obj_ptr<details::_Buffer> _Buffer_ptr;
|
|
typedef details::_Reference_counted_obj_ptr<details::_Texture> _Texture_ptr;
|
|
typedef details::_Reference_counted_obj_ptr<details::_Sampler> _Sampler_ptr;
|
|
typedef details::_Reference_counted_obj_ptr<details::_Ubiquitous_buffer> _Ubiquitous_buffer_ptr;
|
|
typedef details::_Reference_counted_obj_ptr<details::_Event_impl> _Event_impl_ptr;
|
|
typedef details::_Reference_counted_obj_ptr<details::_View_shape> _View_shape_ptr;
|
|
|
|
namespace details
|
|
{
|
|
// The _Event class.
|
|
class _Event
|
|
{
|
|
friend class _Buffer;
|
|
friend class _Texture;
|
|
friend class accelerator_view;
|
|
friend class _D3D_accelerator_view_impl;
|
|
|
|
public:
|
|
/// <summary>
|
|
/// Constructor of the _Event.
|
|
/// </summary>
|
|
_AMPIMP _Event();
|
|
|
|
/// <summary>
|
|
/// Destructor of the _Event.
|
|
/// </summary>
|
|
_AMPIMP ~_Event();
|
|
|
|
/// <summary>
|
|
/// Copy constructor
|
|
/// </summary>
|
|
_AMPIMP _Event(const _Event & _Other);
|
|
|
|
/// <summary>
|
|
/// Assignment operator
|
|
/// </summary>
|
|
_AMPIMP _Event & operator=(const _Event & _Other);
|
|
|
|
/// <summary>
|
|
/// Poll whether the _Event has completed or not. Swallows any exceptions
|
|
/// </summary>
|
|
/// <returns>
|
|
/// true, if the _Event has completed, false otherwise
|
|
/// </returns>
|
|
_AMPIMP bool _Is_finished_nothrow();
|
|
|
|
/// <summary>
|
|
/// Poll whether the _Event has completed or not and throws any exceptions that occur
|
|
/// </summary>
|
|
/// <returns>
|
|
/// true, if the _Event has completed, false otherwise
|
|
/// </returns>
|
|
_AMPIMP bool _Is_finished();
|
|
|
|
/// <summary>
|
|
/// Wait until the _Event completes and throw any exceptions that occur.
|
|
/// </summary>
|
|
_AMPIMP void _Get();
|
|
|
|
/// <summary>
|
|
/// Tells if this is an empty event
|
|
/// </summary>
|
|
/// <returns>
|
|
/// true, if the _Event is empty
|
|
/// false, otherwise
|
|
/// </returns>
|
|
_AMPIMP bool _Is_empty() const;
|
|
|
|
/// <summary>
|
|
/// Creates an event which is an ordered collection of this and _Ev
|
|
/// </summary>
|
|
/// <returns>
|
|
/// The composite event
|
|
/// </returns>
|
|
_AMPIMP _Event _Add_event(_Event _Ev);
|
|
|
|
/// <summary>
|
|
/// Creates an event which is an ordered collection of this and a continuation task
|
|
/// </summary>
|
|
/// <returns>
|
|
/// The composite event
|
|
/// </returns>
|
|
_AMPIMP _Event _Add_continuation(const std::function<_Event __cdecl ()> &_Continuation_task);
|
|
|
|
/// <summary>
|
|
/// Return true if the other _Event is the same as this _Event; false otherwise
|
|
/// </summary>
|
|
_AMPIMP bool operator==(const _Event &_Other) const;
|
|
|
|
/// <summary>
|
|
/// Return false if the other _Event is the same as this _Event; true otherwise
|
|
/// </summary>
|
|
_AMPIMP bool operator!=(const _Event &_Other) const;
|
|
|
|
private:
|
|
|
|
// Private constructor
|
|
_Event(_In_ _Event_impl* _Impl);
|
|
|
|
_Event_impl_ptr _M_ptr_event_impl;
|
|
};
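
// Illustrative sketch (not part of the original header): composing runtime events.
// _Ev1 and _Ev2 are hypothetical _Event objects returned by internal asynchronous
// operations.
//
//     _Event _Combined = _Ev1._Add_event(_Ev2);              // ordered collection of both
//     _Event _Chained = _Combined._Add_continuation([]() {   // runs after _Combined finishes
//         return _Event();                                    // empty follow-up event
//     });
//     _Chained._Get();                                        // block and propagate exceptions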
|
|
|
|
typedef _Buffer_descriptor *_View_key;
|
|
|
|
_Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view);
|
|
_Ret_ _Accelerator_impl* _Get_accelerator_impl_ptr(const accelerator& _Accl);
|
|
_Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);
|
|
unsigned int _Get_mipmap_levels(const _Texture *_Tex);
|
|
|
|
inline bool _Is_valid_access_mode(_Access_mode _Mode)
|
|
{
|
|
if ((_Mode != _Read_access) &&
|
|
(_Mode != _Write_access) &&
|
|
(_Mode != _Read_write_access))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Caution: Do not change this structure definition.
// This struct is special and is processed by the FE to identify the buffers
// used in a parallel_for_each and to set up the _M_data_ptr with the appropriate
// buffer ptr value in the device code.
|
|
typedef struct _Buffer_descriptor
|
|
{
|
|
friend _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);
|
|
|
|
// _M_data_ptr points to the raw data underlying the buffer for accessing on host
|
|
mutable void *_M_data_ptr;
|
|
|
|
private:
|
|
// _M_buffer_ptr points to a _Ubiquitous_buffer that holds the data in a 1D array.
|
|
// This is private to ensure that all assignments to this data member
|
|
// only happen through public functions which properly manage the
|
|
// ref count of the underlying buffer
|
|
_Ubiquitous_buffer *_M_buffer_ptr;
|
|
|
|
public:
|
|
// _M_curr_cpu_access_mode specifies the current access mode of the data on the
|
|
// cpu accelerator_view specified at the time of registration of this view
|
|
_Access_mode _M_curr_cpu_access_mode;
|
|
|
|
// _M_type_access_mode specifies the access mode of the overlay type:
// array_views set it to the appropriate access mode, while for arrays it is
// always _Is_array_mode.
|
|
_Access_mode _M_type_access_mode;
|
|
|
|
public:
|
|
// Public functions
|
|
|
|
// Default constructor
|
|
_Buffer_descriptor() __GPU
|
|
: _M_data_ptr(NULL), _M_buffer_ptr(NULL),
|
|
_M_curr_cpu_access_mode(_No_access), _M_type_access_mode(_Is_array_mode)
|
|
{
|
|
}
|
|
|
|
_Buffer_descriptor(_In_ void *_Data_ptr, _In_ _Ubiquitous_buffer *_Buffer_ptr,
|
|
_Access_mode _Curr_cpu_access_mode, _Access_mode _Type_mode) __GPU
|
|
: _M_data_ptr(_Data_ptr), _M_buffer_ptr(NULL),
|
|
_M_curr_cpu_access_mode(_Curr_cpu_access_mode), _M_type_access_mode(_Type_mode)
|
|
{
|
|
_Set_buffer_ptr(_Buffer_ptr);
|
|
}
|
|
|
|
// Destructor
|
|
~_Buffer_descriptor() __GPU
|
|
{
|
|
_Set_buffer_ptr(NULL);
|
|
}
|
|
|
|
// Copy constructor
|
|
_Buffer_descriptor(const _Buffer_descriptor &_Other) __GPU
|
|
: _M_data_ptr(_Other._M_data_ptr), _M_buffer_ptr(NULL),
|
|
_M_curr_cpu_access_mode(_Other._M_curr_cpu_access_mode), _M_type_access_mode(_Other._M_type_access_mode)
|
|
{
|
|
_Set_buffer_ptr(_Other._M_buffer_ptr);
|
|
}
|
|
|
|
// Assignment operator
|
|
_Buffer_descriptor& operator=(const _Buffer_descriptor &_Other) __GPU
|
|
{
|
|
if (this != &_Other)
|
|
{
|
|
_M_data_ptr = _Other._M_data_ptr;
|
|
_M_curr_cpu_access_mode = _Other._M_curr_cpu_access_mode;
|
|
_M_type_access_mode = _Other._M_type_access_mode;
|
|
_Set_buffer_ptr(_Other._M_buffer_ptr);
|
|
}
|
|
|
|
return *this;
|
|
}
|
|
|
|
_Ret_ _Ubiquitous_buffer* _Get_buffer_ptr() const __CPU_ONLY
|
|
{
|
|
return _M_buffer_ptr;
|
|
}
|
|
|
|
void _Set_buffer_ptr(_In_opt_ _Ubiquitous_buffer *_Buffer_ptr) __CPU_ONLY
|
|
{
|
|
if (_M_buffer_ptr != _Buffer_ptr)
|
|
{
|
|
if (_M_buffer_ptr != NULL) {
|
|
reinterpret_cast<_Reference_counter*>(_M_buffer_ptr)->_Remove_reference();
|
|
}
|
|
|
|
_M_buffer_ptr = _Buffer_ptr;
|
|
|
|
if (_M_buffer_ptr != NULL) {
|
|
reinterpret_cast<_Reference_counter*>(_M_buffer_ptr)->_Add_reference();
|
|
}
|
|
}
|
|
}
|
|
|
|
#if !defined(_CXXAMP)
|
|
void _Set_buffer_ptr(_In_opt_ _Ubiquitous_buffer *_Buffer_ptr) __GPU_ONLY
|
|
{
|
|
// No need to set the buffer ptr on the GPU
|
|
UNREFERENCED_PARAMETER(_Buffer_ptr);
|
|
_M_buffer_ptr = NULL;
|
|
}
|
|
#endif // _CXXAMP
|
|
|
|
bool _Is_array() const
|
|
{
|
|
return (_M_type_access_mode == _Is_array_mode);
|
|
}
|
|
|
|
_Ret_ _View_key _Get_view_key()
|
|
{
|
|
return this;
|
|
}
|
|
|
|
const _View_key _Get_view_key() const
|
|
{
|
|
return ((const _View_key)(this));
|
|
}
|
|
|
|
_AMPIMP void _Get_CPU_access(_Access_mode _Requested_mode) const;
|
|
|
|
} _Buffer_descriptor;
|
|
|
|
// Caution: Do not change this structure definition.
// This struct is special and is processed by the FE to identify the textures
// used in a parallel_for_each and to set up the _M_data_ptr with the appropriate
// texture ptr value in the device code.
|
|
typedef struct _Texture_descriptor
|
|
{
|
|
// _M_data_ptr points to the raw data underlying the texture
|
|
mutable IUnknown *_M_data_ptr;
|
|
|
|
private:
|
|
// _M_texture_ptr points to a _Texture that holds the data
|
|
// This is private to ensure that all assignments to this data member
|
|
// only happen through public functions which properly manage the
|
|
// ref count of the underlying texture
|
|
_Texture *_M_texture_ptr;
|
|
|
|
// The index of the most detailed (largest in size) mipmap level for the texture (or texture view)
|
|
// This value is always zero for the texture and might be non-zero for the texture views
|
|
unsigned int _M_most_detailed_mipmap_level;
|
|
|
|
// Number of accessible mipmap levels for the texture (or texture view),
|
|
// e.g. if the texture has 3 mipmap levels ([0, 1, 2]),
|
|
// then a read-only texture view with most detailed mipmap level equal to 1 can have 1 or 2 mipmap levels ([1] or [1, 2]).
|
|
// Further texture_views created on top of the texture view defined above can only narrow down the range of accessible mipmap levels.
|
|
unsigned int _M_view_mipmap_levels;
|
|
|
|
public:
|
|
// Public functions
|
|
|
|
// Default constructor
|
|
_Texture_descriptor() __GPU
|
|
: _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(0), _M_view_mipmap_levels(0)
|
|
{
|
|
// Enables move constructor
|
|
}
|
|
|
|
// Constructor for the texture
|
|
_Texture_descriptor(unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels) __GPU
|
|
: _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(_Most_detailed_mipmap_level), _M_view_mipmap_levels(_View_mipmap_levels)
|
|
{
|
|
}
|
|
|
|
// Constructor for the interop texture
|
|
_Texture_descriptor(_In_ _Texture * _Texture_ptr) __CPU_ONLY : _M_data_ptr(NULL), _M_texture_ptr(NULL), _M_most_detailed_mipmap_level(0)
|
|
{
|
|
_Set_texture_ptr(_Texture_ptr);
|
|
|
|
// Adopt number of mipmap levels from underlying texture object
|
|
_M_view_mipmap_levels = _Get_mipmap_levels(_M_texture_ptr);
|
|
}
|
|
|
|
// Destructor
|
|
~_Texture_descriptor() __GPU
|
|
{
|
|
_Set_texture_ptr(NULL);
|
|
}
|
|
|
|
// Copy constructor
|
|
_Texture_descriptor(const _Texture_descriptor &_Other) __GPU
|
|
: _M_data_ptr(_Other._M_data_ptr), _M_texture_ptr(NULL),
|
|
_M_most_detailed_mipmap_level(_Other._M_most_detailed_mipmap_level), _M_view_mipmap_levels(_Other._M_view_mipmap_levels)
|
|
{
|
|
_Set_texture_ptr(_Other._M_texture_ptr);
|
|
}
|
|
|
|
// Copy constructor with ability to redefine mipmap information
|
|
_Texture_descriptor(const _Texture_descriptor &_Other, unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels) __GPU
|
|
: _M_data_ptr(_Other._M_data_ptr), _M_texture_ptr(NULL),
|
|
_M_most_detailed_mipmap_level(_Most_detailed_mipmap_level), _M_view_mipmap_levels(_View_mipmap_levels)
|
|
{
|
|
_Set_texture_ptr(_Other._M_texture_ptr);
|
|
}
|
|
|
|
// Assignment operator
|
|
_Texture_descriptor& operator=(const _Texture_descriptor &_Other) __GPU
|
|
{
|
|
if (this != &_Other)
|
|
{
|
|
_M_data_ptr = _Other._M_data_ptr;
|
|
_Set_texture_ptr(_Other._M_texture_ptr);
|
|
_M_most_detailed_mipmap_level = _Other._M_most_detailed_mipmap_level;
|
|
_M_view_mipmap_levels = _Other._M_view_mipmap_levels;
|
|
}
|
|
|
|
return *this;
|
|
}
|
|
|
|
// Move constructor
|
|
_Texture_descriptor(_Texture_descriptor &&_Other) __CPU_ONLY
|
|
{
|
|
*this = std::move(_Other);
|
|
}
|
|
|
|
bool operator==(const _Texture_descriptor &_Other) const __GPU
|
|
{
|
|
return _M_texture_ptr == _Other._M_texture_ptr
|
|
&& _M_data_ptr == _Other._M_data_ptr
|
|
&& _M_most_detailed_mipmap_level == _Other._M_most_detailed_mipmap_level
|
|
&& _M_view_mipmap_levels == _Other._M_view_mipmap_levels;
|
|
}
|
|
|
|
_Ret_ _Texture* _Get_texture_ptr() const __CPU_ONLY
|
|
{
|
|
_ASSERTE(_M_texture_ptr);
|
|
return _M_texture_ptr;
|
|
}
|
|
|
|
unsigned int _Get_most_detailed_mipmap_level() const __GPU
|
|
{
|
|
return _M_most_detailed_mipmap_level;
|
|
}
|
|
|
|
unsigned int _Get_view_mipmap_levels() const __GPU
|
|
{
|
|
return _M_view_mipmap_levels;
|
|
}
|
|
|
|
void _Set_view_mipmap_levels(unsigned int _View_mipmap_levels) __CPU_ONLY
|
|
{
|
|
_M_view_mipmap_levels = _View_mipmap_levels;
|
|
}
|
|
|
|
void _Set_texture_ptr(_In_opt_ _Texture *_Texture_ptr) __CPU_ONLY
|
|
{
|
|
if (_M_texture_ptr != _Texture_ptr)
|
|
{
|
|
if (_M_texture_ptr != NULL) {
|
|
reinterpret_cast<_Reference_counter*>(_M_texture_ptr)->_Remove_reference();
|
|
}
|
|
|
|
_M_texture_ptr = _Texture_ptr;
|
|
|
|
if (_M_texture_ptr != NULL) {
|
|
reinterpret_cast<_Reference_counter*>(_M_texture_ptr)->_Add_reference();
|
|
}
|
|
}
|
|
}
|
|
|
|
#if !defined(_CXXAMP)
|
|
void _Set_texture_ptr(_In_opt_ _Texture *_Texture_ptr) __GPU_ONLY
|
|
{
|
|
// No need to set the texture ptr on the GPU
|
|
UNREFERENCED_PARAMETER(_Texture_ptr);
|
|
_M_texture_ptr = NULL;
|
|
}
|
|
#endif // _CXXAMP
|
|
|
|
// This helper function is used to determine aliasing and copy violations
|
|
bool _Are_mipmap_levels_overlapping(const _Texture_descriptor *_Other) const __CPU_ONLY
|
|
{
|
|
_ASSERTE(_Other);
|
|
|
|
if (this->_Get_texture_ptr() != _Other->_Get_texture_ptr())
|
|
{
|
|
return false;
|
|
}
|
|
|
|
return !((_M_most_detailed_mipmap_level < _Other->_M_most_detailed_mipmap_level) ? ((_M_most_detailed_mipmap_level + _M_view_mipmap_levels - 1) < _Other->_M_most_detailed_mipmap_level)
|
|
: ((_Other->_M_most_detailed_mipmap_level + _Other->_M_view_mipmap_levels - 1) < _M_most_detailed_mipmap_level));
|
|
}
|
|
|
|
} _Texture_descriptor;
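
// Worked example (illustrative, not part of the original header) for the mipmap
// overlap test above: a view with most detailed level 1 and 2 view levels covers
// levels [1, 2]; a view of the same texture with most detailed level 3 and 1 level
// covers [3]. Since 1 + 2 - 1 = 2 < 3, _Are_mipmap_levels_overlapping returns false.
// If the second view instead started at level 2, the ranges [1, 2] and [2] would
// intersect and the test would return true.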
|
|
|
|
// Caution: Do not change this structure definition.
// This struct is special and is processed by the FE to identify the samplers
// used in a parallel_for_each.
|
|
typedef struct _Sampler_descriptor
|
|
{
|
|
// _M_data_ptr points to the sampler on accelerator
|
|
mutable void *_M_data_ptr;
|
|
|
|
private:
|
|
// _M_sampler_ptr points to a _Sampler that holds the underlying sampler
|
|
// representation. This is private to ensure that all assignments to this data member
|
|
// only happen through public functions which properly manage the
|
|
// ref count of the underlying _Sampler object.
|
|
_Sampler *_M_sampler_ptr;
|
|
|
|
public:
|
|
// Public functions
|
|
|
|
// Default constructor
|
|
_Sampler_descriptor() __GPU
|
|
: _M_data_ptr(NULL), _M_sampler_ptr(NULL)
|
|
{
|
|
}
|
|
|
|
_Sampler_descriptor(_In_ _Sampler * _Sampler_ptr) __GPU
|
|
: _M_data_ptr(NULL), _M_sampler_ptr(NULL)
|
|
{
|
|
_Set_sampler_ptr(_Sampler_ptr);
|
|
}
|
|
|
|
// Destructor
|
|
~_Sampler_descriptor() __GPU
|
|
{
|
|
_Set_sampler_ptr(NULL);
|
|
}
|
|
|
|
// Copy constructor
|
|
_Sampler_descriptor(const _Sampler_descriptor &_Other) __GPU
|
|
: _M_data_ptr(_Other._M_data_ptr), _M_sampler_ptr(NULL)
|
|
{
|
|
_Set_sampler_ptr(_Other._M_sampler_ptr);
|
|
}
|
|
|
|
// Assignment operator
|
|
_Sampler_descriptor& operator=(const _Sampler_descriptor &_Other) __GPU
|
|
{
|
|
if (this != &_Other)
|
|
{
|
|
_M_data_ptr = _Other._M_data_ptr;
|
|
_Set_sampler_ptr(_Other._M_sampler_ptr);
|
|
}
|
|
|
|
return *this;
|
|
}
|
|
|
|
// Move constructor
|
|
_Sampler_descriptor(_Sampler_descriptor &&_Other) __CPU_ONLY
|
|
{
|
|
*this = std::move(_Other);
|
|
}
|
|
|
|
bool operator==(const _Sampler_descriptor &_Other) const __GPU
|
|
{
|
|
return _M_sampler_ptr == _Other._M_sampler_ptr && _M_data_ptr == _Other._M_data_ptr;
|
|
}
|
|
|
|
_Ret_ _Sampler* _Get_sampler_ptr() const __CPU_ONLY
|
|
{
|
|
return _M_sampler_ptr;
|
|
}
|
|
|
|
void _Set_sampler_ptr(_In_opt_ _Sampler *_Sampler_ptr) __CPU_ONLY
|
|
{
|
|
if (_M_sampler_ptr != _Sampler_ptr)
|
|
{
|
|
if (_M_sampler_ptr != NULL) {
|
|
reinterpret_cast<_Reference_counter*>(_M_sampler_ptr)->_Remove_reference();
|
|
}
|
|
|
|
_M_sampler_ptr = _Sampler_ptr;
|
|
|
|
if (_M_sampler_ptr != NULL) {
|
|
reinterpret_cast<_Reference_counter*>(_M_sampler_ptr)->_Add_reference();
|
|
}
|
|
}
|
|
}
|
|
|
|
#if !defined(_CXXAMP)
|
|
void _Set_sampler_ptr(_In_opt_ _Sampler *_Sampler_ptr) __GPU_ONLY
|
|
{
|
|
// No need to set the sampler ptr on the GPU
|
|
UNREFERENCED_PARAMETER(_Sampler_ptr);
|
|
_M_sampler_ptr = NULL;
|
|
}
|
|
#endif // _CXXAMP
|
|
|
|
} _Sampler_descriptor;
|
|
|
|
} // namespace Concurrency::details
|
|
|
|
// Forward declaration
|
|
class accelerator;
|
|
|
|
namespace details
|
|
{
|
|
_AMPIMP size_t __cdecl _Get_num_devices();
|
|
_AMPIMP _Ret_ _Accelerator_impl_ptr * __cdecl _Get_devices();
|
|
_AMPIMP accelerator __cdecl _Select_default_accelerator();
|
|
_AMPIMP bool __cdecl _Set_default_accelerator(_Accelerator_impl_ptr _Accl);
|
|
_AMPIMP bool __cdecl _Is_D3D_accelerator_view(const accelerator_view& _Av);
|
|
_AMPIMP void __cdecl _Register_async_event(const _Event &_Ev, const std::shared_future<void> &_Shared_future);
|
|
_AMPIMP _Access_mode __cdecl _Get_recommended_buffer_host_access_mode(const accelerator_view &_Av);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Queuing modes supported for accelerator views
|
|
/// </summary>
|
|
enum queuing_mode {
|
|
queuing_mode_immediate,
|
|
queuing_mode_automatic
|
|
};
|
|
|
|
/// <summary>
|
|
/// Exception thrown when the C++ AMP runtime encounters an error.
|
|
/// This is the base type for all C++ AMP exception types.
|
|
/// </summary>
|
|
class runtime_exception : public std::exception
|
|
{
|
|
public:
|
|
/// <summary>
|
|
/// Construct a runtime_exception exception with a message and an error code
|
|
/// </summary>
|
|
/// <param name="_Message">
|
|
/// Descriptive message of error
|
|
/// </param>
|
|
/// <param name="_Hresult">
|
|
/// HRESULT of error that caused this exception
|
|
/// </param>
|
|
_AMPIMP runtime_exception(const char * _Message, HRESULT _Hresult) throw();
|
|
|
|
/// <summary>
|
|
/// Construct a runtime_exception exception with an error code
|
|
/// </summary>
|
|
/// <param name="_Hresult">
|
|
/// HRESULT of error that caused this exception
|
|
/// </param>
|
|
_AMPIMP explicit runtime_exception(HRESULT _Hresult) throw();
|
|
|
|
/// <summary>
|
|
/// Copy construct a runtime_exception exception
|
|
/// </summary>
|
|
/// <param name="_Other">
|
|
/// The runtime_exception object to be copied from
|
|
/// </param>
|
|
_AMPIMP runtime_exception(const runtime_exception &_Other) throw();
|
|
|
|
/// <summary>
|
|
/// Assignment operator
|
|
/// </summary>
|
|
/// <param name="_Other">
|
|
/// The runtime_exception object to be assigned from
|
|
/// </param>
|
|
_AMPIMP runtime_exception &operator=(const runtime_exception &_Other) throw();
|
|
|
|
/// <summary>
|
|
/// Destruct a runtime_exception exception object instance
|
|
/// </summary>
|
|
_AMPIMP virtual ~runtime_exception() throw();
|
|
|
|
/// <summary>
|
|
/// Get the error code that caused this exception
|
|
/// </summary>
|
|
/// <returns>
|
|
/// HRESULT of error that caused the exception
|
|
/// </returns>
|
|
_AMPIMP HRESULT get_error_code() const throw();
|
|
|
|
private:
|
|
HRESULT _M_error_code;
|
|
}; // class runtime_exception
|
|
|
|
/// <summary>
|
|
/// Exception thrown when an underlying OS/DirectX call fails
|
|
/// due to lack of system or device memory
|
|
/// </summary>
|
|
class out_of_memory : public runtime_exception
|
|
{
|
|
public:
|
|
/// <summary>
|
|
/// Construct an out_of_memory exception with a message
|
|
/// </summary>
|
|
/// <param name="_Message">
|
|
/// Descriptive message of error
|
|
/// </param>
|
|
_AMPIMP explicit out_of_memory(const char * _Message) throw();
|
|
|
|
/// <summary>
|
|
/// Construct an out_of_memory exception
|
|
/// </summary>
|
|
_AMPIMP out_of_memory () throw();
|
|
}; // class out_of_memory
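
// Illustrative sketch (not part of the original header): C++ AMP runtime failures
// surface as runtime_exception or a derived type such as out_of_memory, so calling
// code typically wraps runtime operations in a try/catch and inspects the HRESULT.
// The device path below is hypothetical.
//
//     try {
//         concurrency::accelerator _Accl(L"PCI\\hypothetical_device_path");
//     }
//     catch (const concurrency::runtime_exception &_Ex) {
//         std::wprintf(L"C++ AMP error 0x%08X: %S\n", _Ex.get_error_code(), _Ex.what());
//     }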
|
|
|
|
namespace direct3d
|
|
{
|
|
/// <summary>
|
|
/// Get the D3D device interface underlying an accelerator_view.
|
|
/// </summary>
|
|
/// <param name="_Av">
|
|
/// The D3D accelerator_view for which the underlying D3D device interface is returned.
|
|
/// </param>
|
|
/// <returns>
|
|
/// The IUnknown interface pointer of the D3D device underlying the accelerator_view.
|
|
/// </returns>
|
|
_AMPIMP _Ret_ IUnknown * __cdecl get_device(const accelerator_view &_Av);
|
|
|
|
/// <summary>
|
|
/// Create an accelerator_view from a D3D device interface pointer.
|
|
/// </summary>
|
|
/// <param name="_D3D_device">
|
|
/// The D3D device interface pointer to create the accelerator_view from.
|
|
/// </param>
|
|
/// <param name="_Qmode">
|
|
/// The queuing_mode to be used for the newly created accelerator_view.
|
|
/// This parameter has a default value of queuing_mode_automatic.
|
|
/// </param>
|
|
/// <returns>
|
|
/// The accelerator_view created from the passed D3D device interface.
|
|
/// </returns>
|
|
_AMPIMP accelerator_view __cdecl create_accelerator_view(_In_ IUnknown *_D3D_device, queuing_mode _Qmode = queuing_mode_automatic);
|
|
|
|
/// <summary>
|
|
/// Create and return a new accelerator view on the specified accelerator.
|
|
/// </summary>
|
|
/// <param name="_Accelerator">
|
|
/// The accelerator on which the new accelerator_view is to be created.
|
|
/// </param>
|
|
/// <param name="_Disable_timeout">
|
|
/// A boolean parameter that specifies whether timeout should be disabled
|
|
/// for the newly created accelerator_view. This corresponds to the
|
|
/// D3D11_CREATE_DEVICE_DISABLE_GPU_TIMEOUT flag for Direct3D device creation
|
|
/// and is used to indicate if the operating system should allow workloads
|
|
/// that take more than 2 seconds to execute, without resetting the device
|
|
/// per the Windows timeout detection and recovery mechanism. Use of this flag
|
|
/// is recommended if you need to perform time consuming tasks on the accelerator_view.
|
|
/// </param>
|
|
/// <param name="_Qmode">
|
|
/// The queuing_mode to be used for the newly created accelerator_view.
|
|
/// This parameter has a default value of queuing_mode_automatic.
|
|
/// </param>
|
|
/// <returns>
|
|
/// The newly created accelerator_view.
|
|
/// </returns>
|
|
_AMPIMP accelerator_view __cdecl create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode = queuing_mode_automatic);
|
|
|
|
/// <summary>
|
|
/// Returns a boolean flag indicating if timeout is disabled
|
|
/// for the specified accelerator_view. This corresponds to the
|
|
/// D3D11_CREATE_DEVICE_DISABLE_GPU_TIMEOUT flag for Direct3D device creation.
|
|
/// </summary>
|
|
/// <param name="_Accelerator_view">
|
|
/// The accelerator_view for which the timeout disabled setting is to be queried.
|
|
/// </param>
|
|
/// <returns>
|
|
/// A boolean flag indicating if timeout is disabled for the specified accelerator_view.
|
|
/// </returns>
|
|
_AMPIMP bool __cdecl is_timeout_disabled(const accelerator_view& _Accelerator_view);
|
|
|
|
/// <summary>
|
|
/// Acquire a lock on an accelerator_view for the purpose of safely performing D3D operations on resources shared
|
|
/// with the accelerator_view. The accelerator_view and all C++ AMP resources associated with this accelerator_view
|
|
/// internally take this lock when performing operations and will block while another thread holds the D3D access lock.
|
|
///
|
|
/// This lock is non-recursive: It is undefined behavior to call this function from a thread that already holds the lock.
|
|
/// It is undefined behavior to perform operations on the accelerator_view or any data container associated with the
|
|
/// accelerator_view from the thread that holds the D3D access lock.
|
|
///
|
|
/// See also scoped_d3d_access_lock, a RAII-style class for a scope-based D3D access lock.
|
|
/// </summary>
|
|
/// <param name="_Av">
|
|
/// The accelerator_view to lock.
|
|
/// </param>
|
|
_AMPIMP void __cdecl d3d_access_lock(accelerator_view &_Av);
|
|
|
|
/// <summary>
|
|
/// Attempt to acquire the D3D access lock on an accelerator_view without blocking.
|
|
/// </summary>
|
|
/// <param name="_Av">
|
|
/// The accelerator_view to lock.
|
|
/// </param>
|
|
/// <returns>
|
|
/// true if the lock was acquired, or false if it is currently held by another thread.
|
|
/// </returns>
|
|
_AMPIMP bool __cdecl d3d_access_try_lock(accelerator_view &_Av);
|
|
|
|
/// <summary>
|
|
/// Release the D3D access lock on the given accelerator_view. If the calling thread does
|
|
/// not hold the lock on the accelerator_view the results are undefined.
|
|
/// </summary>
|
|
/// <param name="_Av">
|
|
/// The accelerator_view for which the lock is to be released.
|
|
/// </param>
|
|
_AMPIMP void __cdecl d3d_access_unlock(accelerator_view &_Av);
|
|
|
|
/// <summary>
|
|
/// Tag type to indicate the D3D access lock should be adopted rather than
|
|
/// acquired.
|
|
/// </summary>
|
|
struct adopt_d3d_access_lock_t {};
|
|
|
|
/// <summary>
|
|
/// RAII wrapper for a D3D access lock on an accelerator_view.
|
|
/// </summary>
|
|
class scoped_d3d_access_lock
|
|
{
|
|
public:
|
|
/// <summary>
|
|
/// Acquire a D3D access lock on the given accelerator_view. The lock is released
|
|
/// when this object goes out of scope. Construction will block until the lock
|
|
/// is acquired.
|
|
/// </summary>
|
|
/// <param name="_Av">
|
|
/// The accelerator_view to lock.
|
|
/// </param>
|
|
_AMPIMP explicit scoped_d3d_access_lock(accelerator_view &_Av);
|
|
|
|
/// <summary>
|
|
/// Construct a scoped_d3d_access_lock on an accelerator_view for which the lock
|
|
/// is already held (e.g. was acquired by d3d_access_try_lock). The D3D access
|
|
/// lock must already be held by the calling thread and not controlled by any other
|
|
/// scoped_d3d_access_lock.
|
|
/// </summary>
|
|
/// <param name="_Av">
|
|
/// The accelerator_view for the lock to adopt.
|
|
/// </param>
|
|
/// <param name="_T">
|
|
/// The adopt_d3d_access_lock object.
|
|
/// </param>
|
|
_AMPIMP explicit scoped_d3d_access_lock(accelerator_view &_Av, adopt_d3d_access_lock_t _T);
|
|
|
|
/// <summary>
|
|
/// Destructor for scoped_d3d_access_lock: unlock the accelerator_view.
|
|
/// </summary>
|
|
_AMPIMP ~scoped_d3d_access_lock();
|
|
|
|
/// <summary>
|
|
/// Move constructor for scoped_d3d_access_lock: Take ownership of
|
|
/// a lock from another scoped_d3d_access_lock.
|
|
/// </summary>
|
|
/// <param name="_Other">
|
|
/// The scoped_d3d_access_lock from which to move.
|
|
/// </param>
|
|
_AMPIMP scoped_d3d_access_lock(scoped_d3d_access_lock &&_Other);
|
|
|
|
/// <summary>
|
|
/// Move assignment operator for scoped_d3d_access_lock: Take ownership
|
|
/// of a lock from another scoped_d3d_access_lock, releasing the previous
|
|
/// lock.
|
|
/// </summary>
|
|
/// <param name="_Other">
|
|
/// The scoped_d3d_access_lock from which to move.
|
|
/// </param>
|
|
/// <returns>
|
|
/// A reference to this scoped_d3d_access_lock.
|
|
/// </returns>
|
|
_AMPIMP scoped_d3d_access_lock& operator=(scoped_d3d_access_lock &&_Other);
|
|
|
|
private:
|
|
// No copy constructor
|
|
scoped_d3d_access_lock(const scoped_d3d_access_lock &_Other);
|
|
|
|
// No assignment operator
|
|
scoped_d3d_access_lock & operator=(const scoped_d3d_access_lock &_Other);
|
|
|
|
_Accelerator_view_impl_ptr _M_impl;
|
|
};
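
// Illustrative sketch (not part of the original header): taking the D3D access lock
// via the RAII wrapper before operating directly on D3D resources shared with an
// accelerator_view. _Do_d3d_work is a hypothetical helper.
//
//     void _Interop_update(concurrency::accelerator_view &_Av)
//     {
//         concurrency::direct3d::scoped_d3d_access_lock _Lock(_Av); // blocks until acquired
//         _Do_d3d_work(concurrency::direct3d::get_device(_Av));     // safe while the lock is held
//     }                                                             // lock released here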
|
|
} // namespace direct3d
|
|
|
|
/// <summary>
|
|
/// Class represents an accelerator abstraction for C++ AMP data-parallel devices
|
|
/// </summary>
|
|
class accelerator
|
|
{
|
|
friend class accelerator_view;
|
|
|
|
friend class details::_Ubiquitous_buffer;
|
|
|
|
friend _AMPIMP accelerator details::_Select_default_accelerator();
|
|
|
|
_AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode /* = queuing_mode_automatic */);
|
|
|
|
friend _Ret_ details::_Accelerator_impl* details::_Get_accelerator_impl_ptr(const accelerator& _Accl);
|
|
|
|
public:
|
|
|
|
/// <summary>
|
|
/// String constant for default accelerator
|
|
/// </summary>
|
|
_AMPIMP static const wchar_t default_accelerator[];
|
|
|
|
/// <summary>
|
|
/// String constant for cpu accelerator
|
|
/// </summary>
|
|
_AMPIMP static const wchar_t cpu_accelerator[];
|
|
|
|
/// <summary>
|
|
/// String constant for direct3d WARP accelerator
|
|
/// </summary>
|
|
_AMPIMP static const wchar_t direct3d_warp[];
|
|
|
|
/// <summary>
|
|
/// String constant for direct3d reference accelerator
|
|
/// </summary>
|
|
_AMPIMP static const wchar_t direct3d_ref[];
|
|
|
|
/// <summary>
|
|
/// Construct an accelerator representing the default accelerator
|
|
/// </summary>
|
|
_AMPIMP accelerator();
|
|
|
|
/// <summary>
|
|
/// Construct an accelerator representing the accelerator with the
|
|
/// specified device instance path
|
|
/// </summary>
|
|
explicit accelerator(const std::wstring &_Device_path) : _M_impl(NULL)
|
|
{
|
|
_Init(_Device_path.c_str());
|
|
}
|
|
|
|
/// <summary>
|
|
/// Destructor
|
|
/// </summary>
|
|
_AMPIMP ~accelerator();
|
|
|
|
/// <summary>
|
|
/// Copy constructor
|
|
/// </summary>
|
|
_AMPIMP accelerator(const accelerator &_Other);
|
|
|
|
/// <summary>
|
|
/// Assignment operator
|
|
/// </summary>
|
|
_AMPIMP accelerator &operator=(const accelerator &_Other);
|
|
|
|
/// <summary>
|
|
/// Returns the vector of accelerator objects representing all available accelerators
|
|
/// </summary>
|
|
/// <returns>
|
|
/// The vector of available accelerators
|
|
/// </returns>
|
|
static inline std::vector<accelerator> get_all()
|
|
{
|
|
std::vector<accelerator> _AcceleratorVector;
|
|
size_t _NumDevices = details::_Get_num_devices();
|
|
for (size_t _I = 0; (_I < _NumDevices); ++_I)
|
|
{
|
|
_AcceleratorVector.push_back(details::_Get_devices()[_I]);
|
|
}
|
|
|
|
return _AcceleratorVector;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Sets the default accelerator to be used for any operation
|
|
/// that implicitly uses the default accelerator. This method
|
|
/// only succeeds if the runtime selected default accelerator
|
|
/// has not already been used in an operation that implicitly
|
|
/// uses the default accelerator
|
|
/// </summary>
|
|
/// <returns>
|
|
/// A boolean value indicating if the call succeeds in setting
|
|
/// the default accelerator
|
|
/// </returns>
|
|
static inline bool set_default(const std::wstring& _Path)
|
|
{
|
|
accelerator _Accl(_Path);
|
|
return details::_Set_default_accelerator(_Accl._M_impl);
|
|
}
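
// Illustrative sketch (not part of the original header): enumerating the available
// accelerators and promoting the first non-emulated one to be the process-wide
// default. The selection policy is just an example.
//
//     for (const concurrency::accelerator &_Accl : concurrency::accelerator::get_all()) {
//         if (!_Accl.is_emulated) {
//             concurrency::accelerator::set_default(_Accl.device_path);
//             break;
//         }
//     }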
|
|
|
|
/// <summary>
|
|
/// Returns the auto selection accelerator_view which when specified
|
|
/// as the parallel_for_each target results in the target accelerator_view
|
|
/// for executing the parallel_for_each kernel to be automatically selected
|
|
/// by the runtime. For all other purposes, the accelerator_view returned
|
|
/// by this method is the same as the default accelerator_view of the default
|
|
/// accelerator
|
|
/// </summary>
|
|
_AMPIMP static accelerator_view __cdecl get_auto_selection_view();
|
|
|
|
/// <summary>
|
|
/// Returns the system-wide unique device instance path as a std::wstring
|
|
/// </summary>
|
|
std::wstring get_device_path() const
|
|
{
|
|
return _Get_device_path();
|
|
}
|
|
|
|
__declspec(property(get=get_device_path)) std::wstring device_path;
|
|
|
|
/// <summary>
|
|
/// Get the version for this accelerator
|
|
/// </summary>
|
|
_AMPIMP unsigned int get_version() const;
|
|
__declspec(property(get=get_version)) unsigned int version; // hiword=major, loword=minor
|
|
|
|
/// <summary>
|
|
/// Returns the device description as a std::wstring
|
|
/// </summary>
|
|
std::wstring get_description() const
|
|
{
|
|
return _Get_description();
|
|
}
|
|
|
|
__declspec(property(get=get_description)) std::wstring description;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator
|
|
/// was created with DEBUG layer enabled for extensive error reporting
|
|
/// </summary>
|
|
_AMPIMP bool get_is_debug() const;
|
|
__declspec(property(get=get_is_debug)) bool is_debug;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator is emulated.
|
|
/// This is true, for example, with the direct3d reference and WARP accelerators.
|
|
/// </summary>
|
|
_AMPIMP bool get_is_emulated() const;
|
|
__declspec(property(get=get_is_emulated)) bool is_emulated;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator
|
|
/// is attached to a display
|
|
/// </summary>
|
|
_AMPIMP bool get_has_display() const;
|
|
__declspec(property(get=get_has_display)) bool has_display;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator
|
|
/// supports full double precision (including double division,
|
|
/// precise_math functions, int to double, double to int conversions)
|
|
/// in a parallel_for_each kernel.
|
|
/// </summary>
|
|
_AMPIMP bool get_supports_double_precision() const;
|
|
__declspec(property(get=get_supports_double_precision)) bool supports_double_precision;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator
|
|
/// has limited double precision support (excludes double division,
|
|
/// precise_math functions, int to double, double to int conversions)
|
|
/// for a parallel_for_each kernel.
|
|
/// </summary>
|
|
_AMPIMP bool get_supports_limited_double_precision() const;
|
|
__declspec(property(get=get_supports_limited_double_precision)) bool supports_limited_double_precision;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator
|
|
/// supports memory accessible both by the accelerator and the CPU.
|
|
/// </summary>
|
|
_AMPIMP bool get_supports_cpu_shared_memory() const;
|
|
__declspec(property(get=get_supports_cpu_shared_memory)) bool supports_cpu_shared_memory;
|
|
|
|
/// <summary>
|
|
/// Return the default accelerator view associated with this accelerator
|
|
/// </summary>
|
|
_AMPIMP accelerator_view get_default_view() const;
|
|
__declspec(property(get=get_default_view)) accelerator_view default_view;
|
|
|
|
/// <summary>
|
|
/// Get the dedicated memory for this accelerator in KB
|
|
/// </summary>
|
|
_AMPIMP size_t get_dedicated_memory() const;
|
|
__declspec(property(get=get_dedicated_memory)) size_t dedicated_memory;
|
|
|
|
/// <summary>
|
|
/// Get the default cpu access_type for buffers created on this accelerator
|
|
/// </summary>
|
|
_AMPIMP access_type get_default_cpu_access_type() const;
|
|
__declspec(property(get=get_default_cpu_access_type)) access_type default_cpu_access_type;
|
|
|
|
/// <summary>
|
|
/// Set the default cpu access_type for arrays created on this accelerator
|
|
/// or for implicit memory allocations as part of array_views accessed
|
|
/// on this accelerator. This method only succeeds if the default_cpu_access_type
|
|
/// for the accelerator has not already been overridden by a previous call to this method
|
|
/// and the runtime selected default_cpu_access_type for this accelerator has not yet
|
|
/// been used for allocating an array or for an implicit memory allocation backing an
|
|
/// array_view accessed on this accelerator.
|
|
/// </summary>
|
|
/// <param name="_Default_cpu_access_type">
|
|
/// The default cpu access_type to be used for array/array_view memory allocations
|
|
/// on this accelerator.
|
|
/// </param>
|
|
/// <returns>
|
|
/// A boolean value indicating if the default cpu access_type for the accelerator
|
|
/// was successfully set.
|
|
/// </returns>
|
|
_AMPIMP bool set_default_cpu_access_type(access_type _Default_cpu_access_type);
|
|
|
|
/// <summary>
|
|
/// Create and return a new accelerator view on this accelerator
|
|
/// with the specified queuing mode. When unspecified the accelerator_view
|
|
/// is created with queuing_mode_automatic queuing mode.
|
|
/// </summary>
|
|
_AMPIMP accelerator_view create_view(queuing_mode qmode = queuing_mode_automatic);
|
|
|
|
/// <summary>
|
|
/// Return true if the other accelerator is the same as this accelerator; false otherwise
|
|
/// </summary>
|
|
_AMPIMP bool operator==(const accelerator &_Other) const;
|
|
|
|
/// <summary>
|
|
/// Return false if the other accelerator is the same as this accelerator; true otherwise
|
|
/// </summary>
|
|
_AMPIMP bool operator!=(const accelerator &_Other) const;
|
|
|
|
private:
|
|
|
|
// Private constructor
|
|
_AMPIMP accelerator(_Accelerator_impl_ptr _Impl);
|
|
|
|
// Private helper methods
|
|
_AMPIMP const wchar_t *_Get_device_path() const;
|
|
_AMPIMP const wchar_t *_Get_description() const;
|
|
|
|
_AMPIMP void _Init(const wchar_t *_Path);
|
|
|
|
private:
|
|
|
|
_Accelerator_impl_ptr _M_impl;
|
|
};
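
// Illustrative sketch (not part of the original header): querying an accelerator's
// capabilities and creating an additional accelerator_view with immediate queuing.
//
//     concurrency::accelerator _Accl;                    // default accelerator
//     if (_Accl.supports_double_precision && !_Accl.is_debug) {
//         concurrency::accelerator_view _Av =
//             _Accl.create_view(concurrency::queuing_mode_immediate);
//         // ... submit work targeting _Av ...
//     }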
|
|
|
|
/// <summary>
|
|
/// Class represents a future corresponding to a C++ AMP asynchronous operation
|
|
/// </summary>
|
|
class completion_future
|
|
{
|
|
friend class details::_Amp_runtime_trace;
|
|
public:
|
|
|
|
/// <summary>
|
|
/// Default constructor
|
|
/// </summary>
|
|
completion_future()
|
|
{
|
|
}
|
|
|
|
/// <summary>
|
|
/// Copy constructor
|
|
/// </summary>
|
|
completion_future(const completion_future& _Other)
|
|
: _M_shared_future(_Other._M_shared_future),
|
|
_M_task(_Other._M_task)
|
|
{
|
|
}
|
|
|
|
/// <summary>
|
|
/// Move constructor
|
|
/// </summary>
|
|
completion_future(completion_future&& _Other)
|
|
: _M_shared_future(std::move(_Other._M_shared_future)),
|
|
_M_task(std::move(_Other._M_task))
|
|
{
|
|
}
|
|
|
|
/// <summary>
|
|
/// Destructor
|
|
/// </summary>
|
|
~completion_future()
|
|
{
|
|
}
|
|
|
|
/// <summary>
|
|
/// Copy assignment operator
|
|
/// </summary>
|
|
completion_future& operator=(const completion_future& _Other)
|
|
{
|
|
if (this != &_Other) {
|
|
_M_shared_future = _Other._M_shared_future;
|
|
_M_task = _Other._M_task;
|
|
}
|
|
|
|
return (*this);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Move assignment operator
|
|
/// </summary>
|
|
completion_future& operator=(completion_future&& _Other)
|
|
{
|
|
if (this != &_Other) {
|
|
_M_shared_future = std::move(_Other._M_shared_future);
|
|
_M_task = std::move(_Other._M_task);
|
|
}
|
|
|
|
return (*this);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Waits until the associated asynchronous operation completes
|
|
/// Throws the stored exception if one was encountered during the
|
|
/// asynchronous operation
|
|
/// </summary>
|
|
void get() const
|
|
{
|
|
_M_shared_future.get();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Returns true if the object is associated with an asynchronous
|
|
/// operation
|
|
/// </summary>
|
|
/// <returns>
|
|
/// true if the object is associated with an asynchronous operation
|
|
/// and false otherwise
|
|
/// </returns>
|
|
bool valid() const
|
|
{
|
|
return _M_shared_future.valid();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Blocks until the associated asynchronous operation completes
|
|
/// </summary>
|
|
void wait() const
|
|
{
|
|
_M_shared_future.wait();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Blocks until the associated asynchronous operation completes or
|
|
/// _Rel_time has elapsed
|
|
/// </summary>
|
|
/// <returns>
|
|
/// - future_status::deferred if the associated asynchronous operation is not running
|
|
/// - future_status::ready if the associated asynchronous operation is finished
|
|
/// - future_status::timeout if the time period specified has elapsed
|
|
/// </returns>
|
|
template <class _Rep, class _Period>
|
|
std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const
|
|
{
|
|
return _M_shared_future.wait_for(_Rel_time);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Blocks until the associated asynchronous operation completes or
|
|
/// until the current time exceeds _Abs_time
|
|
/// </summary>
|
|
/// <returns>
|
|
/// - future_status::deferred if the associated asynchronous operation is not running
|
|
/// - future_status::ready if the associated asynchronous operation is finished
|
|
/// - future_status::timeout if the time point specified has been reached
|
|
/// </returns>
|
|
template <class _Clock, class _Duration>
|
|
std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const
|
|
{
|
|
return _M_shared_future.wait_until(_Abs_time);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Returns a std::shared_future<void> object corresponding to the
|
|
/// associated asynchronous operation
|
|
/// </summary>
|
|
/// <returns>
|
|
/// A std::shared_future<void> object corresponding to the associated
|
|
/// asynchronous operation
|
|
/// </returns>
|
|
operator std::shared_future<void>() const
|
|
{
|
|
return _M_shared_future;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Chains a callback Functor to the completion_future to be executed
|
|
/// when the associated asynchronous operation finishes execution
|
|
/// </summary>
|
|
template <typename _Functor>
|
|
void then(const _Functor &_Func) const
|
|
{
|
|
this->to_task().then(_Func);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Returns a concurrency::task<void> object corresponding to the
|
|
/// associated asynchronous operation
|
|
/// </summary>
|
|
/// <returns>
|
|
/// A concurrency::task<void> object corresponding to the associated
|
|
/// asynchronous operation
|
|
/// </returns>
|
|
concurrency::task<void> to_task() const
|
|
{
|
|
return _M_task;
|
|
}
|
|
|
|
private:
|
|
|
|
// Private constructor
|
|
completion_future(const std::shared_future<void> &_Shared_future,
|
|
const concurrency::task<void>& _Task)
|
|
: _M_shared_future(_Shared_future), _M_task(_Task)
|
|
{
|
|
}
|
|
|
|
std::shared_future<void> _M_shared_future;
|
|
concurrency::task<void> _M_task;
|
|
};
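
// Illustrative sketch (not part of the original header): waiting on and chaining
// work after a C++ AMP asynchronous operation. _Start_async_copy is a hypothetical
// function returning a completion_future.
//
//     concurrency::completion_future _Cf = _Start_async_copy();
//     _Cf.then([]() { /* runs when the copy finishes */ });   // continuation via PPL task
//     _Cf.wait();                                             // or block until completion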
|
|
|
|
/// <summary>
|
|
/// Class represents a virtual device abstraction on a C++ AMP data-parallel accelerator
|
|
/// </summary>
|
|
class accelerator_view
|
|
{
|
|
friend class accelerator;
|
|
friend class details::_Buffer;
|
|
friend class details::_Texture;
|
|
friend class details::_Sampler;
|
|
friend class details::_Ubiquitous_buffer;
|
|
friend class details::_D3D_interop;
|
|
friend class details::_D3D_accelerator_view_impl;
|
|
friend class details::_CPU_accelerator_view_impl;
|
|
friend class details::_Accelerator_view_hasher;
|
|
|
|
_AMPIMP friend _Ret_ IUnknown * __cdecl direct3d::get_device(const accelerator_view &_Av);
|
|
|
|
_AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(_In_ IUnknown *_D3D_device, queuing_mode qmode /* = queuing_mode_automatic */);
|
|
|
|
_AMPIMP friend accelerator_view __cdecl direct3d::create_accelerator_view(accelerator& _Accelerator, bool _Disable_timeout, queuing_mode _Qmode /* = queuing_mode_automatic */);
|
|
|
|
_AMPIMP friend bool __cdecl direct3d::is_timeout_disabled(const accelerator_view& _Accelerator_view);
|
|
|
|
friend _Ret_ details::_Accelerator_view_impl* details::_Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view);
|
|
|
|
public:
|
|
|
|
/// <summary>
|
|
/// Destructor
|
|
/// </summary>
|
|
_AMPIMP ~accelerator_view();
|
|
|
|
/// <summary>
|
|
/// Copy constructor
|
|
/// </summary>
|
|
_AMPIMP accelerator_view(const accelerator_view &_Other);
|
|
|
|
/// <summary>
|
|
/// Assignment operator
|
|
/// </summary>
|
|
_AMPIMP accelerator_view &operator=(const accelerator_view &_Other);
|
|
|
|
/// <summary>
|
|
/// Get the accelerator for this accelerator view
|
|
/// </summary>
|
|
_AMPIMP accelerator get_accelerator() const;
|
|
__declspec(property(get=get_accelerator)) Concurrency::accelerator accelerator;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator view
|
|
/// was created with DEBUG layer enabled for extensive error reporting
|
|
/// </summary>
|
|
_AMPIMP bool get_is_debug() const;
|
|
__declspec(property(get=get_is_debug)) bool is_debug;
|
|
|
|
/// <summary>
|
|
/// Get the version for this accelerator view
|
|
/// </summary>
|
|
_AMPIMP unsigned int get_version() const;
|
|
__declspec(property(get=get_version)) unsigned int version; // hiword=major, loword=minor
|
|
|
|
/// <summary>
|
|
/// Get the queuing mode for this accelerator view
|
|
/// </summary>
|
|
_AMPIMP queuing_mode get_queuing_mode() const;
|
|
__declspec(property(get=get_queuing_mode)) Concurrency::queuing_mode queuing_mode;
|
|
|
|
/// <summary>
|
|
/// Returns a boolean value indicating whether the accelerator view
|
|
/// when passed to a parallel_for_each would result in automatic
|
|
/// selection of an appropriate execution target by the runtime
|
|
/// </summary>
|
|
_AMPIMP bool get_is_auto_selection() const;
|
|
__declspec(property(get=get_is_auto_selection)) bool is_auto_selection;
|
|
|
|
/// <summary>
|
|
/// Return true if the other accelerator view is the same as this accelerator view; false otherwise
|
|
/// </summary>
|
|
_AMPIMP bool operator==(const accelerator_view &_Other) const;
|
|
|
|
/// <summary>
|
|
/// Return false if the other accelerator view is the same as this accelerator view; true otherwise
|
|
/// </summary>
|
|
_AMPIMP bool operator!=(const accelerator_view &_Other) const;
|
|
|
|
/// <summary>
|
|
/// Waits for completion of all commands submitted so far to this accelerator_view
|
|
/// </summary>
|
|
_AMPIMP void wait();
|
|
|
|
/// <summary>
|
|
/// Submit all pending commands queued to this accelerator_view to the accelerator
|
|
/// for execution.
|
|
/// </summary>
|
|
_AMPIMP void flush();
|
|
|
|
/// <summary>
|
|
/// Return a future to track the completion of all commands submitted so far to this accelerator_view
|
|
/// </summary>
|
|
_AMPIMP concurrency::completion_future create_marker();
|
|
|
|
private:
|
|
|
|
// No default constructor
|
|
accelerator_view();
|
|
|
|
// Private constructor
|
|
_AMPIMP accelerator_view(_Accelerator_view_impl_ptr _Impl, bool _Auto_selection = false);
|
|
|
|
private:
|
|
|
|
_Accelerator_view_impl_ptr _M_impl;
|
|
bool _M_auto_selection;
|
|
};
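
// Illustrative sketch (not part of the original header): typical use of the
// synchronization members after queuing work to an accelerator_view.
//
//     concurrency::accelerator_view _Av = concurrency::accelerator().default_view;
//     // ... queue parallel_for_each invocations targeting _Av ...
//     _Av.flush();                                               // submit pending commands
//     concurrency::completion_future _Marker = _Av.create_marker();
//     _Marker.wait();                                            // wait for everything submitted so far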
|
|
|
|
namespace details
|
|
{
|
|
inline _Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl_ptr(const accelerator_view& _Accl_view)
|
|
{
|
|
return _Accl_view._M_impl;
|
|
}
|
|
|
|
inline _Ret_ _Accelerator_impl* _Get_accelerator_impl_ptr(const accelerator& _Accl)
|
|
{
|
|
return _Accl._M_impl;
|
|
}
|
|
|
|
// Type defining a hasher for accelerator_view objects
|
|
// for use with std::unordered_set and std::unordered_map
|
|
class _Accelerator_view_hasher
|
|
{
|
|
public:
|
|
size_t operator()(const accelerator_view &_Accl_view) const
|
|
{
|
|
std::hash<_Accelerator_view_impl*> _HashFunctor;
|
|
return _HashFunctor(_Accl_view._M_impl._Get_ptr());
|
|
}
|
|
};
|
|
|
|
typedef std::unordered_set<accelerator_view, _Accelerator_view_hasher> _Accelerator_view_unordered_set;
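
// Illustrative sketch (not part of the original header): the hasher allows
// accelerator_view objects to key standard unordered containers, e.g. to cache
// per-view state.
//
//     std::unordered_map<accelerator_view, int, _Accelerator_view_hasher> _Per_view_count;
//     _Per_view_count[accelerator().default_view]++;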
|
|
|
|
// Describes the N dimensional shape of a view in a buffer
|
|
class _View_shape : public _Reference_counter
|
|
{
|
|
public:
|
|
|
|
_AMPIMP static _Ret_ _View_shape* __cdecl _Create_view_shape(unsigned int _Rank, unsigned int _Linear_offset,
|
|
const unsigned int *_Base_extent, const unsigned int *_View_offset,
|
|
const unsigned int *_View_extent, const bool *_Projection_info = NULL);
|
|
|
|
_AMPIMP _Ret_ _View_shape* _Get_reduced_shape_for_copy();
|
|
|
|
inline unsigned int _Get_rank() const
|
|
{
|
|
return _M_rank;
|
|
}
|
|
|
|
inline unsigned int _Get_linear_offset() const
|
|
{
|
|
return _M_linear_offset;
|
|
}
|
|
|
|
inline const unsigned int *_Get_base_extent() const
|
|
{
|
|
return _M_base_extent;
|
|
}
|
|
|
|
inline const unsigned int *_Get_view_offset() const
|
|
{
|
|
return _M_view_offset;
|
|
}
|
|
inline const unsigned int *_Get_view_extent() const
|
|
{
|
|
return _M_view_extent;
|
|
}
|
|
|
|
inline const bool *_Get_projection_info() const
|
|
{
|
|
return _M_projection_info;
|
|
}
|
|
|
|
inline bool _Is_projection() const
|
|
{
|
|
return _M_projection_info[0];
|
|
}
|
|
|
|
inline bool _Is_valid(size_t _Buffer_size) const
|
|
{
|
|
// The end point of the base shape should not be greater than the size of the buffer
|
|
size_t endLinearOffset = _M_linear_offset + _Get_extent_size(_M_rank, _M_base_extent);
|
|
if (endLinearOffset > _Buffer_size) {
|
|
return false;
|
|
}
|
|
|
|
return _Is_valid();
|
|
}
|
|
|
|
inline unsigned int _Get_view_size() const
|
|
{
|
|
return _Get_extent_size(_M_rank, _M_view_extent);
|
|
}
|
|
|
|
inline unsigned int _Get_view_linear_offset() const
|
|
{
|
|
return _Get_linear_offset(_M_view_offset);
|
|
}
|
|
|
|
static inline bool
|
|
_Compare_extent_with_elem_size(unsigned int _Rank, const unsigned int *_Extent1, size_t _Elem_size1, const unsigned int *_Extent2, size_t _Elem_size2)
|
|
{
|
|
_ASSERTE((_Rank >= 1) && (_Extent1 != NULL)&& (_Extent2 != NULL));
|
|
|
|
// The extents should match accounting for the element sizes of the respective buffers
|
|
if ((_Extent1[_Rank - 1] * _Elem_size1) != (_Extent2[_Rank - 1] * _Elem_size2))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
// Now compare the extent in all but the least significant dimension
|
|
if ((_Rank > 1) && !_Compare_extent(_Rank - 1, _Extent1, _Extent2))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
static inline bool
|
|
_Compare_extent(unsigned int _Rank, const unsigned int *_Extent1, const unsigned int *_Extent2)
|
|
{
|
|
for (size_t _I = 0; _I < _Rank; ++_I) {
|
|
if (_Extent1[_I] != _Extent2[_I]) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
inline bool _Is_view_linear(unsigned int &_Linear_offset, unsigned int &_Linear_size) const
|
|
{
|
|
// The effective rank for the purpose of determining linearity
|
|
// depends on the highest dimension in which the extent is not 1
|
|
unsigned int _First_dim_with_non_unit_extent = 0;
|
|
while ((_First_dim_with_non_unit_extent < _M_rank) && (_M_view_extent[_First_dim_with_non_unit_extent] == 1)) {
|
|
_First_dim_with_non_unit_extent++;
|
|
}
|
|
|
|
unsigned int _Effective_rank = (_M_rank - _First_dim_with_non_unit_extent);
|
|
|
|
// It is linear if the effective rank is <= 1 or the base extent
|
|
// and view extent are the same in all but the highest dimension with
|
|
// non-unit extent
|
|
if ((_Effective_rank <= 1) ||
|
|
(_Compare_extent(_Effective_rank - 1, &_M_base_extent[_First_dim_with_non_unit_extent + 1], &_M_view_extent[_First_dim_with_non_unit_extent + 1])))
|
|
{
|
|
_Linear_offset = _Get_view_linear_offset();
|
|
_Linear_size = _Get_view_size();
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
inline bool _Overlaps(const _View_shape* _Other) const
|
|
{
|
|
if (_Compare_base_shape(_Other))
|
|
{
|
|
// If the base shapes are identical we will do the N-dimensional
|
|
// bounding box overlap test
|
|
|
|
for (size_t _I = 0; _I < _M_rank; ++_I)
|
|
{
|
|
if (!_Intervals_overlap(_M_view_offset[_I], _M_view_offset[_I] + _M_view_extent[_I] - 1,
|
|
_Other->_M_view_offset[_I], _Other->_M_view_offset[_I] + _Other->_M_view_extent[_I] - 1))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
// The base shapes are different. Check based on linear intervals
|
|
size_t firstStart = _Get_view_linear_offset();
|
|
size_t firstEnd = firstStart + _Get_view_size() - 1;
|
|
|
|
size_t secondStart = _Other->_Get_view_linear_offset();
|
|
size_t secondEnd = secondStart + _Other->_Get_view_size() - 1;
|
|
|
|
return _Intervals_overlap(firstStart, firstEnd, secondStart, secondEnd);
|
|
}
|
|
}
|
|
|
|
inline bool _Subsumes(const _View_shape* _Other) const
|
|
{
|
|
// The subsumption test can only be done for shapes that have the same base shape or
// when both have a rank of 1
|
|
if ((_M_rank == 1) && (_Other->_Get_rank() == 1))
|
|
{
|
|
size_t thisStart = _Get_view_linear_offset();
|
|
size_t thisEnd = thisStart + _Get_view_size() - 1;
|
|
|
|
size_t otherStart = _Other->_Get_view_linear_offset();
|
|
size_t otherEnd = otherStart + _Other->_Get_view_size() - 1;
|
|
|
|
return ((otherStart >= thisStart) && (otherEnd <= thisEnd));
|
|
}
|
|
|
|
if (!_Compare_base_shape(_Other)) {
|
|
return false;
|
|
}
|
|
|
|
if (!_Contains(_Other->_Get_view_offset())) {
|
|
return false;
|
|
}
|
|
|
|
std::vector<unsigned int> otherEndPointIndex(_M_rank);
|
|
for (size_t _I = 0; _I < _M_rank; ++_I) {
|
|
otherEndPointIndex[_I] = _Other->_Get_view_offset()[_I] + _Other->_Get_view_extent()[_I] - 1;
|
|
}
|
|
|
|
return _Contains(otherEndPointIndex.data());
|
|
}
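// Illustrative example (not part of the original header, values assumed): with the same base
// shape, a view at offset {0, 0} with extent {4, 4} subsumes a view at offset {1, 1} with
// extent {2, 2}, since both that view's offset and its far corner {2, 2} lie inside this view.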
|
|
|
|
private:
|
|
// Private constructor to force construction through the _Create_view_shape method
|
|
_View_shape(unsigned int _Rank, unsigned int _Linear_offset,
|
|
const unsigned int *_Base_extent, const unsigned int *_View_offset,
|
|
const unsigned int *_View_extent, const bool *_Projection_info);
|
|
|
|
virtual ~_View_shape();
|
|
|
|
// No default constructor or copy/assignment
|
|
_View_shape();
|
|
_View_shape(const _View_shape &_Other);
|
|
_View_shape(_View_shape &&_Other);
|
|
_View_shape& operator=(const _View_shape &_Other);
|
|
_View_shape& operator=(_View_shape &&_Other);
|
|
|
|
// Helper methods
|
|
static bool _Intervals_overlap(size_t _First_start, size_t _First_end,
|
|
size_t _Second_start, size_t _Second_end)
|
|
{
|
|
// Order the intervals by their start points
|
|
if (_First_start > _Second_start) {
|
|
size_t temp = _First_start;
|
|
_First_start = _Second_start;
|
|
_Second_start = temp;
|
|
|
|
temp = _First_end;
|
|
_First_end = _Second_end;
|
|
_Second_end = temp;
|
|
}
|
|
|
|
// The start of the second one must be within the bounds of the first one
|
|
return (_Second_start <= _First_end);
|
|
}
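// Illustrative example (not part of the original header, values assumed):
// _Intervals_overlap(0, 9, 5, 20) and _Intervals_overlap(5, 20, 0, 9) both return true,
// while _Intervals_overlap(0, 4, 5, 9) returns false because the intervals are disjoint.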
|
|
|
|
static unsigned int _Get_extent_size(unsigned int _Rank, const unsigned int *_Extent)
|
|
{
|
|
unsigned int totalExtent = 1;
|
|
for (size_t _I = 0; _I < _Rank; ++_I) {
|
|
totalExtent *= _Extent[_I];
|
|
}
|
|
|
|
return totalExtent;
|
|
}
|
|
|
|
inline bool _Is_valid() const
|
|
{
|
|
if (_M_rank == 0) {
|
|
return false;
|
|
}
|
|
|
|
// Ensure the _M_view_offset + _M_view_extent is within the bounds of _M_base_extent
|
|
size_t viewSize = 1;
|
|
|
|
for (size_t _I = 0; _I < _M_rank; ++_I)
|
|
{
|
|
viewSize *= _M_view_extent[_I];
|
|
if ((_M_view_offset[_I] + _M_view_extent[_I]) > _M_base_extent[_I]) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (viewSize == 0) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
inline bool _Compare_base_shape(const _View_shape* _Other) const
|
|
{
|
|
return ((_M_rank == _Other->_M_rank) &&
|
|
(_M_linear_offset == _Other->_M_linear_offset) &&
|
|
_Compare_extent(_M_rank, _M_base_extent, _Other->_M_base_extent));
|
|
}
|
|
|
|
// Checks if the element at the specified index
|
|
// is contained within this view shape
|
|
// Assumes the rank of the index is same as the
|
|
// rank of this view's shape
|
|
inline bool _Contains(const unsigned int* _Element_index) const
|
|
{
|
|
for (size_t _I = 0; _I < _M_rank; ++_I)
|
|
{
|
|
if ((_Element_index[_I] < _M_view_offset[_I]) ||
|
|
(_Element_index[_I] >= (_M_view_offset[_I] + _M_view_extent[_I])))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
inline unsigned int _Get_linear_offset(const unsigned int* _Element_index) const
|
|
{
|
|
unsigned int currMultiplier = 1;
|
|
unsigned int linearOffset = _M_linear_offset;
|
|
for (int _I = static_cast<int>(_M_rank - 1); _I >= 0; _I--)
|
|
{
|
|
linearOffset += (currMultiplier * _Element_index[_I]);
|
|
currMultiplier *= _M_base_extent[_I];
|
|
}
|
|
|
|
return linearOffset;
|
|
}
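// Illustrative example (not part of the original header, values assumed): with _M_rank == 3,
// _M_base_extent == {2, 4, 8} and _M_linear_offset == 100, the element index {1, 2, 3} maps
// to 100 + 3 + 2*8 + 1*4*8 == 151 (row-major, least significant dimension last).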
|
|
|
|
private:
|
|
|
|
unsigned int _M_rank;
|
|
unsigned int _M_linear_offset;
|
|
unsigned int *_M_base_extent;
|
|
unsigned int *_M_view_offset;
|
|
unsigned int *_M_view_extent;
|
|
bool *_M_projection_info;
|
|
};
|
|
|
|
// This function creates a new _View_shape object from an existing _View_shape object when the data underlying the view
|
|
// needs to be reinterpreted to use a different element size than the one used by the original view.
|
|
inline
|
|
_Ret_ _View_shape *_Create_reinterpreted_shape(const _View_shape* _Source_shape, size_t _Curr_elem_size, size_t _New_elem_size)
|
|
{
|
|
unsigned int _Rank = _Source_shape->_Get_rank();
|
|
size_t _LinearOffsetInBytes = _Source_shape->_Get_linear_offset() * _Curr_elem_size;
|
|
size_t _BaseLSDExtentInBytes = (_Source_shape->_Get_base_extent())[_Rank - 1] * _Curr_elem_size;
|
|
size_t _ViewLSDOffsetInBytes = (_Source_shape->_Get_view_offset())[_Rank - 1] * _Curr_elem_size;
|
|
size_t _ViewLSDExtentInBytes = (_Source_shape->_Get_view_extent())[_Rank - 1] * _Curr_elem_size;
|
|
|
|
_ASSERTE((_LinearOffsetInBytes % _New_elem_size) == 0);
|
|
_ASSERTE((_BaseLSDExtentInBytes % _New_elem_size) == 0);
|
|
_ASSERTE((_ViewLSDOffsetInBytes % _New_elem_size) == 0);
|
|
_ASSERTE((_ViewLSDExtentInBytes % _New_elem_size) == 0);
|
|
|
|
size_t _Temp_val = _LinearOffsetInBytes / _New_elem_size;
|
|
_ASSERTE(_Temp_val <= UINT_MAX);
|
|
unsigned int _New_linear_offset = static_cast<unsigned int>(_Temp_val);
|
|
|
|
std::vector<unsigned int> _New_base_extent(_Rank);
|
|
std::vector<unsigned int> _New_view_offset(_Rank);
|
|
std::vector<unsigned int> _New_view_extent(_Rank);
|
|
for (unsigned int i = 0; i < _Rank - 1; ++i) {
|
|
_New_base_extent[i] = (_Source_shape->_Get_base_extent())[i];
|
|
_New_view_offset[i] = (_Source_shape->_Get_view_offset())[i];
|
|
_New_view_extent[i] = (_Source_shape->_Get_view_extent())[i];
|
|
}
|
|
|
|
// The extent in the least significant dimension needs to be adjusted
|
|
_Temp_val = _BaseLSDExtentInBytes / _New_elem_size;
|
|
_ASSERTE(_Temp_val <= UINT_MAX);
|
|
_New_base_extent[_Rank - 1] = static_cast<unsigned int>(_Temp_val);
|
|
|
|
_Temp_val = _ViewLSDOffsetInBytes / _New_elem_size;
|
|
_ASSERTE(_Temp_val <= UINT_MAX);
|
|
_New_view_offset[_Rank - 1] = static_cast<unsigned int>(_Temp_val);
|
|
|
|
_Temp_val = _ViewLSDExtentInBytes / _New_elem_size;
|
|
_ASSERTE(_Temp_val <= UINT_MAX);
|
|
_New_view_extent[_Rank - 1] = static_cast<unsigned int>(_Temp_val);
|
|
|
|
return _View_shape::_Create_view_shape(_Rank, _New_linear_offset, _New_base_extent.data(), _New_view_offset.data(), _New_view_extent.data());
|
|
}
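// Illustrative example (not part of the original header, values assumed): reinterpreting a
// rank-2 shape of 4-byte elements with linear offset 8 and base extent {16, 32} as 2-byte
// elements yields linear offset 16 and base extent {16, 64}; the linear offset and the
// least-significant-dimension entries of the base extent, view offset and view extent are
// scaled by the element-size ratio, while higher dimensions are copied unchanged.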
|
|
|
|
inline _Access_mode _Get_synchronize_access_mode(access_type cpu_access_type)
|
|
{
|
|
switch(cpu_access_type)
|
|
{
|
|
case access_type_auto:
|
|
case access_type_read:
|
|
return _Read_access;
|
|
case access_type_write:
|
|
return _Write_access;
|
|
case access_type_read_write:
|
|
return _Read_write_access;
|
|
case access_type_none:
|
|
default:
|
|
_ASSERTE(false);
|
|
return _No_access;
|
|
}
|
|
}
|
|
|
|
inline access_type _Get_cpu_access_type(_Access_mode _Cpu_access_mode)
|
|
{
|
|
access_type _Cpu_access_type = access_type_none;
|
|
if (_Cpu_access_mode & _Read_access) {
|
|
_Cpu_access_type = static_cast<access_type>(_Cpu_access_type | access_type_read);
|
|
}
|
|
|
|
if (_Cpu_access_mode & _Write_access) {
|
|
_Cpu_access_type = static_cast<access_type>(_Cpu_access_type | access_type_write);
|
|
}
|
|
|
|
return _Cpu_access_type;
|
|
}
|
|
|
|
// Class manages a raw buffer on an accelerator view
|
|
class _Buffer : public _Reference_counter
|
|
{
|
|
friend class _CPU_accelerator_view_impl;
|
|
friend class _D3D_accelerator_view_impl;
|
|
friend class _D3D_temp_staging_cache;
|
|
|
|
public:
|
|
|
|
// Force construction through these static public methods to ensure that _Buffer
// objects are allocated in the runtime
|
|
|
|
// Allocate a new buffer on the specified accelerator_view
|
|
_AMPIMP static _Ret_ _Buffer * __cdecl _Create_buffer(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view, size_t _Num_elems,
|
|
size_t _Elem_size, bool _Is_temp = false, access_type _Cpu_access_type = access_type_auto);
|
|
|
|
// Create a buffer object from pre-allocated storage on the specified accelerator_view. This can be thought
|
|
// of as the accelerator_view "adopting" the passed data buffer.
|
|
_AMPIMP static _Ret_ _Buffer * __cdecl _Create_buffer(_In_ void *_Data_ptr, accelerator_view _Accelerator_view, size_t _Num_elems,
|
|
size_t _Elem_size);
|
|
|
|
// Create a staging buffer on the specified accelerator_view which can be accessed on the CPU upon mapping.
|
|
_AMPIMP static _Ret_ _Buffer * __cdecl _Create_stage_buffer(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
|
|
size_t _Num_elems, size_t _Elem_size, bool _Is_temp = false);
|
|
|
|
// Creates a temp staging buffer for the requested size. This function may create
// a staging buffer smaller than the requested size.
|
|
_AMPIMP static _Ret_ _Buffer * __cdecl _Get_temp_staging_buffer(accelerator_view _Av, size_t _Requested_num_elems, size_t _Elem_size);
|
|
|
|
// Map a zero-copy or staging buffer for access on the CPU.
|
|
_AMPIMP void _Map_buffer(_Access_mode _Map_type, bool _Wait);
|
|
|
|
// Asynchronously map a zero-copy or staging buffer for access on the CPU.
|
|
_AMPIMP _Event _Map_buffer_async(_Access_mode _Map_type);
|
|
|
|
// Unmap a zero-copy or staging buffer denying CPU access
|
|
_AMPIMP void _Unmap_buffer();
|
|
|
|
// Copy data to _Dest asynchronously.
|
|
_AMPIMP _Event _Copy_to_async(_Out_ _Buffer * _Dest, size_t _Num_elems, size_t _Src_offset = 0, size_t _Dest_offset = 0);
|
|
|
|
// Copy data to _Dest asynchronously.
|
|
_AMPIMP _Event _Copy_to_async(_Out_ _Buffer * _Dest, _View_shape_ptr _Src_shape, _View_shape_ptr _Dest_shape);
|
|
|
|
_AMPIMP accelerator_view _Get_accelerator_view() const;
|
|
_AMPIMP accelerator_view _Get_access_on_accelerator_view() const;
|
|
|
|
_AMPIMP void _Register_view(_In_ _View_key _Key);
|
|
_AMPIMP void _Unregister_view(_In_ _View_key _Key);
|
|
|
|
// Return the raw data ptr - only an accelerator view implementation can interpret
// this raw pointer. This method should usually not be used in the AMP header files.
// _Get_host_ptr is the right way to access the host accessible ptr for a buffer.
|
|
_Ret_ void * _Get_data_ptr() const
|
|
{
|
|
return _M_data_ptr;
|
|
}
|
|
|
|
// Returns the host accessible ptr corresponding to the buffer. This
// returns NULL when the buffer is inaccessible on the CPU
|
|
_Ret_ void * _Get_host_ptr() const
|
|
{
|
|
return _M_host_ptr;
|
|
}
|
|
|
|
size_t _Get_elem_size() const
|
|
{
|
|
return _M_elem_size;
|
|
}
|
|
|
|
size_t _Get_num_elems() const
|
|
{
|
|
return _M_num_elems;
|
|
}
|
|
|
|
_Ret_ _Accelerator_view_impl* _Get_accelerator_view_impl() const
|
|
{
|
|
return _M_accelerator_view;
|
|
}
|
|
|
|
_Ret_ _Accelerator_view_impl* _Get_access_on_accelerator_view_impl() const
|
|
{
|
|
return _M_access_on_accelerator_view;
|
|
}
|
|
|
|
bool _Owns_data() const
|
|
{
|
|
return _M_owns_data;
|
|
}
|
|
|
|
_AMPIMP bool _Exclusively_owns_data();
|
|
|
|
bool _Is_staging() const
|
|
{
|
|
return _M_is_staging;
|
|
}
|
|
|
|
_Access_mode _Get_allowed_host_access_mode() const
|
|
{
|
|
return _M_allowed_host_access_mode;
|
|
}
|
|
|
|
access_type _Get_allowed_host_access_type() const
|
|
{
|
|
return _Get_cpu_access_type(_M_allowed_host_access_mode);
|
|
}
|
|
|
|
bool _Is_host_accessible(_Access_mode _Requested_access_mode) const
|
|
{
|
|
return ((_Get_allowed_host_access_mode() & _Requested_access_mode) == _Requested_access_mode);
|
|
}
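// Illustrative example (not part of the original header, values assumed): a buffer whose
// allowed host access mode is _Read_access reports _Is_host_accessible(_Read_access) == true
// but _Is_host_accessible(_Read_write_access) == false, since every requested access bit
// must be allowed.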
|
|
|
|
_Access_mode _Get_current_host_access_mode() const
|
|
{
|
|
return _M_current_host_access_mode;
|
|
}
|
|
|
|
bool _Is_temp() const
|
|
{
|
|
return _M_is_temp;
|
|
}
|
|
|
|
bool _Is_adopted() const
|
|
{
|
|
// Is it adopted from interop?
|
|
return _M_is_adopted;
|
|
}
|
|
|
|
bool _Is_buffer() const
|
|
{
|
|
return _M_is_buffer;
|
|
}
|
|
|
|
_AMPIMP bool _Is_mappable() const;
|
|
|
|
protected:
|
|
|
|
// The _Buffer constructor is protected to force construction through the static
|
|
// _Create_buffer method to ensure the object is allocated in the runtime
|
|
_Buffer(_In_ _Accelerator_view_impl* _Av, _In_ void *_Buffer_data_ptr, _In_ void * _Host_ptr,
|
|
_Access_mode _Allowed_host_access_mode, _Access_mode _Current_host_access_mode, size_t _Num_elems,
|
|
size_t _Elem_size, bool _Owns_data, bool _Is_staging, bool _Is_temp, bool _Is_adopted);
|
|
|
|
// protected destructor to force deletion through _Release
|
|
virtual ~_Buffer();
|
|
|
|
// No default constructor, copy constructor or assignment operator
|
|
_Buffer();
|
|
_Buffer(const _Buffer &rhs);
|
|
_Buffer &operator=(const _Buffer &rhs);
|
|
|
|
void _Set_host_ptr(_In_ void *_Host_ptr, _Access_mode _Host_access_mode = _No_access)
|
|
{
|
|
_ASSERTE((_Host_ptr == NULL) || (_Host_access_mode != _No_access));
|
|
|
|
_M_host_ptr = _Host_ptr;
|
|
if (_Host_ptr == NULL) {
|
|
_M_current_host_access_mode = _No_access;
|
|
}
|
|
else {
|
|
_M_current_host_access_mode = _Host_access_mode;
|
|
}
|
|
}
|
|
|
|
void _Set_data_ptr(_In_ IUnknown *_Data_ptr)
|
|
{
|
|
_M_data_ptr = _Data_ptr;
|
|
}
|
|
|
|
protected:
|
|
_Accelerator_view_impl_ptr _M_accelerator_view;
|
|
_Accelerator_view_impl_ptr _M_access_on_accelerator_view;
|
|
void * _M_data_ptr;
|
|
void * _M_host_ptr;
|
|
_Access_mode _M_allowed_host_access_mode;
|
|
_Access_mode _M_current_host_access_mode;
|
|
size_t _M_elem_size;
|
|
size_t _M_num_elems;
|
|
bool _M_owns_data;
|
|
bool _M_is_staging;
|
|
|
|
// Used to determine how to map the staging buffer after it is involved in a copy
|
|
bool _M_is_temp;
|
|
|
|
bool _M_is_adopted;
|
|
bool _M_is_buffer;
|
|
private:
|
|
// A set of view_keys to invalidate whenever the host ptr of a staging buffer is invalidated
|
|
std::unique_ptr<std::unordered_set<_View_key>> _M_view_keys;
|
|
Concurrency::critical_section _M_critical_section;
|
|
};
|
|
|
|
// Class manages a texture on an accelerator view
|
|
class _Texture : public _Buffer
|
|
{
|
|
friend class _CPU_accelerator_view_impl;
|
|
friend class _D3D_accelerator_view_impl;
|
|
friend class _D3D_temp_staging_cache;
|
|
|
|
public:
|
|
|
|
// Allocate a new texture on the specified accelerator_view
|
|
_AMPIMP static _Ret_ _Texture * __cdecl _Create_texture(accelerator_view _Accelerator_view,
|
|
unsigned int _Rank,
|
|
size_t _Width, size_t _Height, size_t _Depth,
|
|
unsigned int _Mip_levels,
|
|
_Short_vector_base_type_id _Type_id,
|
|
unsigned int _Num_channels,
|
|
unsigned int _Bits_per_channel,
|
|
bool _Is_temp = false);
|
|
|
|
// Create a texture object from pre-allocated storage on the specified accelerator_view. This can be thought
|
|
// of as the accelerator_view "adopting" the passed data buffer.
|
|
_AMPIMP static _Ret_ _Texture * __cdecl _Adopt_texture(unsigned int _Rank, _Texture_base_type_id _Id,
|
|
_In_ IUnknown *_Data_ptr, accelerator_view _Accelerator_view,
|
|
unsigned int _View_format);
|
|
|
|
// Create a staging texture on the specified accelerator_view which can be accessed on the CPU upon mapping.
|
|
_AMPIMP static _Ret_ _Texture * __cdecl _Create_stage_texture(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
|
|
unsigned int _Rank,
|
|
size_t _Width, size_t _Height, size_t _Depth,
|
|
unsigned int _Mip_levels,
|
|
unsigned int _Format,
|
|
bool _Is_temp = false);
|
|
|
|
// Create a staging texture on the specified accelerator_view which can be accessed on the CPU upon mapping.
|
|
_AMPIMP static _Ret_ _Texture * __cdecl _Create_stage_texture(accelerator_view _Accelerator_view, accelerator_view _Access_on_accelerator_view,
|
|
unsigned int _Rank,
|
|
size_t _Width, size_t _Height, size_t _Depth,
|
|
unsigned int _Mip_levels,
|
|
_Short_vector_base_type_id _Type_id,
|
|
unsigned int _Num_channels,
|
|
unsigned int _Bits_per_channel);
|
|
|
|
// Creates a temp staging texture. This function may create
|
|
// a staging texture smaller than the requested size.
|
|
_AMPIMP static _Ret_ _Texture * __cdecl _Get_temp_staging_texture(accelerator_view _Accelerator_view,
|
|
unsigned int _Rank,
|
|
size_t _Width, size_t _Height, size_t _Depth,
|
|
unsigned int _Mip_levels,
|
|
unsigned int _Format);
|
|
|
|
// Constructs a new texture with the same properties as the given texture.
|
|
_AMPIMP static _Ret_ _Texture * __cdecl _Clone_texture(const _Texture *_Src, const accelerator_view &_Accelerator_view, const accelerator_view &_Associated_av);
|
|
|
|
// Copy data to _Dest asynchronously for textures. The two textures must have been created with
|
|
// compatible physical formats.
|
|
_AMPIMP _Event _Copy_to_async(_Out_ _Texture * _Dest, const size_t *_Copy_extent,
|
|
const size_t *_Src_offset, const size_t *_Dst_offset,
|
|
unsigned int _Src_mipmap_level, unsigned int _Dst_mipmap_level);
|
|
|
|
size_t _Get_width(unsigned int _Mip_offset = 0) const
|
|
{
|
|
return (_M_width >> _Mip_offset) ? (_M_width >> _Mip_offset) : 1U;
|
|
}
|
|
|
|
size_t _Get_height(unsigned int _Mip_offset = 0) const
|
|
{
|
|
return (_M_height >> _Mip_offset) ? (_M_height >> _Mip_offset) : 1U;
|
|
}
|
|
|
|
size_t _Get_depth(unsigned int _Mip_offset = 0) const
|
|
{
|
|
return (_M_depth >> _Mip_offset) ? (_M_depth >> _Mip_offset) : 1U;
|
|
}
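// Illustrative example (not part of the original header, values assumed): for _M_width == 20,
// _Get_width(0) == 20, _Get_width(1) == 10, _Get_width(2) == 5, _Get_width(3) == 2 and
// _Get_width(5) == 1; each mip level halves the dimension via a right shift and the result
// is clamped to a minimum of 1.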
|
|
|
|
unsigned int _Get_rank() const
|
|
{
|
|
return _M_rank;
|
|
}
|
|
|
|
unsigned int _Get_texture_format() const
|
|
{
|
|
return _M_texture_format;
|
|
}
|
|
|
|
unsigned int _Get_view_format() const
|
|
{
|
|
return _M_view_format;
|
|
}
|
|
|
|
unsigned int _Get_num_channels() const
|
|
{
|
|
return _M_num_channels;
|
|
}
|
|
|
|
unsigned int _Get_bits_per_channel() const
|
|
{
|
|
// For texture adopted from interop, return 0.
|
|
return _Is_adopted() ? 0 : _M_bits_per_channel;
|
|
}
|
|
|
|
unsigned int _Get_bits_per_element() const
|
|
{
|
|
return _M_bits_per_channel * _M_num_channels;
|
|
}
|
|
|
|
unsigned int _Get_data_length(unsigned int _Most_detailed_mipmap_level, unsigned int _View_mipmap_levels, const size_t *_Extents = nullptr) const // in bytes
|
|
{
|
|
_ASSERTE(_View_mipmap_levels);
|
|
|
|
unsigned long long _Bits_per_byte = 8ULL;
|
|
unsigned long long _Total_bytes = 0ULL;
|
|
|
|
unsigned int _Mip_level = _Most_detailed_mipmap_level;
|
|
|
|
// Sum up data length (in bytes) of all mipmap levels in the view
|
|
for (unsigned int _Mip_offset=0; _Mip_offset < _View_mipmap_levels; ++_Mip_offset)
|
|
{
|
|
unsigned long long _Width = 1ULL;
|
|
unsigned long long _Height = 1ULL;
|
|
unsigned long long _Depth = 1ULL;
|
|
|
|
if (_Extents)
|
|
{
|
|
switch (_M_rank)
|
|
{
|
|
case 3:
|
|
_Depth = (_Extents[2] >> _Mip_level) ? (_Extents[2] >> _Mip_level) : 1U;
|
|
// deliberately fall thru
|
|
case 2:
|
|
_Height = (_Extents[1] >> _Mip_level) ? (_Extents[1] >> _Mip_level) : 1U;
|
|
// deliberately fall thru
|
|
case 1:
|
|
_Width = (_Extents[0] >> _Mip_level) ? (_Extents[0] >> _Mip_level) : 1U;
|
|
break;
|
|
default:
|
|
_ASSERTE(false); // textures are only rank 1-3
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_Width = _Get_width(_Mip_level);
|
|
_Height = _Get_height(_Mip_level);
|
|
_Depth = _Get_depth(_Mip_level);
|
|
}
|
|
|
|
// Note _Get_bits_per_element() can be smaller than 8
|
|
// Use unsigned long long to avoid integer overflow
|
|
_Total_bytes += ((_Width * _Height * _Depth * static_cast<unsigned long long>(_Get_bits_per_element())) + _Bits_per_byte - 1) / _Bits_per_byte;
|
|
|
|
_Mip_level++;
|
|
}
|
|
|
|
return static_cast<unsigned int>(_Total_bytes);
|
|
}
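// Illustrative example (not part of the original header, values assumed): for a rank-2
// texture of 16 x 16 elements with 32 bits per element (e.g. 4 channels of 8 bits),
// _Get_data_length(0, 2) sums mip levels 0 and 1:
// (16*16*32 + 7)/8 + (8*8*32 + 7)/8 == 1024 + 256 == 1280 bytes.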
|
|
|
|
unsigned int _Get_mip_levels() const
|
|
{
|
|
return _M_mip_levels;
|
|
}
|
|
|
|
size_t _Get_row_pitch() const
|
|
{
|
|
return _M_row_pitch;
|
|
}
|
|
|
|
void _Set_row_pitch(size_t _Val)
|
|
{
|
|
_M_row_pitch = _Val;
|
|
}
|
|
|
|
size_t _Get_depth_pitch() const
|
|
{
|
|
return _M_depth_pitch;
|
|
}
|
|
|
|
void _Set_depth_pitch(size_t _Val)
|
|
{
|
|
_M_depth_pitch = _Val;
|
|
}
|
|
|
|
private:
|
|
|
|
// The _Texture constructor is private to force construction through the static
|
|
// _Create_texture method to ensure the object is allocated in the runtime
|
|
_Texture(_In_ _Accelerator_view_impl* _Av, _In_ void *_Texture_data_ptr, _In_ void * _Host_ptr,
|
|
_Access_mode _Allowed_host_access_mode, _Access_mode _Current_host_access_mode,
|
|
unsigned int _Rank,
|
|
size_t _Width, size_t _Height, size_t _Depth,
|
|
unsigned int _Mip_levels,
|
|
unsigned int _Texture_format,
|
|
unsigned int _View_format,
|
|
unsigned int _Num_channels,
|
|
unsigned int _Bits_per_channel,
|
|
bool _Owns_data, bool _Is_staging, bool _Is_temp, bool _Is_adopted);
|
|
|
|
// Private destructor to force deletion through _Release
|
|
~_Texture();
|
|
|
|
// No default constructor, copy constructor or assignment operator
|
|
_Texture();
|
|
_Texture(const _Texture &rhs);
|
|
_Texture &operator=(const _Texture &rhs);
|
|
|
|
// Texture only
|
|
unsigned int _M_rank;
|
|
size_t _M_width;
|
|
size_t _M_height;
|
|
size_t _M_depth;
|
|
unsigned int _M_texture_format;
|
|
unsigned int _M_view_format;
|
|
unsigned int _M_bits_per_channel;
|
|
unsigned int _M_num_channels;
|
|
unsigned int _M_mip_levels;
|
|
|
|
size_t _M_row_pitch;
|
|
size_t _M_depth_pitch;
|
|
};
|
|
|
|
class _Sampler : public _Reference_counter
|
|
{
|
|
public:
|
|
// Create a new sampler with configurations exposed by C++ AMP.
|
|
_AMPIMP static _Ret_ _Sampler * __cdecl _Create(
|
|
unsigned int _Filter_mode,
|
|
unsigned int _Address_mode,
|
|
float _Border_r,
|
|
float _Border_g,
|
|
float _Border_b,
|
|
float _Border_a);
|
|
|
|
// Create a sampler object given an adopted opaque data pointer
|
|
_AMPIMP static _Ret_ _Sampler * __cdecl _Create(_In_ void *_Data_ptr);
|
|
|
|
// Return the raw data ptr - only an accelerator view implementation can interpret
|
|
// this raw pointer. This method should usually not be used in the AMP header files
|
|
_Ret_ void * _Get_data_ptr() const
|
|
{
|
|
return _M_data_ptr;
|
|
}
|
|
|
|
bool _Is_adopted() const
|
|
{
|
|
// Is it adopted from interop?
|
|
return _M_is_adopted;
|
|
}
|
|
|
|
unsigned int _Get_filter_mode() const
|
|
{
|
|
return _M_filter_mode;
|
|
}
|
|
|
|
unsigned int _Get_address_mode() const
|
|
{
|
|
return _M_address_mode;
|
|
}
|
|
|
|
const float* _Get_border_color() const
|
|
{
|
|
return &_M_border_color[0];
|
|
}
|
|
|
|
private:
|
|
// The _Sampler constructor is private to force construction through the static
|
|
// _Create method to ensure the object is allocated in the runtime
|
|
_Sampler(unsigned int _Filter_mode, unsigned int _Address_mode, float _Border_r, float _Border_g, float _Border_b, float _Border_a);
|
|
|
|
_Sampler(_In_ void *_Data_ptr);
|
|
|
|
// Private destructor to force deletion through _Release
|
|
~_Sampler();
|
|
|
|
// No default constructor, copy constructor or assignment operator
|
|
_Sampler();
|
|
_Sampler(const _Sampler &rhs);
|
|
_Sampler &operator=(const _Sampler &rhs);
|
|
|
|
void * _M_data_ptr;
|
|
bool _M_is_adopted;
|
|
unsigned int _M_filter_mode;
|
|
unsigned int _M_address_mode;
|
|
float _M_border_color[4];
|
|
};
|
|
|
|
// Forward declaration for copy helper functions
|
|
_AMPIMP _Event __cdecl _Copy_impl(_In_ _Buffer *_Src, size_t _Src_offset,
|
|
_Out_ _Buffer * _Dst, size_t _Dest_offset,
|
|
size_t _Num_elems, size_t _Preferred_copy_chunk_num_elems = 0);
|
|
|
|
_AMPIMP _Event __cdecl _Copy_async_impl(_In_ _Texture *_Src_tex, const size_t *_Src_offset, unsigned int _Src_mipmap_level,
|
|
_Out_ _Texture *_Dst_tex, const size_t *_Dst_offset, unsigned int _Dst_mipmap_level,
|
|
const size_t *_Copy_extent, const size_t *_Preferred_copy_chunk_extent = NULL);
|
|
|
|
inline bool _Get_chunked_staging_texture(_In_ _Texture* _Tex, const size_t *_Copy_chunk_extent, _Inout_ size_t *_Remaining_copy_extent, _Out_ size_t *_Curr_copy_extent, _Out_ _Texture_ptr *_Staging_texture)
|
|
{
|
|
bool _Truncated_copy = false;
|
|
size_t _Allocation_extent[3] = { _Copy_chunk_extent[0], _Copy_chunk_extent[1], _Copy_chunk_extent[2] };
|
|
|
|
unsigned int _Most_sig_idx = _Tex->_Get_rank() - 1;
|
|
|
|
if (_Allocation_extent[_Most_sig_idx] > _Remaining_copy_extent[_Most_sig_idx]) {
|
|
_Allocation_extent[_Most_sig_idx] = _Remaining_copy_extent[_Most_sig_idx];
|
|
}
|
|
|
|
_Texture_ptr _Stage = _Texture::_Get_temp_staging_texture(_Tex->_Get_accelerator_view(), _Tex->_Get_rank(),
|
|
_Allocation_extent[0], _Allocation_extent[1], _Allocation_extent[2],
|
|
/*_Mip_levels=*/1, _Tex->_Get_texture_format());
|
|
|
|
std::copy(&_Allocation_extent[0], &_Allocation_extent[3], stdext::make_unchecked_array_iterator(&_Curr_copy_extent[0]));
|
|
size_t _Staging_tex_extent[3] = {_Stage->_Get_width(), _Stage->_Get_height(), _Stage->_Get_depth()};
|
|
if (_Curr_copy_extent[_Most_sig_idx] > _Staging_tex_extent[_Most_sig_idx]) {
|
|
_Curr_copy_extent[_Most_sig_idx] = _Staging_tex_extent[_Most_sig_idx];
|
|
}
|
|
|
|
// The truncation can however happen only in the most significant dimension and lower
|
|
// dimensions should not get truncated
|
|
if (_Curr_copy_extent[_Most_sig_idx] < _Remaining_copy_extent[_Most_sig_idx])
|
|
{
|
|
_Remaining_copy_extent[_Most_sig_idx] -= _Curr_copy_extent[_Most_sig_idx];
|
|
_Truncated_copy = true;
|
|
}
|
|
|
|
for (unsigned int _I = 0; _I < _Most_sig_idx; _I++)
|
|
{
|
|
_ASSERTE(_Curr_copy_extent[_I] == _Remaining_copy_extent[_I]);
|
|
}
|
|
|
|
*_Staging_texture = _Stage;
|
|
return _Truncated_copy;
|
|
}
|
|
|
|
#pragma warning ( push )
|
|
#pragma warning ( disable : 6101 )
|
|
// Supress "warning C6101: Returning uninitialized memory '*_Dst'.: A successful"
|
|
// "path through the function does not set the named _Out_ parameter."
|
|
// The callers to _Copy_data_on_host all have static_assert that _Rank has to be 1, 2, or 3 dimensions for texture
|
|
//
|
|
template <typename _Input_iterator, typename _Value_type>
|
|
inline void _Copy_data_on_host(int _Rank, _Input_iterator _Src, _Out_ _Value_type *_Dst,
|
|
size_t _Width, size_t _Height, size_t _Depth,
|
|
size_t _Dst_row_pitch_in_bytes, size_t _Dst_depth_pitch_in_bytes,
|
|
size_t _Src_row_pitch, size_t _Src_depth_pitch)
|
|
{
|
|
switch(_Rank)
|
|
{
|
|
case 1:
|
|
{
|
|
_Input_iterator _End = _Src;
|
|
std::advance(_End, _Width);
|
|
std::copy(_Src, _End, stdext::make_unchecked_array_iterator(_Dst));
|
|
}
|
|
break;
|
|
case 2:
|
|
{
|
|
unsigned char *_Dst_ptr = reinterpret_cast<unsigned char *>(_Dst);
|
|
_Input_iterator _Src_start = _Src;
|
|
for (size_t _I = 0; _I < _Height; _I++)
|
|
{
|
|
_Input_iterator _Src_end = _Src_start;
|
|
std::advance(_Src_end, _Width);
|
|
|
|
std::copy(_Src_start, _Src_end, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_Dst_ptr)));
|
|
|
|
_Dst_ptr += _Dst_row_pitch_in_bytes;
|
|
std::advance(_Src_start, _Src_row_pitch);
|
|
}
|
|
}
|
|
break;
|
|
case 3:
|
|
{
|
|
unsigned char *_Dst_ptr_slice_start = reinterpret_cast<unsigned char *>(_Dst);
|
|
_Input_iterator _Src_depth_slice_start = _Src;
|
|
for (size_t _I = 0; _I < _Depth; _I++)
|
|
{
|
|
_Input_iterator _Src_start = _Src_depth_slice_start;
|
|
unsigned char *_Dst_ptr = _Dst_ptr_slice_start;
|
|
|
|
for (size_t _J = 0; _J < _Height; _J++)
|
|
{
|
|
_Input_iterator _Src_end = _Src_start;
|
|
std::advance(_Src_end, _Width);
|
|
|
|
std::copy(_Src_start, _Src_end, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_Dst_ptr)));
|
|
|
|
_Dst_ptr += _Dst_row_pitch_in_bytes;
|
|
std::advance(_Src_start, _Src_row_pitch);
|
|
}
|
|
|
|
_Dst_ptr_slice_start += _Dst_depth_pitch_in_bytes;
|
|
std::advance(_Src_depth_slice_start, _Src_depth_pitch);
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
_ASSERTE(FALSE);
|
|
break;
|
|
}
|
|
}
|
|
#pragma warning ( pop ) // disable : 6101
|
|
|
|
template <typename _Output_iterator, typename _Value_type>
|
|
inline void _Copy_data_on_host(int _Rank, const _Value_type * _Src, _Output_iterator _Dst,
|
|
size_t _Width, size_t _Height, size_t _Depth,
|
|
size_t _Src_row_pitch_in_bytes, size_t _Src_depth_pitch_in_bytes,
|
|
size_t _Dst_row_pitch, size_t _Dst_depth_pitch)
|
|
{
|
|
switch(_Rank)
|
|
{
|
|
case 1:
|
|
{
|
|
const _Value_type * _End = _Src + _Width;
|
|
std::copy(stdext::make_unchecked_array_iterator(_Src), stdext::make_unchecked_array_iterator(_End), _Dst);
|
|
}
|
|
break;
|
|
case 2:
|
|
{
|
|
const unsigned char *_Src_ptr = reinterpret_cast<const unsigned char *>(_Src);
|
|
_Output_iterator _Dst_iter = _Dst;
|
|
for (size_t _I = 0; _I < _Height; _I++)
|
|
{
|
|
const _Value_type * _Src_end = reinterpret_cast<const _Value_type*>(_Src_ptr) + _Width;
|
|
|
|
std::copy(stdext::make_unchecked_array_iterator(reinterpret_cast<const _Value_type*>(_Src_ptr)), stdext::make_unchecked_array_iterator(_Src_end), _Dst_iter);
|
|
std::advance(_Dst_iter, _Dst_row_pitch);
|
|
_Src_ptr += _Src_row_pitch_in_bytes;
|
|
}
|
|
}
|
|
break;
|
|
case 3:
|
|
{
|
|
const unsigned char *_Src_ptr_slice_start = reinterpret_cast<const unsigned char *>(_Src);
|
|
_Output_iterator _Dst_depth_slice_start = _Dst;
|
|
for (size_t _I = 0; _I < _Depth; _I++)
|
|
{
|
|
_Output_iterator _Dst_iter = _Dst_depth_slice_start;
|
|
const unsigned char *_Src_ptr = _Src_ptr_slice_start;
|
|
|
|
for (size_t _J = 0; _J < _Height; _J++)
|
|
{
|
|
const _Value_type * _Src_end = reinterpret_cast<const _Value_type *>(_Src_ptr) + _Width;
|
|
|
|
std::copy(stdext::make_unchecked_array_iterator(reinterpret_cast<const _Value_type*>(_Src_ptr)), stdext::make_unchecked_array_iterator(_Src_end), _Dst_iter);
|
|
|
|
std::advance(_Dst_iter, _Dst_row_pitch);
|
|
_Src_ptr += _Src_row_pitch_in_bytes;
|
|
}
|
|
|
|
_Src_ptr_slice_start += _Src_depth_pitch_in_bytes;
|
|
std::advance(_Dst_depth_slice_start, _Dst_depth_pitch);
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
_ASSERTE(FALSE);
|
|
break;
|
|
}
|
|
}
|
|
|
|
_AMPIMP size_t __cdecl _Get_preferred_copy_chunk_size(size_t _Total_copy_size_in_bytes);
|
|
|
|
inline size_t _Get_preferred_copy_chunk_num_elems(size_t _Total_num_elems, size_t _Elem_size)
|
|
{
|
|
size_t preferredChunkSize = _Get_preferred_copy_chunk_size(_Total_num_elems * _Elem_size);
|
|
|
|
return (preferredChunkSize / _Elem_size);
|
|
}
|
|
|
|
inline void _Get_preferred_copy_chunk_extent(unsigned int _Rank, size_t _Width, size_t _Height,
|
|
size_t _Depth, size_t _Bits_per_element, _Out_writes_(3) size_t *_Preferred_copy_chunk_extent)
|
|
{
|
|
_ASSERTE(_Preferred_copy_chunk_extent != nullptr);
|
|
|
|
size_t requestedByteSize = static_cast<size_t>((static_cast<unsigned long long>(_Width) *
|
|
static_cast<unsigned long long>(_Height) *
|
|
static_cast<unsigned long long>(_Depth) *
|
|
static_cast<unsigned long long>(_Bits_per_element)) >> 3);
|
|
|
|
size_t preferredChunkSize = _Get_preferred_copy_chunk_size(requestedByteSize);
|
|
|
|
// Let's align the allocation size to the element size of the texture
|
|
size_t preferredCopyChunkNumElems = static_cast<size_t>((static_cast<unsigned long long>(preferredChunkSize) * 8U) / _Bits_per_element);
|
|
|
|
// Let's truncate the dimensions of the requested staging texture.
// We only truncate in the most significant dimension
|
|
switch (_Rank)
|
|
{
|
|
case 1:
|
|
_Width = preferredCopyChunkNumElems;
|
|
break;
|
|
case 2:
|
|
_Height = (preferredCopyChunkNumElems + _Width - 1) / _Width;
|
|
break;
|
|
case 3:
|
|
_Depth = (preferredCopyChunkNumElems + (_Height * _Width) - 1) / (_Height * _Width);
|
|
break;
|
|
default:
|
|
_ASSERTE(false);
|
|
}
|
|
|
|
_Preferred_copy_chunk_extent[0] = _Width;
|
|
_Preferred_copy_chunk_extent[1] = _Height;
|
|
_Preferred_copy_chunk_extent[2] = _Depth;
|
|
}
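// Illustrative example (not part of the original header, values assumed): for a rank-3
// request of 256 x 256 x 256 elements at 32 bits per element, only _Depth is truncated;
// if _Get_preferred_copy_chunk_size were to return 16 MB, the preferred chunk extent
// would come out as 256 x 256 x 64.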
|
|
|
|
// Finds the greatest common divisor of two unsigned integral numbers using Euclid's algorithm
|
|
template <typename _T>
|
|
inline _T _Greatest_common_divisor(_T _M, _T _N)
|
|
{
|
|
static_assert(std::is_unsigned<_T>::value, "This GCD function only supports unsigned integral types");
|
|
|
|
_ASSERTE((_M > 0) && (_N > 0));
|
|
|
|
if (_N > _M) {
|
|
std::swap(_N , _M);
|
|
}
|
|
|
|
_T _Temp;
|
|
while (_N > 0)
|
|
{
|
|
_Temp = _N;
|
|
_N = _M % _N;
|
|
_M = _Temp;
|
|
}
|
|
|
|
return _M;
|
|
}
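// Illustrative usage (not part of the original header):
//   unsigned int _Gcd = _Greatest_common_divisor(12u, 18u);   // == 6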
|
|
|
|
// Finds the least common multiple of two unsigned integral numbers using their greatest common divisor
|
|
template <typename _T>
|
|
inline _T _Least_common_multiple(_T _M, _T _N)
|
|
{
|
|
static_assert(std::is_unsigned<_T>::value, "This LCM function only supports unsigned integral types");
|
|
|
|
_ASSERTE((_M > 0) && (_N > 0));
|
|
|
|
_T _Gcd = _Greatest_common_divisor(_M, _N);
|
|
return ((_M / _Gcd) * _N);
|
|
}
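// Illustrative usage (not part of the original header): the copy helpers below use this to
// pick chunk sizes that are a whole number of elements for both the source and destination
// element sizes, e.g.
//   size_t _Lcm = _Least_common_multiple<size_t>(4, 6);   // == 12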
|
|
|
|
template <typename InputIterator, typename _Value_type>
|
|
inline _Event _Copy_impl(InputIterator _SrcFirst, InputIterator _SrcLast, size_t _NumElemsToCopy,
|
|
_Out_ _Buffer * _Dst, size_t _Dest_offset, size_t _Preferred_copy_chunk_num_elems = 0)
|
|
{
|
|
if (_NumElemsToCopy == 0) {
|
|
return _Event();
|
|
}
|
|
|
|
if (_Dst == NULL) {
|
|
throw runtime_exception("Failed to copy to buffer.", E_INVALIDARG);
|
|
}
|
|
|
|
#pragma warning ( push )
|
|
#pragma warning ( disable : 6001 ) // Using uninitialized memory '*_Dst'
|
|
if (((_NumElemsToCopy * sizeof(_Value_type)) + (_Dest_offset * _Dst->_Get_elem_size())) > (_Dst->_Get_num_elems() * _Dst->_Get_elem_size()))
|
|
{
|
|
throw runtime_exception("Invalid _Src argument(s). _Src size exceeds total size of the _Dest.", E_INVALIDARG);
|
|
}
|
|
#pragma warning ( pop )
|
|
|
|
_ASSERTE(_NumElemsToCopy == (size_t)(std::distance(_SrcFirst, _SrcLast)));
|
|
|
|
// If the dest is host accessible for write then we do the copy on
|
|
// accelerator(accelerator::cpu_accelerator).default_view
|
|
if (_Dst->_Is_host_accessible(_Write_access))
|
|
{
|
|
// Let's first map the _Dst buffer
|
|
_Event _Ev = _Dst->_Map_buffer_async(_Write_access);
|
|
|
|
// The _Dest is accessible on host. We just need to do a std::copy using a raw pointer as OutputIterator
|
|
_Buffer_ptr _PDestBuf = _Dst;
|
|
_Ev = _Ev._Add_continuation(std::function<_Event()>([_PDestBuf,_Dest_offset, _SrcFirst, _SrcLast]() mutable -> _Event
|
|
{
|
|
_Value_type *_DestPtr = reinterpret_cast<_Value_type*>(reinterpret_cast<char*>(_PDestBuf->_Get_host_ptr()) + (_Dest_offset * _PDestBuf->_Get_elem_size()));
|
|
std::copy(_SrcFirst, _SrcLast, stdext::make_unchecked_array_iterator(_DestPtr));
|
|
|
|
return _Event();
|
|
}));
|
|
|
|
return _Ev;
|
|
}
|
|
else
|
|
{
|
|
// _Dest is on a device. Let's create a temp staging buffer on the _Dest accelerator_view and copy the input over.
// We may create a staging buffer smaller than the copy size, in which case we will perform the copy
// as a series of smaller copies
|
|
_Buffer_ptr _PDestBuf = _Dst;
|
|
size_t _NumElemsToCopyRemaining = _NumElemsToCopy;
|
|
size_t _PreferredNumElemsToCopyPerChunk = _Preferred_copy_chunk_num_elems;
|
|
if (_PreferredNumElemsToCopyPerChunk == 0) {
|
|
// If a preferred copy chunk size was not specified, let's pick one based on the
// size of the copy
|
|
_PreferredNumElemsToCopyPerChunk = _Get_preferred_copy_chunk_num_elems(_NumElemsToCopy, sizeof(_Value_type));
|
|
}
|
|
size_t _CurrDstOffset = _Dest_offset;
|
|
InputIterator _CurrStartIter = _SrcFirst;
|
|
_Event _Ev;
|
|
|
|
size_t _Lcm = _Least_common_multiple(_Dst->_Get_elem_size(), sizeof(_Value_type));
|
|
size_t _AdjustmentRatio = _Lcm / sizeof(_Value_type);
|
|
|
|
do
|
|
{
|
|
size_t _AllocationNumElems = _PreferredNumElemsToCopyPerChunk;
|
|
if (_NumElemsToCopyRemaining < _AllocationNumElems) {
|
|
_AllocationNumElems = _NumElemsToCopyRemaining;
|
|
}
|
|
|
|
_Buffer_ptr _PDestStagingBuf = _Buffer::_Get_temp_staging_buffer(_Dst->_Get_accelerator_view(),
|
|
_AllocationNumElems, sizeof(_Value_type));
|
|
|
|
_ASSERTE(_PDestStagingBuf != NULL);
|
|
_ASSERTE(_PDestStagingBuf->_Get_elem_size() == sizeof(_Value_type));
|
|
|
|
InputIterator _CurrEndIter = _CurrStartIter;
|
|
size_t _CurrNumElemsToCopy = _AllocationNumElems;
|
|
if (_CurrNumElemsToCopy > _PDestStagingBuf->_Get_num_elems()) {
|
|
_CurrNumElemsToCopy = _PDestStagingBuf->_Get_num_elems();
|
|
}
|
|
|
|
if (_NumElemsToCopyRemaining <= _CurrNumElemsToCopy) {
|
|
_CurrNumElemsToCopy = _NumElemsToCopyRemaining;
|
|
_CurrEndIter = _SrcLast;
|
|
}
|
|
else
|
|
{
|
|
// We need to adjust the _CurrNumElemsToCopy to be a multiple of the
|
|
// least common multiple of the destination buffer's element size and sizeof(_Value_type).
|
|
_CurrNumElemsToCopy = (_CurrNumElemsToCopy / _AdjustmentRatio) * _AdjustmentRatio;
|
|
std::advance(_CurrEndIter, _CurrNumElemsToCopy);
|
|
}
|
|
|
|
_ASSERTE((_CurrNumElemsToCopy % _AdjustmentRatio) == 0);
|
|
|
|
// This will not actually block since we just created this staging buffer or are using
// a cached one that is not in use
|
|
_PDestStagingBuf->_Map_buffer(_Write_access, true /* _Wait */);
|
|
|
|
// Copy from input to the staging using a raw pointer as OutputIterator
|
|
std::copy(_CurrStartIter, _CurrEndIter, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_PDestStagingBuf->_Get_host_ptr())));
|
|
|
|
_Ev = _Ev._Add_event(_PDestStagingBuf->_Copy_to_async(_PDestBuf, _CurrNumElemsToCopy, 0, _CurrDstOffset));
|
|
|
|
// Adjust the iterators and offsets
|
|
_NumElemsToCopyRemaining -= _CurrNumElemsToCopy;
|
|
_CurrDstOffset += (_CurrNumElemsToCopy * sizeof(_Value_type)) / _Dst->_Get_elem_size();
|
|
_CurrStartIter = _CurrEndIter;
|
|
|
|
} while (_NumElemsToCopyRemaining != 0);
|
|
|
|
return _Ev;
|
|
}
|
|
}
|
|
|
|
// std::advance is only supported for input iterators and hence we have a custom implementation
// which forwards to std::advance if the iterator is an input iterator and uses a loop-based advance
// implementation otherwise
|
|
template<typename _InputIterator, typename _Distance>
|
|
typename std::enable_if<std::is_base_of<std::input_iterator_tag, typename std::iterator_traits<_InputIterator>::iterator_category>::value>::type
|
|
_Advance_output_iterator(_InputIterator &_Iter, _Distance _N)
|
|
{
|
|
std::advance(_Iter, _N);
|
|
}
|
|
|
|
template<typename _OutputIterator, typename _Distance>
|
|
typename std::enable_if<!std::is_base_of<std::input_iterator_tag, typename std::iterator_traits<_OutputIterator>::iterator_category>::value>::type
|
|
_Advance_output_iterator(_OutputIterator &_Iter, size_t _N)
|
|
{
|
|
for (size_t i = 0; i < _N; ++i)
|
|
{
|
|
_Iter++;
|
|
}
|
|
}
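// Illustrative note (not part of the original header): the overload is selected via SFINAE
// on iterator_category, so iterators whose category derives from std::input_iterator_tag
// (e.g. std::vector iterators) go through std::advance, while pure output iterators such as
// std::back_insert_iterator fall back to the increment loop above.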
|
|
|
|
template <typename OutputIterator, typename _Value_type>
|
|
inline _Event _Copy_impl(_In_ _Buffer *_Src, size_t _Src_offset, size_t _Num_elems,
|
|
OutputIterator _DestIter, size_t _Preferred_copy_chunk_num_elems = 0)
|
|
{
|
|
if ((_Src == NULL) || ((_Src_offset + _Num_elems) > _Src->_Get_num_elems())) {
|
|
throw runtime_exception("Failed to copy to buffer.", E_INVALIDARG);
|
|
}
|
|
|
|
if (_Num_elems == 0) {
|
|
return _Event();
|
|
}
|
|
|
|
size_t _NumElemsToCopy = (_Num_elems * _Src->_Get_elem_size()) / sizeof(_Value_type);
|
|
|
|
// If the src is host accessible for read then we do the copy on
// accelerator(accelerator::cpu_accelerator).default_view
|
|
if (_Src->_Is_host_accessible(_Read_access))
|
|
{
|
|
// Map the _Src buffer
|
|
_Event _Ev = _Src->_Map_buffer_async(_Read_access);
|
|
|
|
// The _Src is accessible on host. We just need to do a std::copy using a raw pointer as OutputIterator
|
|
_Buffer_ptr _PSrcBuf = _Src;
|
|
_Ev = _Ev._Add_continuation(std::function<_Event()>([_PSrcBuf, _Src_offset, _DestIter, _NumElemsToCopy]() mutable -> _Event
|
|
{
|
|
// The _Src is accessible on host. We just need to do a std::copy
|
|
const _Value_type *_PFirst = reinterpret_cast<const _Value_type*>(reinterpret_cast<char*>(_PSrcBuf->_Get_host_ptr()) + (_Src_offset * _PSrcBuf->_Get_elem_size()));
|
|
std::copy(_PFirst, _PFirst + _NumElemsToCopy, _DestIter);
|
|
|
|
return _Event();
|
|
}));
|
|
|
|
return _Ev;
|
|
}
|
|
else
|
|
{
|
|
// The _Src is on the device. We need to copy it out to a temporary staging array
|
|
// We may create a staging buffer of size smaller than the copy size and in that case we will
|
|
// perform the copy as a series of smaller copies
|
|
|
|
_Event _Ev;
|
|
|
|
_Buffer_ptr _PSrcBuf = _Src;
|
|
size_t _PreferredNumElemsToCopyPerChunk = _Preferred_copy_chunk_num_elems;
|
|
if (_PreferredNumElemsToCopyPerChunk == 0) {
|
|
// If a preferred copy chunk size was not specified, let's pick one based on the
// size of the copy
|
|
_PreferredNumElemsToCopyPerChunk = _Get_preferred_copy_chunk_num_elems(_NumElemsToCopy, sizeof(_Value_type));
|
|
}
|
|
|
|
size_t _AllocationNumElems = _PreferredNumElemsToCopyPerChunk;
|
|
if (_NumElemsToCopy < _AllocationNumElems) {
|
|
_AllocationNumElems = _NumElemsToCopy;
|
|
}
|
|
|
|
_Buffer_ptr _PSrcStagingBuf = _Buffer::_Get_temp_staging_buffer(_Src->_Get_accelerator_view(),
|
|
_AllocationNumElems, sizeof(_Value_type));
|
|
|
|
_ASSERTE(_PSrcStagingBuf != NULL);
|
|
_ASSERTE(_PSrcStagingBuf->_Get_elem_size() == sizeof(_Value_type));
|
|
|
|
// The total byte size of a copy chunk must be an integral multiple of both the
|
|
// source buffer's element size and sizeof(_Value_type).
|
|
size_t _Lcm = _Least_common_multiple(_Src->_Get_elem_size(), sizeof(_Value_type));
|
|
size_t _AdjustmentRatio = _Lcm / sizeof(_Value_type);
|
|
|
|
size_t _CurrNumElemsToCopy = _AllocationNumElems;
|
|
if (_CurrNumElemsToCopy > _PSrcStagingBuf->_Get_num_elems()) {
|
|
_CurrNumElemsToCopy = _PSrcStagingBuf->_Get_num_elems();
|
|
}
|
|
if (_NumElemsToCopy <= _CurrNumElemsToCopy)
|
|
{
|
|
_CurrNumElemsToCopy = _NumElemsToCopy;
|
|
}
|
|
else
|
|
{
|
|
// We need to adjust _CurrNumElemsToCopy to be a multiple of the
// least common multiple of the source buffer's element size and sizeof(_Value_type).
|
|
_CurrNumElemsToCopy = (_CurrNumElemsToCopy / _AdjustmentRatio) * _AdjustmentRatio;
|
|
}
|
|
|
|
_ASSERTE((_CurrNumElemsToCopy % _AdjustmentRatio) == 0);
|
|
|
|
size_t _NumElemsToCopyRemaining = _NumElemsToCopy - _CurrNumElemsToCopy;
|
|
|
|
_Ev = _PSrcBuf->_Copy_to_async(_PSrcStagingBuf, (_CurrNumElemsToCopy * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size(), _Src_offset, 0);
|
|
|
|
if (_NumElemsToCopyRemaining != 0)
|
|
{
|
|
_Ev = _Ev._Add_continuation(std::function<_Event()>([_DestIter, _PSrcBuf, _PSrcStagingBuf,
|
|
_CurrNumElemsToCopy, _NumElemsToCopyRemaining,
|
|
_Src_offset, _PreferredNumElemsToCopyPerChunk]() mutable -> _Event
|
|
{
|
|
// Initiate an asynchronous copy of the remaining part so that this part of the copy
// makes progress while we complete the copying of the first part
|
|
size_t _CurrSrcOffset = _Src_offset + ((_CurrNumElemsToCopy * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size());
|
|
OutputIterator _CurrDestIter = _DestIter;
|
|
_Advance_output_iterator<decltype(_CurrDestIter), size_t>(_CurrDestIter, _CurrNumElemsToCopy);
|
|
_Event _Ret_ev = _Copy_impl<OutputIterator, _Value_type>(_PSrcBuf._Get_ptr(), _CurrSrcOffset,
|
|
(_NumElemsToCopyRemaining * sizeof(_Value_type)) / _PSrcBuf->_Get_elem_size(),
|
|
_CurrDestIter, _PreferredNumElemsToCopyPerChunk);
|
|
|
|
// Now copy the data from staging buffer to the destination
|
|
_Value_type *_PFirst = reinterpret_cast<_Value_type*>(_PSrcStagingBuf->_Get_host_ptr());
|
|
std::copy(_PFirst, _PFirst + _CurrNumElemsToCopy, _DestIter);
|
|
return _Ret_ev;
|
|
}));
|
|
}
|
|
else
|
|
{
|
|
_Ev = _Ev._Add_continuation(std::function<_Event()>([_DestIter, _PSrcStagingBuf, _CurrNumElemsToCopy]() mutable -> _Event
|
|
{
|
|
_Value_type *_PFirst = reinterpret_cast<_Value_type*>(_PSrcStagingBuf->_Get_host_ptr());
|
|
std::copy(_PFirst, _PFirst + _CurrNumElemsToCopy, _DestIter);
|
|
return _Event();
|
|
}));
|
|
}
|
|
|
|
return _Ev;
|
|
}
|
|
}
|
|
|
|
// Structured copy between buffers across AVs
|
|
_AMPIMP _Event __cdecl _Copy_impl(_In_ _Buffer *_Src, _View_shape_ptr _Src_shape, _Out_ _Buffer * _Dst, _View_shape_ptr _Dst_shape);
|
|
|
|
struct _Array_copy_desc
|
|
{
|
|
_Array_copy_desc(
|
|
const unsigned int _Rank,
|
|
const unsigned int _Src_linear_offset,
|
|
const unsigned int * _Src_extents,
|
|
const unsigned int * _Src_copy_offset,
|
|
const unsigned int _Dst_linear_offset,
|
|
const unsigned int * _Dst_extents,
|
|
const unsigned int * _Dst_copy_offset,
|
|
const unsigned int * _Copy_extents)
|
|
{
|
|
this->_Rank = _Rank;
|
|
|
|
this->_Src_linear_offset = _Src_linear_offset;
|
|
this->_Src_extents.assign( _Src_extents, _Src_extents + _Rank);
|
|
this->_Src_copy_offset.assign( _Src_copy_offset, _Src_copy_offset + _Rank);
|
|
|
|
this->_Dst_linear_offset = _Dst_linear_offset;
|
|
this->_Dst_extents.assign( _Dst_extents, _Dst_extents + _Rank);
|
|
this->_Dst_copy_offset.assign( _Dst_copy_offset, _Dst_copy_offset + _Rank);
|
|
|
|
this->_Copy_extents.assign( _Copy_extents, _Copy_extents + _Rank);
|
|
}
|
|
|
|
_Array_copy_desc() {}
|
|
|
|
unsigned int _Rank;
|
|
|
|
// Shape of source
|
|
unsigned int _Src_linear_offset;
|
|
std::vector<unsigned int> _Src_extents;
|
|
std::vector<unsigned int> _Src_copy_offset;
|
|
|
|
// Shape of destination
|
|
unsigned int _Dst_linear_offset;
|
|
std::vector<unsigned int> _Dst_extents;
|
|
std::vector<unsigned int> _Dst_copy_offset;
|
|
|
|
// Shape of copy region
|
|
std::vector<unsigned int> _Copy_extents;
|
|
};
|
|
|
|
// Declaration
|
|
_AMPIMP HRESULT __cdecl _Recursive_array_copy(const _Array_copy_desc& _Desc,
|
|
unsigned int _Native_copy_rank,
|
|
std::function<HRESULT(const _Array_copy_desc &_Reduced)> _Native_copy_func);
|
|
|
|
_AMPIMP std::pair<accelerator_view, accelerator_view> __cdecl _Get_src_dest_accelerator_view(_In_opt_ const _Buffer_descriptor *_SrcBuffDescPtr,
|
|
_In_opt_ const _Buffer_descriptor *_DestBuffDescPtr);
|
|
|
|
// Iterator based copy function
|
|
template<typename _InputInterator, typename _OutputIterator>
|
|
inline _Event _Copy_impl_iter(_InputInterator _SrcFirst, _InputInterator _SrcLast, _OutputIterator _DstFirst)
|
|
{
|
|
std::copy(_SrcFirst, _SrcLast, _DstFirst);
|
|
return _Event();
|
|
}
|
|
|
|
// Iterator based copy function
|
|
template <typename InputIterator, typename _Value_type>
|
|
inline _Event _Copy_impl(InputIterator _SrcFirst, _View_shape_ptr _Src_shape, _Inout_ _Buffer * _Dst, _View_shape_ptr _Dst_shape)
|
|
{
|
|
_ASSERTE(_Dst != NULL);
|
|
_ASSERTE(_Src_shape != NULL);
|
|
_ASSERTE(_Dst_shape != NULL);
|
|
|
|
if (_Src_shape->_Is_projection()) {
|
|
_Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
|
|
}
|
|
|
|
if (_Dst_shape->_Is_projection()) {
|
|
_Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
|
|
}
|
|
|
|
_ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());
|
|
|
|
_ASSERTE(_View_shape::_Compare_extent_with_elem_size(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(),
|
|
sizeof(_Value_type), _Dst_shape->_Get_view_extent(), _Dst->_Get_elem_size()));
|
|
|
|
if (_Dst->_Is_host_accessible(_Write_access))
|
|
{
|
|
// The destination buffer is accessible on the host. Map the _Dst buffer
|
|
_Event _Ev = _Dst->_Map_buffer_async(_Write_access);
|
|
_Buffer_ptr _PDestBuf = _Dst;
|
|
return _Ev._Add_continuation(std::function<_Event()>([_SrcFirst, _Src_shape, _PDestBuf, _Dst_shape]() mutable -> _Event {
|
|
return _Copy_impl_iter(_SrcFirst, _Src_shape, stdext::make_unchecked_array_iterator(reinterpret_cast<_Value_type*>(_PDestBuf->_Get_host_ptr())),
|
|
_Create_reinterpreted_shape(_Dst_shape, _PDestBuf->_Get_elem_size(), sizeof(_Value_type)));
|
|
}));
|
|
}
|
|
else
|
|
{
|
|
// The dest buffer is not accessible on the host. Let's create a temporary
// staging buffer on the destination buffer's accelerator_view
|
|
_Buffer_ptr _PTempStagingBuf = _Buffer::_Create_stage_buffer(_Dst->_Get_accelerator_view(), accelerator(accelerator::cpu_accelerator).default_view,
|
|
_Src_shape->_Get_view_size(), sizeof(_Value_type), true /* _Is_temp */);
|
|
|
|
_PTempStagingBuf->_Map_buffer(_Write_access, true /* _Wait */);
|
|
_Value_type *_Dst_ptr = reinterpret_cast<_Value_type*>(_PTempStagingBuf->_Get_host_ptr());
|
|
_Event _Ev = _Copy_impl_iter(_SrcFirst, _Src_shape, stdext::make_unchecked_array_iterator(_Dst_ptr), _Src_shape);
|
|
|
|
// Now copy from the staging buffer to the destination buffer
|
|
_Buffer_ptr _PDestBuf = _Dst;
|
|
return _Ev._Add_continuation(std::function<_Event()>([_PTempStagingBuf, _Src_shape, _PDestBuf, _Dst_shape]() mutable -> _Event {
|
|
return _Copy_impl(_PTempStagingBuf, _Src_shape, _PDestBuf, _Dst_shape);
|
|
}));
|
|
}
|
|
}
|
|
|
|
template <typename OutputIterator, typename _Value_type>
|
|
inline _Event _Copy_impl(_In_ _Buffer *_Src, _View_shape_ptr _Src_shape, OutputIterator _DestIter, _View_shape_ptr _Dst_shape)
|
|
{
|
|
_ASSERTE(_Src != NULL);
|
|
_ASSERTE(_Src_shape != NULL);
|
|
_ASSERTE(_Dst_shape != NULL);
|
|
|
|
if (_Src_shape->_Is_projection()) {
|
|
_Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
|
|
}
|
|
|
|
if (_Dst_shape->_Is_projection()) {
|
|
_Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
|
|
}
|
|
|
|
_ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());
|
|
|
|
_ASSERTE(_View_shape::_Compare_extent_with_elem_size(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(),
|
|
_Src->_Get_elem_size(), _Dst_shape->_Get_view_extent(), sizeof(_Value_type)));
|
|
|
|
if (_Src->_Is_host_accessible(_Read_access))
|
|
{
|
|
// The source buffer is accessible on the host. Map the _Src buffer
|
|
_Event _Ev = _Src->_Map_buffer_async(_Read_access);
|
|
|
|
_Buffer_ptr _PSrcBuf = _Src;
|
|
return _Ev._Add_continuation(std::function<_Event()>([_PSrcBuf, _Src_shape, _DestIter, _Dst_shape]() mutable -> _Event {
|
|
return _Copy_impl_iter(reinterpret_cast<_Value_type*>(_PSrcBuf->_Get_host_ptr()),
|
|
_Create_reinterpreted_shape(_Src_shape, _PSrcBuf->_Get_elem_size(), sizeof(_Value_type)),
|
|
_DestIter, _Dst_shape);
|
|
}));
|
|
}
|
|
else
|
|
{
|
|
// The source buffer is not accessible on the host. Let's create a temporary
// staging buffer on the source buffer's accelerator_view and initiate a copy
// from the source buffer to the temporary staging buffer
|
|
_Buffer_ptr _PTempStagingBuf = _Buffer::_Create_stage_buffer(_Src->_Get_accelerator_view(), accelerator(accelerator::cpu_accelerator).default_view,
|
|
_Dst_shape->_Get_view_size(), sizeof(_Value_type), true);
|
|
|
|
_Event _Ev = _Src->_Copy_to_async(_PTempStagingBuf, _Src_shape, _Dst_shape);
|
|
return _Ev._Add_continuation(std::function<_Event()>([_PTempStagingBuf, _Dst_shape, _DestIter]() mutable -> _Event {
|
|
return _Copy_impl_iter(reinterpret_cast<_Value_type*>(_PTempStagingBuf->_Get_host_ptr()),
|
|
_Dst_shape, _DestIter, _Dst_shape);
|
|
}));
|
|
}
|
|
}
|
|
|
|
// Iterator based structured copy function
|
|
template<typename _InputInterator, typename _OutputIterator>
|
|
inline _Event _Copy_impl_iter(_InputInterator _SrcIter, _View_shape_ptr _Src_shape,
|
|
_OutputIterator _DstIter, _View_shape_ptr _Dst_shape)
|
|
{
|
|
if (_Src_shape->_Is_projection()) {
|
|
_Src_shape = _Src_shape->_Get_reduced_shape_for_copy();
|
|
}
|
|
|
|
if (_Dst_shape->_Is_projection()) {
|
|
_Dst_shape = _Dst_shape->_Get_reduced_shape_for_copy();
|
|
}
|
|
|
|
_ASSERTE(_Src_shape->_Get_rank() == _Dst_shape->_Get_rank());
|
|
_ASSERTE(_View_shape::_Compare_extent(_Src_shape->_Get_rank(), _Src_shape->_Get_view_extent(), _Dst_shape->_Get_view_extent()));
|
|
|
|
// If both the _Src_shape and _Dst_shape are linear we can be more efficient
|
|
unsigned int _Src_linear_offset, _Src_linear_size, _Dst_linear_offset, _Dst_linear_size;
|
|
if (_Src_shape->_Is_view_linear(_Src_linear_offset, _Src_linear_size) &&
|
|
_Dst_shape->_Is_view_linear(_Dst_linear_offset, _Dst_linear_size))
|
|
{
|
|
_ASSERTE(_Src_linear_size == _Dst_linear_size);
|
|
|
|
// These iterators might not be contiguous, therefore we use std::advance
|
|
std::advance(_SrcIter, _Src_linear_offset);
|
|
auto _SrcLast = _SrcIter;
|
|
std::advance(_SrcLast, _Src_linear_size);
|
|
std::advance(_DstIter, _Dst_linear_offset);
|
|
|
|
return _Copy_impl_iter(_SrcIter, _SrcLast, _DstIter);
|
|
}
|
|
|
|
std::vector<unsigned int> _Src_extent(_Src_shape->_Get_rank());
|
|
std::vector<unsigned int> _Src_offset(_Src_shape->_Get_rank());
|
|
std::vector<unsigned int> _Dst_extent(_Dst_shape->_Get_rank());
|
|
std::vector<unsigned int> _Dst_offset(_Dst_shape->_Get_rank());
|
|
std::vector<unsigned int> _Copy_extent(_Src_shape->_Get_rank());
|
|
|
|
for (size_t i = 0; i < _Src_shape->_Get_rank(); ++i) {
|
|
_Src_extent[i] = _Src_shape->_Get_base_extent()[i];
|
|
_Src_offset[i] = _Src_shape->_Get_view_offset()[i];
|
|
_Dst_extent[i] = _Dst_shape->_Get_base_extent()[i];
|
|
_Dst_offset[i] = _Dst_shape->_Get_view_offset()[i];
|
|
_Copy_extent[i] = _Src_shape->_Get_view_extent()[i];
|
|
}
|
|
|
|
_Array_copy_desc _Desc(
|
|
_Src_shape->_Get_rank(),
|
|
_Src_shape->_Get_linear_offset(),
|
|
_Src_extent.data(),
|
|
_Src_offset.data(),
|
|
_Dst_shape->_Get_linear_offset(),
|
|
_Dst_extent.data(),
|
|
_Dst_offset.data(),
|
|
_Copy_extent.data());
|
|
|
|
// Note: Capturing the shape pointers would be incorrect; they are only valid for setting up the call
// and might be deleted right after this call completes.
|
|
HRESULT hr = _Recursive_array_copy(_Desc, 1, [_SrcIter, _DstIter](const _Array_copy_desc &_Reduced) -> HRESULT {
|
|
|
|
auto _SrcFirst = _SrcIter;
|
|
auto _DstFirst = _DstIter;
|
|
|
|
std::advance(_DstFirst, _Reduced._Dst_linear_offset + _Reduced._Dst_copy_offset[0]);
|
|
std::advance(_SrcFirst, _Reduced._Src_linear_offset + _Reduced._Src_copy_offset[0]);
|
|
auto _SrcLast = _SrcFirst;
|
|
std::advance(_SrcLast, _Reduced._Copy_extents[0]);
|
|
|
|
std::copy(_SrcFirst, _SrcLast, _DstFirst);
|
|
|
|
return S_OK;
|
|
});
|
|
|
|
if (FAILED(hr)) {
|
|
throw Concurrency::runtime_exception("Failed to copy between buffers", E_FAIL);
|
|
}
|
|
|
|
return _Event();
|
|
}
|
|
|
|
// A ubiquitous buffer that provides access to the underlying data
|
|
// on any accelerator_view
|
|
class _Ubiquitous_buffer : public _Reference_counter
|
|
{
|
|
friend _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr);
|
|
friend _AMPIMP accelerator_view __cdecl _Select_copy_src_accelerator_view(_In_ _View_key _Src_view_key, const accelerator_view &_Dest_accelerator_view);
|
|
friend struct _DPC_call_handle;
|
|
|
|
public:
|
|
|
|
_AMPIMP static _Ret_ _Ubiquitous_buffer * __cdecl _Create_ubiquitous_buffer(size_t _Num_elems, size_t _Elem_size);
|
|
|
|
_AMPIMP static _Ret_ _Ubiquitous_buffer * __cdecl _Create_ubiquitous_buffer(_Buffer_ptr _Master_buffer);
|
|
|
|
// Register a new view on top of this _Ubiquitous_buffer
|
|
_AMPIMP void _Register_view(_In_ _View_key _Key, accelerator_view _Cpu_av, _View_shape_ptr _Shape);
|
|
|
|
// Register a copy of an existing view registered with this _Ubiquitous_buffer
|
|
_AMPIMP void _Register_view_copy(_In_ _View_key _New_view_key, _In_ _View_key _Existing_view_key);
|
|
|
|
// Unregister a view currently registered with this _Ubiquitous_buffer
|
|
_AMPIMP void _Unregister_view(_In_ _View_key _Key);
|
|
|
|
// Obtain a specified mode of access to the specified view on the specified target
|
|
// accelerator_view. This method also serves the purpose of determining the
|
|
// amount of data copy expected to happen as part of this _Get_access request
|
|
// without actually performing the copies or state updates in the _Ubiquitous_buffer. This
|
|
// is used for reporting the implicit data copies that happen when accessing array_views
|
|
// in C++ AMP ETW events
|
|
_AMPIMP _Event _Get_access_async(_In_ _View_key _Key, _Accelerator_view_impl_ptr _Av_view_impl_ptr,
|
|
_Access_mode _Mode, _Buffer_ptr &_Buf_ptr,
|
|
_Inout_opt_ ULONGLONG *_Sync_size = nullptr);
|
|
|
|
// Discard the content underlying this view
|
|
_AMPIMP void _Discard(_In_ _View_key _Key);
|
|
|
|
// This method does not synchronize the copies. It should not be used for getting
// data access, but only to get the underlying buffer's properties
|
|
_AMPIMP _Buffer_ptr _Get_master_buffer() const;
|
|
|
|
_AMPIMP accelerator_view _Get_master_accelerator_view() const;
|
|
|
|
_AMPIMP _View_shape_ptr _Get_view_shape(_In_ _View_key _Key);
|
|
|
|
_Ret_ _Accelerator_view_impl* _Get_master_accelerator_view_impl() const
|
|
{
|
|
return _M_master_av;
|
|
}
|
|
|
|
size_t _Get_master_buffer_elem_size() const
|
|
{
|
|
return _M_master_buffer_elem_size;
|
|
}
|
|
|
|
size_t _Get_master_buffer_num_elems() const
|
|
{
|
|
return _M_master_buffer_num_elems;
|
|
}
|
|
|
|
bool _Has_data_source() const
|
|
{
|
|
return _M_has_data_source;
|
|
}
|
|
|
|
    private:

        // The _Ubiquitous_buffer constructors are private to force construction through the static
        // _Create_ubiquitous_buffer method to ensure the object is allocated in the runtime
        _Ubiquitous_buffer(size_t _Num_elems, size_t _Elem_size);
        _Ubiquitous_buffer(_In_ _Buffer* _Master_buffer);

        // Private destructor to force deletion through _Release
        ~_Ubiquitous_buffer();

        // No default constructor, copy constructor or assignment operator
        _Ubiquitous_buffer();
        _Ubiquitous_buffer(const _Ubiquitous_buffer &rhs);
        _Ubiquitous_buffer &operator=(const _Ubiquitous_buffer &rhs);

        // Helper methods

        // Get access to a buffer on a specified accelerator for a specified pre-registered view.
        // If the _Sync_size parameter is not null, the function calculates the number of bytes
        // that need to be synchronized to obtain the desired access.
        _AMPIMP _Event _Get_access_async(_In_ _View_key _Key, accelerator_view _Av, _Access_mode _Mode,
                                         _Buffer_ptr &_Buf_ptr, _Inout_opt_ ULONGLONG *_Sync_size = NULL);

        // Commit a view to the master buffer if needed. When the _Sync_size parameter is non-null
        // this method just returns the amount of data to be copied as part of the commit, without
        // actually performing the commit
        _Event _Commit_view_async(_In_ _View_info *_Info, _Inout_ ULONGLONG *_Sync_size = nullptr);

        // Get the _Buffer_ptr corresponding to a specified accelerator_view. When the
        // _Create parameter is true, it creates a new _Buffer if one does not already exist
        // for that accelerator_view
        _Ret_ _Buffer* _Get_buffer(_In_ _Accelerator_view_impl* _Av, bool _Create = true);

        // Sets a new access mode for the specified view
        void _Set_new_access_mode(_Inout_ _View_info *_Info, _Access_mode _New_mode);

        // Unsets the discard flag from the specified view and all other
        // overlapping views
        void _Unset_discard_flag(_Inout_ _View_info *_Info);

        // Determines whether the data underlying the specified view has been discarded
        // based on whether a subsuming view has the discard flag set.
        bool _Should_discard(const _View_info *_Info) const;

        // Does this view have exclusive data which is not discarded,
        // not on the master accelerator_view, and with no other view
        // that subsumes this view and is marked dirty
        bool _Has_exclusive_data(const _View_info *_Info) const;

        // Based on the current state of overlapping views in the _Ubiquitous_buffer,
        // does the specified view require a data update on the target accelerator_view
        // to fulfil an access request
        bool _Requires_update_on_target_accelerator_view(const _View_info *_Info,
                                                         _Access_mode _Requested_mode,
                                                         _In_ _Accelerator_view_impl* _Target_acclerator_view) const;

        // This method iterates over all views in the specified commit list
        // and flags them as "commit not needed" if that view is subsumed by another view present in the
        // commit list
        static void _Flag_redundant_commits(std::vector<std::pair<_View_info*, bool>> &_Commit_list);

        // This method returns the list of accelerator_views where the specified view already has
        // a valid cached copy of the data and getting read access would not incur any data movement.
        // The _Can_access_anywhere parameter is an output parameter used to indicate to the
        // caller that the specified view can be accessed on any accelerator_view without incurring
        // any data movement. This is true when there are no modified overlapping views that require
        // synchronization and the specified view has the discard_data flag set.
        // This method is used for determining the source accelerator_view for copy and p_f_e operations
        // involving array_views
        _Accelerator_view_unordered_set _Get_caching_info(_In_ _View_key _Key, _Out_opt_ bool *_Can_access_anywhere = NULL);

        _Accelerator_view_unordered_set _Get_caching_info_impl(_In_ _View_key _Key, _Out_opt_ bool *_Can_access_anywhere);

        _Ret_ _Accelerator_view_impl* _Determine_alternate_target_accelerator_view(_In_ _View_key _Key,
                                                                                   _In_ _Accelerator_view_impl* _Original_av,
                                                                                   _Access_mode _Mode);

    private:

        // Private data

        // The master accelerator_view for this _Ubiquitous_buffer,
        // which is specified at construction time
        _Accelerator_view_impl_ptr _M_master_av;

        // The master _Buffer corresponding to this _Ubiquitous_buffer,
        // which is specified at construction time
        _Buffer* _M_master_buffer;

        // The size of each element of the master buffer
        size_t _M_master_buffer_elem_size;

        // The number of elements in the master buffer
        size_t _M_master_buffer_num_elems;

        // Indicates if this ubiquitous buffer has an underlying data source
        bool _M_has_data_source;

        // A map of pre-created _Buffers corresponding to the different
        // accelerator_views where the _Ubiquitous_buffer has already been
        // accessed
        std::map<_Accelerator_view_impl_ptr, _Buffer_ptr> _M_buffer_map;

        // A mapping from all registered view keys in this _Ubiquitous_buffer
        // to their corresponding _View_info
        std::unordered_map<_View_key, _View_info*> _M_view_map;

        // Set of distinct views of this buffer. As multiple copies of the same
        // view may have been registered for this _Ubiquitous_buffer, this set
        // maintains the set of distinct views which really matter for the
        // caching protocol. Also, note that a view_info may not have any live registered
        // view keys, and hence not exist in _M_view_map, but may still exist here since
        // it has uncommitted data which needs to be considered as part of the cache
        // coherence protocol to prevent modifications to the data underlying this view from being lost
        std::unordered_set<_View_info*> _M_view_info_set;

        // Critical section object to protect the cache directory
        Concurrency::critical_section _M_critical_section;
    };
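
    // Illustrative sketch (not part of this header): a plausible registration and access
    // sequence against a _Ubiquitous_buffer, inferred from the declarations above. The
    // variable names and the concrete _View_key/_View_shape_ptr values are hypothetical.
    //
    //     _Ubiquitous_buffer *_Ubuf = _Ubiquitous_buffer::_Create_ubiquitous_buffer(_Num_elems, _Elem_size);
    //     _Ubuf->_Register_view(_Key, _Cpu_av, _Shape);        // describe a view over the data
    //
    //     _Buffer_ptr _Buf;
    //     _Event _Ev = _Ubuf->_Get_access_async(_Key, _Av_impl_ptr, _Read_write_access, _Buf);
    //     _Ev._Get();                                          // wait for any implicit copies
    //
    //     _Ubuf->_Unregister_view(_Key);                       // lifetime is managed by reference counting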

    // Class defines functions for interoperability with D3D
    class _D3D_interop
    {
    public:
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_buffer(_In_ _Buffer *_Buffer_ptr);
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_texture(_In_ _Texture *_Texture_ptr);
        _AMPIMP static _Ret_ void * __cdecl _Get_D3D_sampler_data_ptr(_In_ IUnknown *_D3D_sampler);
        _AMPIMP static void __cdecl _Release_D3D_sampler_data_ptr(_In_ void *_Sampler_data_ptr);
        _AMPIMP static _Ret_ IUnknown * __cdecl _Get_D3D_sampler(const Concurrency::accelerator_view &_Av, _In_ _Sampler *_Sampler_ptr);
    };

    inline
    _Event _Get_access_async(const _View_key _Key, accelerator_view _Av, _Access_mode _Mode, _Buffer_ptr &_Buf_ptr)
    {
        return _Key->_Get_buffer_ptr()->_Get_access_async(_Key->_Get_view_key(), _Av, _Mode, _Buf_ptr);
    }

    inline
    _Ret_ _View_shape* _Get_buffer_view_shape(const _Buffer_descriptor& _Descriptor)
    {
        return _Descriptor._Get_buffer_ptr()->_Get_view_shape(_Descriptor._Get_view_key());
    }

    inline
    bool _Is_cpu_accelerator(const accelerator& _Accl)
    {
        return (_Accl.device_path == accelerator::cpu_accelerator);
    }
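
    // Illustrative sketch (not part of this header): how the helpers above might be combined to
    // obtain read access to the data behind a _Buffer_descriptor on a particular accelerator_view.
    // The names _Desc, _Key and _Target_av are hypothetical.
    //
    //     _View_key _Key = _Desc._Get_view_key();
    //     _Buffer_ptr _Buf;
    //     _Event _Ev = _Get_access_async(_Key, _Target_av, _Read_access, _Buf);
    //     _Ev._Get();                                          // block until the data is resident
    //
    //     _View_shape_ptr _Shape = _Get_buffer_view_shape(_Desc);
    //     bool _On_cpu = _Is_cpu_accelerator(_Target_av.accelerator);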

} // namespace Concurrency::details

/// <summary>
/// Exception thrown when an underlying DirectX call fails
/// due to the Windows timeout detection and recovery mechanism
/// </summary>
class accelerator_view_removed : public runtime_exception
{
public:
    /// <summary>
    /// Construct an accelerator_view_removed exception with a message and
    /// a view removed reason code
    /// </summary>
    /// <param name="_Message">
    /// Descriptive message of error
    /// </param>
    /// <param name="_View_removed_reason">
    /// HRESULT error code indicating the cause of removal of the accelerator_view
    /// </param>
    _AMPIMP explicit accelerator_view_removed(const char * _Message, HRESULT _View_removed_reason) throw();

    /// <summary>
    /// Construct an accelerator_view_removed exception
    /// </summary>
    /// <param name="_View_removed_reason">
    /// HRESULT error code indicating the cause of removal of the accelerator_view
    /// </param>
    _AMPIMP explicit accelerator_view_removed(HRESULT _View_removed_reason) throw();

    /// <summary>
    /// Returns an HRESULT error code indicating the cause of the accelerator_view's removal
    /// </summary>
    /// <returns>
    /// The HRESULT error code that indicates the cause of accelerator_view's removal
    /// </returns>
    _AMPIMP HRESULT get_view_removed_reason() const throw();

private:

    HRESULT _M_view_removed_reason_code;
}; // class accelerator_view_removed
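
// Illustrative sketch (not part of this header): application code would typically catch this
// exception around a parallel_for_each or copy call and inspect the removal reason before
// deciding whether to re-create the accelerator_view. The kernel body shown is hypothetical.
//
//     try {
//         parallel_for_each(av, data.extent, [=](index<1> idx) restrict(amp) { /* ... */ });
//     }
//     catch (const accelerator_view_removed &ex) {
//         HRESULT reason = ex.get_view_removed_reason();
//         // e.g. log the reason, recover the device, and retry on a fresh accelerator_view
//     }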

/// <summary>
/// Exception thrown when the runtime fails to launch a kernel
/// using the compute domain specified at the parallel_for_each call site.
/// </summary>
class invalid_compute_domain : public runtime_exception
{
public:
    /// <summary>
    /// Construct an invalid_compute_domain exception with a message
    /// </summary>
    /// <param name="_Message">
    /// Descriptive message of error
    /// </param>
    _AMPIMP explicit invalid_compute_domain(const char * _Message) throw();

    /// <summary>
    /// Construct an invalid_compute_domain exception
    /// </summary>
    _AMPIMP invalid_compute_domain() throw();
}; // class invalid_compute_domain

/// <summary>
/// Exception thrown when an unsupported feature is used
/// </summary>
class unsupported_feature : public runtime_exception
{
public:
    /// <summary>
    /// Construct an unsupported_feature exception with a message
    /// </summary>
    /// <param name="_Message">
    /// Descriptive message of error
    /// </param>
    _AMPIMP explicit unsupported_feature(const char * _Message) throw();

    /// <summary>
    /// Construct an unsupported_feature exception
    /// </summary>
    _AMPIMP unsupported_feature() throw();
}; // class unsupported_feature

} // namespace Concurrency

// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// Compiler/Runtime Interface
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

#define HELPERAPI __cdecl

using namespace Concurrency::details;

extern "C" {

    // The following types are used for storing information about resources required by the kernel.
    enum _Resource_kind
    {
        RESOURCE_BUFFER = 0,
        RESOURCE_TEXTURE = 1,
        RESOURCE_SAMPLER = 2,
    };

    struct _Device_resource_info
    {
        _Resource_kind _M_resource_kind; // buffer, texture, or sampler

        void * _M_desc; // Pointer to the _Buffer_descriptor/_Texture_descriptor/_Sampler_descriptor instance
                        // which underlies the device resource

        _Access_mode _M_formal_access_mode; // scalar: read-only
                                            // const scalar ref: read-only
                                            // scalar ref: ReadWrite
                                            // array: ReadWrite
                                            // const array: ReadOnly
        size_t _M_actual_arg_num;

        BOOL _Is_buffer() const
        {
            return (_M_resource_kind == RESOURCE_BUFFER);
        }

        BOOL _Is_texture() const
        {
            return (_M_resource_kind == RESOURCE_TEXTURE);
        }

        BOOL _Is_sampler() const
        {
            return (_M_resource_kind == RESOURCE_SAMPLER);
        }

        _Ret_ _Buffer_descriptor * _Get_buffer_desc() const
        {
            _ASSERTE(_Is_buffer());
            return reinterpret_cast<_Buffer_descriptor *>(_M_desc);
        }

        _Ret_ _Texture_descriptor * _Get_texture_desc() const
        {
            _ASSERTE(_Is_texture());
            return reinterpret_cast<_Texture_descriptor *>(_M_desc);
        }

        _Ret_ _Sampler_descriptor * _Get_sampler_desc() const
        {
            _ASSERTE(_Is_sampler());
            return reinterpret_cast<_Sampler_descriptor *>(_M_desc);
        }

        _Ret_ void * _Get_resource_ptr() const
        {
            if (_Is_buffer())
            {
                _Ubiquitous_buffer * _Tmp = _Get_buffer_desc()->_Get_buffer_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
            else if (_Is_texture())
            {
                _Texture * _Tmp = _Get_texture_desc()->_Get_texture_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
            else
            {
                _ASSERTE(_Is_sampler());
                _Sampler * _Tmp = _Get_sampler_desc()->_Get_sampler_ptr();
                return reinterpret_cast<void *>(_Tmp);
            }
        }
    };
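
    // Illustrative sketch (not part of this header): code holding a _Device_resource_info entry
    // would typically dispatch on the resource kind before touching the descriptor. The variable
    // name _Res is hypothetical.
    //
    //     if (_Res._Is_buffer()) {
    //         _Buffer_descriptor *_Buf_desc = _Res._Get_buffer_desc();
    //         // _Res._Get_resource_ptr() returns the _Ubiquitous_buffer behind _Buf_desc
    //     }
    //     else if (_Res._Is_texture()) {
    //         _Texture_descriptor *_Tex_desc = _Res._Get_texture_desc();
    //     }
    //     else {
    //         _Sampler_descriptor *_Sampler_desc = _Res._Get_sampler_desc();
    //     }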

    // This structure is used for storing information about the const buffers
    struct _Device_const_buffer_info
    {
        void * _M_data; // Pointer to the host data to initialize the
                        // constant buffer with

        size_t _M_const_buf_size; // Size of the const buffer in bytes

        unsigned int _M_is_debug_data; // Is this debug data which will be
                                       // initialized by the runtime. 0 (false), 1 (true)
    };
}

namespace Concurrency
{
namespace details
{
    enum _DPC_kernel_func_kind
    {
        NON_ALIASED_SHADER = 0, // slot 0
        ALIASED_SHADER = 1,     // slot 1
        NUM_SHADER_VERSIONS = 2
    };

    struct _DPC_call_handle
    {
        _Accelerator_view_impl *_M_rv;
        bool _M_is_explicit_target_acclview;

        // Info about the kernel function arguments
        _Device_resource_info * _M_device_resource_info;
        size_t _M_num_resources;
        size_t _M_num_writable_buffers;
        size_t _M_num_samplers;

        // Info about the host buffer created corresponding to the const buffer
        _Device_const_buffer_info * _M_const_buffer_info;
        size_t _M_num_const_buffers;

        bool _M_RW_aliasing;

        // Kernel funcs
        _DPC_shader_blob * _M_shader_blobs[NUM_SHADER_VERSIONS];

        // Compute domain info
        int _M_is_flat_model;
        unsigned int _M_compute_rank;
        unsigned int * _M_grid_extents;

        // Kernel dispatch info
        unsigned int _M_groupCountX;
        unsigned int _M_groupCountY;
        unsigned int _M_groupCountZ;

        // The shape of the group
        unsigned int _M_groupExtentX;
        unsigned int _M_groupExtentY;
        unsigned int _M_groupExtentZ;

        _DPC_call_handle(const accelerator_view &_Accelerator_view)
        {
            if (!_Accelerator_view.is_auto_selection) {
                _M_rv = _Get_accelerator_view_impl_ptr(_Accelerator_view);
            }
            else {
                _M_rv = NULL;
            }

            _M_is_explicit_target_acclview = false;
            if (_M_rv != NULL) {
                _M_is_explicit_target_acclview = true;
            }

            _M_device_resource_info = NULL;
            _M_num_resources = 0;
            _M_num_writable_buffers = 0;
            _M_num_samplers = 0;

            _M_const_buffer_info = NULL;
            _M_num_const_buffers = 0;

            _M_RW_aliasing = false;

            for (size_t _I = 0; _I < NUM_SHADER_VERSIONS; _I++)
            {
                _M_shader_blobs[_I] = NULL;
            }

            _M_is_flat_model = 0;
            _M_compute_rank = 0;
            _M_grid_extents = NULL;

            _M_groupCountX = 0;
            _M_groupCountY = 0;
            _M_groupCountZ = 0;

            _M_groupExtentX = 0;
            _M_groupExtentY = 0;
            _M_groupExtentZ = 0;
        }

        ~_DPC_call_handle()
        {
            if (_M_grid_extents) {
                delete [] _M_grid_extents;
            }
        }

        bool _Is_buffer_aliased(_In_ void *_Buffer_ptr)
        {
            return ((_M_aliased_buffer_set != nullptr) && (_M_aliased_buffer_set->find(_Buffer_ptr) != _M_aliased_buffer_set->end()));
        }

        bool _Is_buffer_unaccessed(size_t _Buffer_idx)
        {
            return ((_M_is_device_buffer_unaccessed != nullptr) && _M_is_device_buffer_unaccessed->operator[](_Buffer_idx));
        }

        void _Set_buffer_unaccessed(size_t _Buffer_idx)
        {
            if (_M_is_device_buffer_unaccessed == nullptr) {
                _M_is_device_buffer_unaccessed = std::unique_ptr<std::vector<bool>>(new std::vector<bool>(_M_num_resources, false));
            }

            _M_is_device_buffer_unaccessed->operator[](_Buffer_idx) = true;
        }

        const int* _Get_redirect_indices() const
        {
            if (!_M_RW_aliasing) {
                return nullptr;
            }

            _ASSERTE(_M_Redirect_indices != nullptr);

            return _M_Redirect_indices->data();
        }

        void _Check_buffer_aliasing();
        void _Update_buffer_rw_property();
        void _Setup_aliasing_redirection_indices();
        void _Select_accelerator_view();
        void _Verify_buffers_against_accelerator_view();

    private:
        std::unique_ptr<std::unordered_set<void*>> _M_aliased_buffer_set;
        std::unique_ptr<std::vector<bool>> _M_is_device_buffer_unaccessed;
        // Info about read-write aliasing
        std::unique_ptr<std::vector<int>> _M_Redirect_indices;
    };

    // This structure is used for passing scheduling info to the parallel_for_each;
    // it is handed back to the compiler-runtime interface methods by the front-end
    struct _Host_Scheduling_info
    {
        // The accelerator view to invoke a parallel_for_each on
        accelerator_view _M_accelerator_view;
    };

} // namespace Concurrency::details


/// <summary>
/// Uninitializes the C++ AMP runtime. It is legal to
/// call this function multiple times during an application's
/// lifetime. Calling any C++ AMP API after calling this function
/// will reinitialize the C++ AMP runtime. Note that it is illegal
/// to use C++ AMP objects across calls to this function and doing
/// so will result in undefined behavior. Also, concurrently calling
/// this function and any other C++ AMP API is illegal and would result
/// in undefined behavior.
/// </summary>
_AMPIMP void __cdecl amp_uninitialize();
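
// Illustrative sketch (not part of this header): amp_uninitialize is typically called only after
// every C++ AMP object has been destroyed, for example at the very end of main. The scoping
// shown here is hypothetical.
//
//     int main()
//     {
//         {
//             concurrency::array<int, 1> data(1024);   // all AMP objects live inside this scope
//             // ... launch kernels, perform copies ...
//         }                                            // AMP objects destroyed here
//         concurrency::amp_uninitialize();             // safe: no live AMP objects remain
//         return 0;
//     }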

} // namespace Concurrency

extern "C" {

    // Return a compiler helper handle.
    _AMPIMP _Ret_ _DPC_call_handle * HELPERAPI __dpc_create_call_handle(_In_ _Host_Scheduling_info *_Sch_info) throw(...);

    // Destroy the call handle
    _AMPIMP void HELPERAPI __dpc_release_call_handle(_In_ _DPC_call_handle * _Handle) throw(...);

    _AMPIMP void HELPERAPI __dpc_set_device_resource_info(_In_ _DPC_call_handle * _Handle, _In_ _Device_resource_info * _DeviceResourceInfo, size_t _NumResources) throw(...);

    // Set const buffer info.
    _AMPIMP void HELPERAPI __dpc_set_const_buffer_info(_In_ _DPC_call_handle * _Handle, _In_ _Device_const_buffer_info * _DeviceConstBufferInfo, size_t _NumConstBuffers) throw(...);

    // Set the kernel shader info
    _AMPIMP void HELPERAPI __dpc_set_kernel_shader_info(_In_ _DPC_call_handle * _Handle,
                                                        _Inout_ void ** _ShaderBlobs) throw(...);
    // Set kernel dispatch info
    _AMPIMP void HELPERAPI __dpc_set_kernel_dispatch_info(_In_ _DPC_call_handle * _Handle,
                                                          unsigned int _ComputeRank,
                                                          _In_ int * _Extents,
                                                          unsigned int _GroupRank,
                                                          const unsigned int * _GroupExtents,
                                                          unsigned int & _GroupCountX,
                                                          unsigned int & _GroupCountY,
                                                          unsigned int & _GroupCountZ) throw(...);

    // Dispatch the kernel
    _AMPIMP void HELPERAPI __dpc_dispatch_kernel(_In_ _DPC_call_handle * _Handle) throw(...);

#ifdef _DEBUG
    // Dispatch the kernel passed as a HLSL source level shader
    // This function is to be used only for testing and debugging purposes
    _AMPIMP void HELPERAPI __dpc_dispatch_kernel_test(_In_ _DPC_call_handle * _Handle, _In_ WCHAR* szFileName, LPCSTR szEntryPoint) throw(...);
#endif
}
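
// Illustrative sketch (not part of this header): a plausible order in which the compiler-generated
// parallel_for_each stub would drive the interface above, inferred purely from the declarations.
// Argument values are hypothetical and error handling is omitted.
//
//     _DPC_call_handle *_Handle = __dpc_create_call_handle(&_Sch_info);
//     __dpc_set_device_resource_info(_Handle, _Resource_info, _Num_resources);
//     __dpc_set_const_buffer_info(_Handle, _Const_buffer_info, _Num_const_buffers);
//     __dpc_set_kernel_shader_info(_Handle, _Shader_blobs);
//     __dpc_set_kernel_dispatch_info(_Handle, _Compute_rank, _Extents, _Group_rank, _Group_extents,
//                                    _Group_count_x, _Group_count_y, _Group_count_z);
//     __dpc_dispatch_kernel(_Handle);
//     __dpc_release_call_handle(_Handle);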

// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// C++ AMP ETW Provider
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

namespace Concurrency
{
namespace details
{

    // Thread-safe factory method for _Amp_runtime_trace object
    _AMPIMP _Ret_ _Amp_runtime_trace* __cdecl _Get_amp_trace();

    // Class that gathers C++ AMP diagnostic information and triggers events
    class _Amp_runtime_trace
    {

        // Called by factory to create single instance of _Amp_runtime_trace type
        friend BOOL CALLBACK _Init_amp_runtime_trace(PINIT_ONCE _Init_once, PVOID _Param, _Inout_ PVOID *_Context);

    public:
        // Destructor for _Amp_runtime_trace, called at program termination
        _AMPIMP ~_Amp_runtime_trace();

        // The end event is triggered by multiple other events, such as StartComputeEvent, to show exactly when a given activity completed
        _AMPIMP void _Write_end_event(ULONG _Span_id);

        // Add accelerator configuration information
        // Note: This member function does not have to be exported, it is used by C++ AMP runtime factory
        void _Add_accelerator_config_event(PVOID _Accelerator_id, LPCWSTR _Device_path, LPCWSTR _Device_description);

        // Used by the callback function to write all configuration data when a new session is detected
        // Note: This member function does not have to be exported, it is used by C++ AMP runtime factory
        void _Write_all_accelerator_config_events();

        // Started accelerator_view::wait operation
        // Note: This member function does not have to be exported, it is used by C++ AMP runtime factory
        ULONG _Start_accelerator_view_wait_event(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

        // Launched accelerator_view::flush operation
        // Note: This member function does not have to be exported, it is used by C++ AMP runtime factory
        void _Launch_flush_event(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

        // Launched accelerator_view::create_marker operation
        // Note: This member function does not have to be exported, it is used by C++ AMP runtime factory
        ULONG _Launch_marker(PVOID _Accelerator_id, PVOID _Accelerator_view_id);

        // Below is a set of helpers that take the various types available at the event injection point and extract all necessary data
        _AMPIMP ULONG _Start_parallel_for_each_event_helper(_In_ _DPC_call_handle *_Handle);

        // This helper wraps the functor with wait start and wait end events
        inline concurrency::completion_future _Start_async_op_wait_event_helper(ULONG _Async_op_id, _Event _Ev)
        {
            std::shared_future<void> retFuture;
            concurrency::task_completion_event<void> retTaskCompletionEvent;

            // Create a std::shared_future by creating a deferred task through std::async that waits for the
            // event _Ev to finish. Wrap the functor with start and end events
            retFuture = std::async(std::launch::sync, [=]() mutable {
                try
                {
                    if (_Async_op_id == _Amp_runtime_trace::_M_event_disabled)
                    {
                        _Ev._Get();
                    }
                    else
                    {
                        auto _Span_id = details::_Get_amp_trace()->_Start_async_op_wait_event(_Async_op_id);
                        _Ev._Get();
                        details::_Get_amp_trace()->_Write_end_event(_Span_id);
                    }
                }
                catch(...)
                {
                    // If an exception is encountered when executing the asynchronous operation
                    // we should set the exception on the retTaskCompletionEvent so that it is
                    // appropriately cancelled and the exception is propagated to continuations
                    retTaskCompletionEvent.set_exception(std::current_exception());
                    throw;
                }

                retTaskCompletionEvent.set();
            });

            // Register the async event with the runtime asynchronous events manager
            _Register_async_event(_Ev, retFuture);

            // Let's issue a continuation just to swallow any exceptions that are encountered during the
            // async operation and are never observed by the user, or are only observed through the
            // shared_future and not through the task
            concurrency::task<void> retTask(retTaskCompletionEvent);
            retTask.then([](concurrency::task<void> _Task) {
                try {
                    _Task.get();
                }
                catch(...) {
                }
            });

            return Concurrency::completion_future(retFuture, retTask);
        }

        _AMPIMP ULONG _Start_array_view_synchronize_event_helper(const _Buffer_descriptor &_Buff_desc);
        _AMPIMP ULONG _Launch_array_view_synchronize_event_helper(const _Buffer_descriptor &_Buff_desc);

        // Helpers for buffers (array, array_view)
        _AMPIMP ULONG _Start_copy_event_helper(const _Buffer_descriptor &_Src, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Start_copy_event_helper(nullptr_t, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Start_copy_event_helper(const _Buffer_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Launch_async_copy_event_helper(const _Buffer_descriptor &_Src, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Launch_async_copy_event_helper(nullptr_t, const _Buffer_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Launch_async_copy_event_helper(const _Buffer_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);

        // Helpers for textures
        _AMPIMP ULONG _Start_copy_event_helper(const _Texture_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Start_copy_event_helper(nullptr_t, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Start_copy_event_helper(const _Texture_descriptor &_Src, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Launch_async_copy_event_helper(const _Texture_descriptor &_Src, nullptr_t, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Launch_async_copy_event_helper(nullptr_t, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);
        _AMPIMP ULONG _Launch_async_copy_event_helper(const _Texture_descriptor &_Src, const _Texture_descriptor &_Dest, ULONGLONG _Num_bytes_for_copy);

        void _Enable_provider(bool _Enable = true);

    private:
        // Private constructor. This type is created by the factory method
        _Amp_runtime_trace(PVOID _Callback_function, _In_ _Trace *_Trace);

        // Disallow copy construction
        _Amp_runtime_trace(const _Amp_runtime_trace&);

        // Disallow assignment operator
        _Amp_runtime_trace& operator=(const _Amp_runtime_trace&);

        // Used internally to write configuration events
        void _Write_accelerator_config_event(const std::tuple<PVOID, LPCWSTR, LPCWSTR> &_ConfigTuple);

        // Event triggered when computation is scheduled
        ULONG _Start_parallel_for_each_event(
            PVOID _Accelerator_id,
            PVOID _Accelerator_view_id,
            BOOL _Is_tiled_explicitly,
            ULONGLONG _Num_of_tiles,
            ULONG _Num_of_threads_per_tile,
            BOOL _Is_aliased,
            ULONG _Num_read_only_resources,
            ULONG _Num_read_write_resources,
            ULONGLONG _Size_of_all_resouces,
            ULONG _Size_of_const_data,
            ULONGLONG _Size_of_data_for_copy);

        // Synchronous copy operation has started
        ULONG _Start_copy_event(
            PVOID _Src_accelerator_id,
            PVOID _Src_accelerator_view_id,
            PVOID _Dst_accelerator_id,
            PVOID _Dst_accelerator_view_id,
            ULONGLONG _Num_bytes_for_copy,
            BOOL _Is_src_staging,
            BOOL _Is_dst_staging);

        // Asynchronous copy operation has been launched
        ULONG _Launch_async_copy_event(
            PVOID _Src_accelerator_id,
            PVOID _Src_accelerator_view_id,
            PVOID _Dst_accelerator_id,
            PVOID _Dst_accelerator_view_id,
            ULONGLONG _Num_bytes_for_copy,
            BOOL _Is_src_staging,
            BOOL _Is_dst_staging);

        // Started waiting for asynchronous operation to complete
        _AMPIMP ULONG _Start_async_op_wait_event(ULONG _Async_op_id);

        // Started array_view::synchronize operation
        ULONG _Start_array_view_synchronize_event(ULONGLONG _Num_bytes_to_synchronize);

        // Async array_view::synchronize operation has been launched
        ULONG _Launch_array_view_synchronize_event(ULONGLONG _Num_bytes_to_synchronize);

        // Helper function that extracts information from a buffer descriptor
        std::tuple<PVOID, PVOID, BOOL> _Get_resource_diagnostic_info(const _Buffer_descriptor &_Buff_desc, accelerator_view _Accl_view) const;

        // Helper function that extracts information from a texture descriptor
        std::tuple<PVOID, PVOID, BOOL> _Get_resource_diagnostic_info(const _Texture_descriptor &_Tex_desc) const;

        // Generates unique identifiers for span_id and async_op_id
        ULONG _Get_unique_identifier();

        // Critical section object used by the callback function to synchronize the following situations:
        // a) multiple sessions have started at the same time
        // b) C++ AMP Runtime factory adds a new accelerator config event to the collection
        Concurrency::critical_section _M_critical_section;

        // Collection of all configuration events at the time of C++ AMP Runtime initialization
        std::vector<std::tuple<PVOID, LPCWSTR, LPCWSTR>> _M_accelerator_configs;

        // Unique counter for span id and async operation id
        volatile ULONG _M_counter;

        // Type that implements the ITrace interface and writes events, e.g. ETW events
        _Trace* _M_trace_ptr;

        // Special value that we return to chain events when the provider is disabled
        static const ULONG _M_event_disabled = 0;
    };
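
    // Illustrative sketch (not part of this header): the trace helpers above follow a span pattern
    // in which a _Start_* call returns a span id that is later closed with _Write_end_event, as the
    // wait helper above demonstrates. The surrounding code here is hypothetical.
    //
    //     ULONG _Span_id = _Get_amp_trace()->_Start_parallel_for_each_event_helper(_Handle);
    //     // ... dispatch the kernel ...
    //     _Get_amp_trace()->_Write_end_event(_Span_id);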

    // Helper function to query the number of mipmap levels from texture object
    inline unsigned int _Get_mipmap_levels(const _Texture *_Tex)
    {
        _ASSERTE(_Tex);
        return _Tex->_Get_mip_levels();
    }

} // namespace Concurrency::details
} // namespace Concurrency

namespace concurrency = Concurrency;

#pragma pack(pop)