Memory profiling & codebase cleanup

This commit is contained in:
PS 2021-03-18 02:19:35 -07:00
parent 4f199ee1c6
commit c054a0e6b6
25 changed files with 272 additions and 4399 deletions

2
compile.bat Normal file
View File

@ -0,0 +1,2 @@
@echo off
build\build_app_msvc_win32_debug.bat

View File

@ -1,3 +0,0 @@
@echo off
remedybg build\win32_foldhaus.rdbg

View File

@ -144,8 +144,8 @@ Editor_Render(app_state* State, context* Context, render_command_buffer* RenderB
// Draw the Interface
if (State->Interface.DrawOrderRoot != 0)
{
ui_widget Widget = *State->Interface.DrawOrderRoot;
Editor_DrawWidget(State, Context, RenderBuffer, Widget, Context->WindowBounds);
ui_widget* Widget = State->Interface.DrawOrderRoot;
Editor_DrawWidgetList(State, Context, RenderBuffer, Widget, Context->WindowBounds);
}
Context->GeneralWorkQueue->CompleteQueueWork(Context->GeneralWorkQueue, Context->ThreadContext);

View File

@ -77,12 +77,11 @@ Editor_GetWidgetFillBounds(ui_widget Widget)
return Result;
}
internal void
Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 ParentClipBounds)
{
rect2 WidgetParentUnion = Widget.Bounds;
WidgetParentUnion = Rect2Union(Widget.Bounds, ParentClipBounds);
internal void Editor_DrawWidgetList(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 ParentClipBounds);
internal void
Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 WidgetParentUnion)
{
bool IsActiveWidget = ui_WidgetIdsEqual(Widget.Id, State->Interface.ActiveWidget);
;
if (!Widget.Parent || (Rect2Area(WidgetParentUnion) > 0))
@ -146,18 +145,27 @@ Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* Ren
PushRenderBoundingBox2D(RenderBuffer, WidgetParentUnion.Min, WidgetParentUnion.Max, Thickness, Color);
}
}
if (Widget.ChildrenRoot)
{
Editor_DrawWidget(State, Context, RenderBuffer, *Widget.ChildrenRoot, WidgetParentUnion);
}
if (Widget.Next)
{
Editor_DrawWidget(State, Context, RenderBuffer, *Widget.Next, ParentClipBounds);
}
}
// Draws every widget in the sibling list beginning at Widget, then recurses
// into each widget's children.
//
// Each widget is clipped to the intersection of its own bounds and
// ParentClipBounds; that same intersection becomes the clip rect for the
// widget's children, so clipping accumulates down the tree.
//
// Iterative over siblings (follows ->Next), recursive over depth
// (->ChildrenRoot), replacing the old fully recursive Editor_DrawWidget walk.
internal void Editor_DrawWidgetList(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget* Widget, rect2 ParentClipBounds)
{
    for (ui_widget* WidgetAt = Widget; WidgetAt != 0; WidgetAt = WidgetAt->Next)
    {
        // FIX: was `rect2 WidgetParentUnion = WidgetAt->Bounds;` followed by an
        // immediate overwrite - the first assignment was a dead store.
        rect2 WidgetParentUnion = Rect2Union(WidgetAt->Bounds, ParentClipBounds);
        Editor_DrawWidget(State, Context, RenderBuffer, *WidgetAt, WidgetParentUnion);
        if (WidgetAt->ChildrenRoot)
        {
            Editor_DrawWidgetList(State, Context, RenderBuffer, WidgetAt->ChildrenRoot, WidgetParentUnion);
        }
    }
}
#define FOLDHAUS_EDITOR_DRAW_H
#endif // FOLDHAUS_EDITOR_DRAW_H

View File

@ -1582,7 +1582,7 @@ ui_InterfaceCreate(context Context, interface_config Style, gs_memory_arena* Per
Result.WidgetsCountMax = 4096;
Result.Widgets = PushArray(Permanent, ui_widget, Result.WidgetsCountMax);
Result.PerFrameMemory = PushStruct(Permanent, gs_memory_arena);
*Result.PerFrameMemory = CreateMemoryArena(Context.ThreadContext.Allocator);
*Result.PerFrameMemory = CreateMemoryArena(Context.ThreadContext.Allocator, "Interface Per Frame Memory Arena", KB(32));
InterfaceAssert(Result.PerFrameMemory);
Result.Permanent = Permanent;

View File

@ -90,7 +90,7 @@ FileView_Init(panel* Panel, app_state* State, context Context)
// TODO: :FreePanelMemory
file_view_state* FileViewState = PushStruct(&State->Permanent, file_view_state);
Panel->StateMemory = StructToData(FileViewState, file_view_state);
FileViewState->FileNamesArena = CreateMemoryArena(Context.ThreadContext.Allocator);
FileViewState->FileNamesArena = CreateMemoryArena(Context.ThreadContext.Allocator, "File View - File Names Arena");
// TODO(pjs): this shouldn't be stored in permanent
FileViewState->DisplayDirectory = PushString(&State->Permanent, 1024);

View File

@ -163,6 +163,44 @@ RenderProfiler_ListVisualization(ui_interface* Interface, ui_widget* Layout, deb
ui_EndList(Interface);
}
// Renders the "Memory" tab of the profiler panel: a headline with the
// allocator's total footprint and allocation count, followed by a scrollable
// two-column list (source location | size in bytes) of recorded allocations.
//
// Reads the gs_allocator_debug record installed by the platform layer
// (see WinMain, which allocates it and points ThreadContext.Allocator.Debug
// at it). TempString comes from State->Transient, so it lives for this frame
// only.
internal void
RenderProfiler_MemoryView(ui_interface* Interface, ui_widget* Layout, app_state* State, context Context, gs_memory_arena* Memory)
{
    // Robustness: Debug is a pointer the platform layer may not have set up
    // (only the win32 layer visible here installs it). Bail rather than
    // dereference null.
    if (Context.ThreadContext.Allocator.Debug == 0) return;
    gs_allocator_debug Debug = *Context.ThreadContext.Allocator.Debug;
    
    gs_string TempString = PushString(State->Transient, 256);
    
    u64 MemFootprint = Debug.TotalAllocSize;
    u64 AllocCount = Debug.AllocationsCount;
    PrintF(&TempString, "Total Memory Size: %lld | Allocations: %lld", MemFootprint, AllocCount);
    ui_Label(Interface, TempString);
    
    ui_column_spec ColumnWidths[] = {
        { UIColumnSize_Fill, 0 },
        { UIColumnSize_Fixed,256 },
    };
    ui_BeginRow(Interface, 2, &ColumnWidths[0]);
    {
        ui_Label(Interface, MakeString("Location"));
        ui_Label(Interface, MakeString("Alloc Size"));
    }
    ui_EndRow(Interface);
    
    ui_BeginList(Interface, MakeString("Alloc List"), 10, Debug.AllocationsCount);
    ui_BeginRow(Interface, 2, &ColumnWidths[0]);
    // FIX: loop index widened from s32 to u64 to match AllocationsCount -
    // avoids a signed/unsigned comparison and truncation for large counts.
    for (u64 n = 0; n < Debug.AllocationsCount; n++)
    {
        gs_debug_allocation A = Debug.Allocations[n];
        
        PrintF(&TempString, "%S", A.Location);
        ui_Label(Interface, TempString);
        
        PrintF(&TempString, "%lld bytes", A.Size);
        ui_Label(Interface, TempString);
    }
    ui_EndRow(Interface);
    ui_EndList(Interface);
}
GSMetaTag(panel_render);
GSMetaTag(panel_type_profiler);
internal void
@ -234,24 +272,39 @@ ProfilerView_Render(panel* Panel, rect2 PanelBounds, render_command_buffer* Rend
ui_BeginRow(&State->Interface, 8);
{
if (ui_Button(&State->Interface, MakeString("Scope View")))
if (ui_Button(&State->Interface, MakeString("Profiler")))
{
GlobalDebugServices->Interface.FrameView = FRAME_VIEW_PROFILER;
GlobalDebugServices->Interface.FrameView = DebugUI_Profiler;
}
if (ui_Button(&State->Interface, MakeString("List View")))
{
GlobalDebugServices->Interface.FrameView = FRAME_VIEW_SCOPE_LIST;
GlobalDebugServices->Interface.FrameView = DebugUI_ScopeList;
}
if (ui_Button(&State->Interface, MakeString("Memory")))
{
GlobalDebugServices->Interface.FrameView = DebugUI_MemoryView;
}
}
ui_EndRow(&State->Interface);
if (GlobalDebugServices->Interface.FrameView == FRAME_VIEW_PROFILER)
switch (GlobalDebugServices->Interface.FrameView)
{
RenderProfiler_ScopeVisualization(&State->Interface, Layout, VisibleFrame, Memory);
}
else
{
RenderProfiler_ListVisualization(&State->Interface, Layout, VisibleFrame, Memory);
case DebugUI_Profiler:
{
RenderProfiler_ScopeVisualization(&State->Interface, Layout, VisibleFrame, Memory);
}break;
case DebugUI_ScopeList:
{
RenderProfiler_ListVisualization(&State->Interface, Layout, VisibleFrame, Memory);
}break;
case DebugUI_MemoryView:
{
RenderProfiler_MemoryView(&State->Interface, Layout, State, Context, Memory);
}break;
InvalidDefaultCase;
}
ui_PopLayout(&State->Interface, MakeString("Profiler Layout"));

View File

@ -206,7 +206,7 @@ LoadAssembly (assembly_array* Assemblies, led_system* LedSystem, gs_memory_arena
gs_const_string FileName = Substring(Path, IndexOfLastSlash + 1, Path.Length);
assembly* NewAssembly = AssemblyArray_Take(Assemblies);
NewAssembly->Arena = CreateMemoryArena(Context.ThreadContext.Allocator);
NewAssembly->Arena = CreateMemoryArena(Context.ThreadContext.Allocator, "Assembly Arena");
parser AssemblyParser = ParseAssemblyFile(NewAssembly, FileName, AssemblyFileText, Scratch);
if (AssemblyParser.Success)

View File

@ -100,7 +100,7 @@ AddressedDataBufferList_Create(gs_thread_context TC)
{
addressed_data_buffer_list Result = {};
Result.Arena = AllocatorAllocStruct(TC.Allocator, gs_memory_arena);
*Result.Arena = CreateMemoryArena(TC.Allocator);
*Result.Arena = CreateMemoryArena(TC.Allocator, "Addressed Data Buffer List Arena");
return Result;
}

View File

@ -24,7 +24,7 @@ INITIALIZE_APPLICATION(InitializeApplication)
app_state* State = (app_state*)Context.MemoryBase;
*State = {};
State->Permanent = CreateMemoryArena(Context.ThreadContext.Allocator);
State->Permanent = CreateMemoryArena(Context.ThreadContext.Allocator, "Permanent");
State->Transient = Context.ThreadContext.Transient;
State->Assemblies = AssemblyArray_Create(8, &State->Permanent);
@ -91,6 +91,8 @@ INITIALIZE_APPLICATION(InitializeApplication)
Panel_SetType(Hierarchy, &State->PanelSystem, PanelType_AssemblyDebug, State, Context);
}
State->RunEditor = true;
}
UPDATE_AND_RENDER(UpdateAndRender)
@ -104,7 +106,10 @@ UPDATE_AND_RENDER(UpdateAndRender)
// incorrect to clear the arena, and then access the memory later.
ClearArena(State->Transient);
Editor_Update(State, Context, InputQueue);
if (State->RunEditor)
{
Editor_Update(State, Context, InputQueue);
}
AnimationSystem_Update(&State->AnimationSystem, Context->DeltaTime);
if (AnimationSystem_NeedsRender(State->AnimationSystem))
@ -123,7 +128,10 @@ UPDATE_AND_RENDER(UpdateAndRender)
State->Assemblies,
State->LedSystem);
Editor_Render(State, Context, RenderBuffer);
if (State->RunEditor)
{
Editor_Render(State, Context, RenderBuffer);
}
// NOTE(pjs): Building data buffers to be sent out to the sculpture
// This array is used on the platform side to actually send the information

View File

@ -13,7 +13,7 @@
#include "../gs_libs/gs_font.h"
#include "foldhaus_log.h"
#include "interface.h"
#include "editor/interface.h"
#include "engine/foldhaus_network_ordering.h"
@ -42,7 +42,7 @@ typedef struct panel panel;
#include "engine/animation/foldhaus_animation_renderer.cpp"
#include "engine/user_space.h"
#include "blumen_lumen.h"
#include "ss_blumen_lumen/blumen_lumen.h"
struct app_state
{
@ -72,6 +72,8 @@ struct app_state
panel* HotPanel;
user_space_desc UserSpaceDesc;
bool RunEditor;
};
internal void OpenColorPicker(app_state* State, v4* Address);
@ -81,7 +83,7 @@ internal void OpenColorPicker(app_state* State, v4* Address);
#include "engine/user_space.cpp"
#include "patterns/blumen_patterns.h"
#include "blumen_lumen.cpp"
#include "ss_blumen_lumen/blumen_lumen.cpp"
internal void
EndCurrentOperationMode(app_state* State)

View File

@ -64,8 +64,14 @@ struct debug_frame
collated_scope_record* CollatedScopes;
};
#define FRAME_VIEW_PROFILER 0
#define FRAME_VIEW_SCOPE_LIST 1
// Which visualization the profiler panel is showing. Stored in
// debug_interface (GlobalDebugServices->Interface.FrameView) and switched by
// the row of buttons in ProfilerView_Render. Replaces the old
// FRAME_VIEW_PROFILER / FRAME_VIEW_SCOPE_LIST #defines.
enum debug_ui_view
{
    DebugUI_Profiler,   // scope timing visualization (RenderProfiler_ScopeVisualization)
    DebugUI_ScopeList,  // flat list of collated scopes (RenderProfiler_ListVisualization)
    DebugUI_MemoryView, // allocator footprint + per-allocation list (RenderProfiler_MemoryView)
    DebugUI_Count,      // number of views; not a valid selection
};
struct debug_interface
{

File diff suppressed because it is too large Load Diff

View File

@ -535,6 +535,12 @@ WinMain (
{
gs_thread_context ThreadContext = Win32CreateThreadContext();
gs_allocator_debug AllocDebug = {};
AllocDebug.AllocationsCountMax = 4096;
AllocDebug.Allocations = (gs_debug_allocation*)Win32Alloc(sizeof(gs_debug_allocation) * AllocDebug.AllocationsCountMax, 0);
ThreadContext.Allocator.Debug = &AllocDebug;
gs_file_info A = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium"));
gs_file_info B = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium\\"));
@ -556,7 +562,7 @@ WinMain (
Context.MemorySize = MB(64);
Context.MemoryBase = (u8*)Win32Alloc(Context.MemorySize, 0);
gs_memory_arena PlatformPermanent = CreateMemoryArena(Context.ThreadContext.Allocator);
gs_memory_arena PlatformPermanent = CreateMemoryArena(Context.ThreadContext.Allocator, "Platform Memory");
s64 PerformanceCountFrequency = GetPerformanceFrequency();
s64 LastFrameEnd = GetWallClock();

View File

@ -204,9 +204,13 @@ Win32SerialArray_Create(gs_thread_context Context)
Win32SerialPortNames = AllocatorAllocArray(Context.Allocator, gs_string, Win32SerialHandlesCountMax);
Win32SerialPortFilled = AllocatorAllocArray(Context.Allocator, s32, Win32SerialHandlesCountMax);
u64 PortNameSize = 256;
u64 PortNameBufferSize = PortNameSize * Win32SerialHandlesCountMax;
char* PortNameBuffer = AllocatorAllocArray(Context.Allocator, char, PortNameBufferSize);
for (u32 i = 0; i < Win32SerialHandlesCountMax; i++)
{
Win32SerialPortNames[i] = AllocatorAllocString(Context.Allocator, 256);
char* NameBase = PortNameBuffer + (PortNameSize * i);
Win32SerialPortNames[i] = MakeString(NameBase, 0, PortNameSize);
Win32SerialPortFilled[i] = 0;
}
}

View File

@ -48,7 +48,7 @@ Win32CreateThreadContext(gs_memory_arena* Transient = 0)
else
{
Result.Transient = (gs_memory_arena*)AllocatorAlloc(Result.Allocator, sizeof(gs_memory_arena)).Memory;
*Result.Transient = CreateMemoryArena(Result.Allocator);
*Result.Transient = CreateMemoryArena(Result.Allocator, "Tctx Transient");
}
Result.FileHandler = CreateFileHandler(Win32GetFileInfo,
Win32ReadEntireFile,

View File

@ -24,10 +24,8 @@ BlumenLumen_MicListenJob(gs_thread_context* Ctx, u8* UserData)
while (*Data->Running)
{
#if 1
if (SocketQueryStatus(Data->SocketManager, Data->ListenSocket))
{
// TODO(pjs): Removing this block for now - nothing is wrong with it except that SocketPeek is still blocking for some reason
if (SocketPeek(Data->SocketManager, Data->ListenSocket))
{
// TODO(pjs): Make this a peek operation
@ -41,7 +39,6 @@ BlumenLumen_MicListenJob(gs_thread_context* Ctx, u8* UserData)
}
}
}
#endif
while (Data->OutgoingMsgQueue->ReadHead != Data->OutgoingMsgQueue->WriteHead)
{

View File

@ -1,711 +0,0 @@
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
The default is to use the SSE1 version. If you define USE_SSE2 the
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
not expect any significant performance improvement with SSE2.
*/
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#include <xmmintrin.h>
/* yes I know, the top of this file is quite ugly */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)
#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
#ifndef USE_SSE2
/* Helper for the SSE1+MMX code path: a union used to shuttle bits between
   one 128-bit XMM register and a pair of 64-bit MMX registers, since SSE1
   alone has no integer vector operations. */
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

/* Split an __m128 into two __m64 halves (low half -> mm0_, high -> mm1_). */
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_; \
    mm0_ = u.mm[0]; \
    mm1_ = u.mm[1]; \
}

/* Recombine two __m64 halves into one __m128. */
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
#endif // USE_SSE2
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
/* log_ps: natural log of 4 floats at once (cephes-style algorithm).
   Decomposes x into mantissa and exponent via bit manipulation of the
   IEEE-754 representation, evaluates a degree-8 polynomial on the mantissa,
   then adds e*ln(2). Inputs <= 0 produce NaN via invalid_mask. */
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  /* shift out the 23 mantissa bits, leaving the biased exponent */
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  /* remove the exponent bias (0x7f) and convert to float */
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  /* Horner evaluation of the degree-8 log polynomial in x */
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  /* correction terms: + e*q1 - z/2 + e*q2 reconstruct log(x) = poly + e*ln(2) */
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
/* exp_ps: e^x for 4 floats at once (cephes-style algorithm).
   Clamps x to the representable range, splits exp(x) as exp(g + n*log(2)),
   evaluates a degree-5 polynomial for exp(g), and builds 2^n by stuffing
   n into the IEEE-754 exponent field. */
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* clamp to avoid overflow/underflow of the float result */
  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 (completes the floor: truncation rounded toward zero) */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  /* reduce x by n*log(2), split into two constants for extra precision */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  /* Horner evaluation of the degree-5 exp polynomial */
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n by writing n+127 into the exponent bits */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
it runs also on old athlons XPs and the pentium III of your grand
mother.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.
Performance is also surprisingly good, 1.33 times faster than the
macos vsinf SSE2 function, and 1.5 times faster than the
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
too bad for an SSE1 function (with no special tuning) !
However the latter libraries probably have a much better handling of NaN,
Inf, denormalized and other special arguments..
On my core 1 duo, the execution of this function takes approximately 95 cycles.
From what I have observed on the experiments with Intel AMath lib, switching to an
SSE2 version would improve the perf by only 10%.
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.
*/
/* sin_ps: sine of 4 floats at once (exact rewrite of cephes sinf).
   Range-reduces |x| to [0, Pi/4] using the quadrant index j = round(x*4/Pi),
   evaluates both the sine and cosine minimax polynomials, then blends the
   two results per lane with poly_mask and restores the sign. */
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- the cosine polynomial */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- the sine polynomial */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
/* almost the same as sin_ps */
/* cos_ps: cosine of 4 floats at once; same structure as sin_ps except the
   quadrant index is shifted by 2 (cos(x) = sin(x + Pi/2)) and the input's
   sign is discarded (cosine is even). */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* shift the quadrant index by 2 for the cosine phase offset */
  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- the cosine polynomial */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- the sine polynomial */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
/* sincos_ps: sine and cosine of 4 floats in one pass.
   Shares the range reduction and both polynomial evaluations between the
   two results, so it costs barely more than sin_ps alone. Writes sine to
   *s and cosine to *c. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2; /* keep the quadrant index; the cosine sign needs it later */

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2; /* keep the quadrant index; the cosine sign needs it later */
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- the cosine polynomial */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- the sine polynomial */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

View File

@ -1,360 +0,0 @@
/*
sse_mathfun_extension.h - zlib license
Written by Tolga Mizrak 2016
Extension of sse_mathfun.h, which is written by Julien Pommier
Based on the corresponding algorithms of the cephes math library
This is written as an extension to sse_mathfun.h instead of modifying it, just because I didn't want
to maintain a modified version of the original library. This way switching to a newer version of the
library won't be a hassle.
Note that non SSE2 implementations of tan_ps, atan_ps, cot_ps and atan2_ps are not implemented yet.
As such, currently you need to #define USE_SSE2 to compile.
With tan_ps, cot_ps you get good precision on input ranges that are further away from the domain
borders (-PI/2, PI/2 for tan and 0, 1 for cot). See the results on the deviations for these
functions on my machine:
checking tan on [-0.25*Pi, 0.25*Pi]
max deviation from tanf(x): 1.19209e-07 at 0.250000006957*Pi, max deviation from cephes_tan(x):
5.96046e-08
->> precision OK for the tan_ps <<-
checking tan on [-0.49*Pi, 0.49*Pi]
max deviation from tanf(x): 3.8147e-06 at -0.490000009841*Pi, max deviation from cephes_tan(x):
9.53674e-07
->> precision OK for the tan_ps <<-
checking cot on [0.2*Pi, 0.7*Pi]
max deviation from cotf(x): 1.19209e-07 at 0.204303119606*Pi, max deviation from cephes_cot(x):
1.19209e-07
->> precision OK for the cot_ps <<-
checking cot on [0.01*Pi, 0.99*Pi]
max deviation from cotf(x): 3.8147e-06 at 0.987876517942*Pi, max deviation from cephes_cot(x):
9.53674e-07
->> precision OK for the cot_ps <<-
With atan_ps and atan2_ps you get pretty good precision, atan_ps max deviation is < 2e-7 and
atan2_ps max deviation is < 2.5e-7
*/
/* Copyright (C) 2016 Tolga Mizrak
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#pragma once
#ifndef _SSE_MATHFUN_EXTENSION_H_INCLUDED_
#define _SSE_MATHFUN_EXTENSION_H_INCLUDED_
#ifndef USE_SSE2
#error sse1 & mmx version not implemented
#endif
#ifdef _MSC_VER
#pragma warning( push )
/* warning C4838: conversion from 'double' to 'const float' requires a narrowing conversion */
#pragma warning( disable : 4838 )
/* warning C4305: 'initializing': truncation from 'double' to 'const float' */
#pragma warning( disable : 4305 )
#endif
#include "sse_mathfun.h"
_PS_CONST( 0, 0 );
_PS_CONST( 2, 2 );
_PI32_CONST( neg1, 1 );
_PS_CONST( tancof_p0, 9.38540185543E-3 );
_PS_CONST( tancof_p1, 3.11992232697E-3 );
_PS_CONST( tancof_p2, 2.44301354525E-2 );
_PS_CONST( tancof_p3, 5.34112807005E-2 );
_PS_CONST( tancof_p4, 1.33387994085E-1 );
_PS_CONST( tancof_p5, 3.33331568548E-1 );
_PS_CONST( tancot_eps, 1.0e-4 );
/* Shared kernel for tan_ps / cot_ps (cephes tanf/cotf ported to SSE2).
   Computes tan(x) per lane when cotFlag == 0, and cot(x) per lane when
   cotFlag != 0. Pipeline: strip the sign, scale by 4/pi and round to get
   the octant, reduce the argument with the three-part "extended precision
   modular arithmetic" pass, evaluate a polynomial in z^2 (Horner form),
   then select per lane between P(z) and -1/P(z) based on the octant
   (poly_mask), finally reapplying the original sign.
   NOTE(review): the scalar cephes version special-cases |x| < tancot_eps
   (declared above) — this SIMD port does not appear to; confirm behavior
   near zero before relying on it there. */
v4sf tancot_ps( v4sf x, int cotFlag )
{
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
v4si emm2;
#else
#endif
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps( sign_bit, *(v4sf*)_ps_sign_mask );
/* scale by 4/Pi */
y = _mm_mul_ps( x, *(v4sf*)_ps_cephes_FOPI );
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32( y );
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32( emm2, *(v4si*)_pi32_1 );
emm2 = _mm_and_si128( emm2, *(v4si*)_pi32_inv1 );
y = _mm_cvtepi32_ps( emm2 );
/* poly_mask: per-lane selector derived from bit 1 of the rounded
   quotient; it decides which of the two result forms (polynomial vs
   negated reciprocal) each lane takes in the select at the bottom */
emm2 = _mm_and_si128( emm2, *(v4si*)_pi32_2 );
emm2 = _mm_cmpeq_epi32( emm2, _mm_setzero_si128() );
v4sf poly_mask = _mm_castsi128_ps( emm2 );
#else
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps( y, xmm1 );
xmm2 = _mm_mul_ps( y, xmm2 );
xmm3 = _mm_mul_ps( y, xmm3 );
v4sf z = _mm_add_ps( x, xmm1 );
z = _mm_add_ps( z, xmm2 );
z = _mm_add_ps( z, xmm3 );
v4sf zz = _mm_mul_ps( z, z );
/* polynomial approximation on the reduced argument, Horner form:
   y = z + z * zz * P(zz) with the tancof_p* coefficients above */
y = *(v4sf*)_ps_tancof_p0;
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p1 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p2 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p3 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p4 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p5 );
y = _mm_mul_ps( y, zz );
y = _mm_mul_ps( y, z );
y = _mm_add_ps( y, z );
/* build the second candidate result (the negated reciprocal); for cot
   the roles of y and y2 are swapped relative to tan */
v4sf y2;
if( cotFlag ) {
y2 = _mm_xor_ps( y, *(v4sf*)_ps_sign_mask );
/* y = _mm_rcp_ps( y ); */
/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
y = _mm_div_ps( *(v4sf*)_ps_1, y );
} else {
/* y2 = _mm_rcp_ps( y ); */
/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
y2 = _mm_div_ps( *(v4sf*)_ps_1, y );
y2 = _mm_xor_ps( y2, *(v4sf*)_ps_sign_mask );
}
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y = _mm_and_ps( xmm3, y );
y2 = _mm_andnot_ps( xmm3, y2 );
y = _mm_or_ps( y, y2 );
/* update the sign */
y = _mm_xor_ps( y, sign_bit );
return y;
}
/* Per-lane tangent: thin wrapper over the shared tan/cot kernel above. */
v4sf tan_ps( v4sf x )
{
return tancot_ps( x, 0 );
}
/* Per-lane cotangent: thin wrapper over the shared tan/cot kernel above. */
v4sf cot_ps( v4sf x )
{
return tancot_ps( x, 1 );
}
_PS_CONST( atanrange_hi, 2.414213562373095 );
_PS_CONST( atanrange_lo, 0.4142135623730950 );
const float PIF = 3.141592653589793238;
const float PIO2F = 1.5707963267948966192;
_PS_CONST( cephes_PIF, 3.141592653589793238 );
_PS_CONST( cephes_PIO2F, 1.5707963267948966192 );
_PS_CONST( cephes_PIO4F, 0.7853981633974483096 );
_PS_CONST( atancof_p0, 8.05374449538e-2 );
_PS_CONST( atancof_p1, 1.38776856032E-1 );
_PS_CONST( atancof_p2, 1.99777106478E-1 );
_PS_CONST( atancof_p3, 3.33329491539E-1 );
/* SIMD arctangent (cephes atanf ported to SSE2). Computes atan(x) per
   lane: the sign is stripped up front and reapplied at the end; |x| is
   range-reduced branch-free using
       atan(x) = pi/2 - atan(1/x)           for x > atanrange_hi
       atan(x) = pi/4 + atan((x-1)/(x+1))   for atanrange_lo < x <= atanrange_hi
   then a polynomial in x^2 is evaluated in Horner form with the
   atancof_p* coefficients. Max deviation from atanf is < 2e-7 per the
   file header above. */
v4sf atan_ps( v4sf x )
{
v4sf sign_bit, y;
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps( sign_bit, *(v4sf*)_ps_sign_mask );
/* range reduction, init x and y depending on range */
#ifdef USE_SSE2
/* x > 2.414213562373095 */
v4sf cmp0 = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_hi );
/* x > 0.4142135623730950 */
v4sf cmp1 = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_lo );
/* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
v4sf cmp2 = _mm_andnot_ps( cmp0, cmp1 );
/* y0/y1 are the additive constants (pi/2, pi/4) masked to the lanes
   each reduction applies to; x0/x1 are the corresponding reduced
   arguments. The and/or sequence below merges them with the
   unreduced lanes. */
/* -( 1.0/x ) */
v4sf y0 = _mm_and_ps( cmp0, *(v4sf*)_ps_cephes_PIO2F );
v4sf x0 = _mm_div_ps( *(v4sf*)_ps_1, x );
x0 = _mm_xor_ps( x0, *(v4sf*)_ps_sign_mask );
v4sf y1 = _mm_and_ps( cmp2, *(v4sf*)_ps_cephes_PIO4F );
/* (x-1.0)/(x+1.0) */
v4sf x1_o = _mm_sub_ps( x, *(v4sf*)_ps_1 );
v4sf x1_u = _mm_add_ps( x, *(v4sf*)_ps_1 );
v4sf x1 = _mm_div_ps( x1_o, x1_u );
v4sf x2 = _mm_and_ps( cmp2, x1 );
x0 = _mm_and_ps( cmp0, x0 );
x2 = _mm_or_ps( x2, x0 );
cmp1 = _mm_or_ps( cmp0, cmp2 );
x2 = _mm_and_ps( cmp1, x2 );
x = _mm_andnot_ps( cmp1, x );
x = _mm_or_ps( x2, x );
y = _mm_or_ps( y0, y1 );
#else
#error sse1 & mmx version not implemented
#endif
/* polynomial in zz = x^2, Horner form; signs alternate (+p0, -p1,
   +p2, -p3) per the cephes atanf series */
v4sf zz = _mm_mul_ps( x, x );
v4sf acc = *(v4sf*)_ps_atancof_p0;
acc = _mm_mul_ps( acc, zz );
acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p1 );
acc = _mm_mul_ps( acc, zz );
acc = _mm_add_ps( acc, *(v4sf*)_ps_atancof_p2 );
acc = _mm_mul_ps( acc, zz );
acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p3 );
acc = _mm_mul_ps( acc, zz );
acc = _mm_mul_ps( acc, x );
acc = _mm_add_ps( acc, x );
y = _mm_add_ps( y, acc );
/* update the sign */
y = _mm_xor_ps( y, sign_bit );
return y;
}
/* SIMD atan2: per-lane atan2(y, x) built from atan_ps plus explicit
   branch-free edge-case masks (mirrors atan2_ref below):
     x == 0 && y == 0   -> 0
     y == 0 && x >  0   -> 0
     x == 0 && y != 0   -> +/- pi/2 (sign follows y)
     y == 0 && x <= 0   -> pi
     otherwise          -> atan(y/x), shifted by +/- pi when x < 0
   The partial results are masked and OR'd together at the end.
   NOTE(review): at the origin, y/x is 0/0 = NaN and atan_ps propagates
   it into the merged result (zero_mask only gates pio2_result), so the
   origin lane is not actually forced to 0 — confirm whether callers
   depend on this before changing it. Preserved as-is here. */
v4sf atan2_ps( v4sf y, v4sf x )
{
v4sf x_eq_0 = _mm_cmpeq_ps( x, *(v4sf*)_ps_0 );
v4sf x_gt_0 = _mm_cmpgt_ps( x, *(v4sf*)_ps_0 );
v4sf x_le_0 = _mm_cmple_ps( x, *(v4sf*)_ps_0 );
v4sf y_eq_0 = _mm_cmpeq_ps( y, *(v4sf*)_ps_0 );
v4sf x_lt_0 = _mm_cmplt_ps( x, *(v4sf*)_ps_0 );
v4sf y_lt_0 = _mm_cmplt_ps( y, *(v4sf*)_ps_0 );
/* lanes whose result is exactly zero: the origin, or y == 0 with x > 0 */
v4sf zero_mask = _mm_and_ps( x_eq_0, y_eq_0 );
v4sf zero_mask_other_case = _mm_and_ps( y_eq_0, x_gt_0 );
zero_mask = _mm_or_ps( zero_mask, zero_mask_other_case );
/* lanes on the y axis (x == 0, y != 0): +/- pi/2 with the sign of y */
v4sf pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 );
v4sf pio2_mask_sign = _mm_and_ps( y_lt_0, *(v4sf*)_ps_sign_mask );
v4sf pio2_result = *(v4sf*)_ps_cephes_PIO2F;
pio2_result = _mm_xor_ps( pio2_result, pio2_mask_sign );
pio2_result = _mm_and_ps( pio2_mask, pio2_result );
/* lanes on the negative x axis (y == 0, x <= 0): pi */
v4sf pi_mask = _mm_and_ps( y_eq_0, x_le_0 );
v4sf pi = *(v4sf*)_ps_cephes_PIF;
v4sf pi_result = _mm_and_ps( pi_mask, pi );
/* general case: atan(y/x) needs a +/- pi correction in the left
   half-plane; the correction is negated when y < 0 so results land in
   (-pi, pi] */
v4sf swap_sign_mask_offset = _mm_and_ps( x_lt_0, y_lt_0 );
swap_sign_mask_offset = _mm_and_ps( swap_sign_mask_offset, *(v4sf*)_ps_sign_mask );
v4sf offset1 = *(v4sf*)_ps_cephes_PIF;
offset1 = _mm_xor_ps( offset1, swap_sign_mask_offset );
/* the offset is zero wherever x >= 0, so a single masked AND is the
   whole select (the previous andnot-with-zero into `offset` followed by
   an unconditional overwrite was dead code and has been removed) */
v4sf offset = _mm_and_ps( x_lt_0, offset1 );
v4sf arg = _mm_div_ps( y, x );
v4sf atan_result = atan_ps( arg );
atan_result = _mm_add_ps( atan_result, offset );
/* merge: clear the special-case lanes out of the atan result, then OR
   the masked special-case values back in (duplicate andnot removed) */
v4sf result = _mm_andnot_ps( zero_mask, pio2_result );
atan_result = _mm_andnot_ps( pio2_mask, atan_result );
result = _mm_or_ps( result, atan_result );
result = _mm_or_ps( result, pi_result );
return result;
}
/* Scalar convenience wrapper around the SIMD square root: splats x into
   all four lanes, runs the hardware sqrt, and reads lane 0 back out. */
float sqrt_ps( float x )
{
__m128 packed = _mm_sqrt_ps( _mm_set_ps1( x ) );
return _mm_cvtss_f32( packed );
}
/* Scalar convenience wrapper around the SIMD reciprocal square root
   approximation: splats x into all four lanes, runs the hardware
   rsqrt estimate, and reads lane 0 back out. The result is an
   approximation, not a correctly-rounded 1/sqrt(x). */
float rsqrt_ps( float x )
{
__m128 packed = _mm_rsqrt_ps( _mm_set_ps1( x ) );
return _mm_cvtss_f32( packed );
}
/* atan2 implementation using atan, used as a reference to implement atan2_ps */
float atan2_ref( float y, float x )
{
if( x == 0.0f ) {
if( y == 0.0f ) {
return 0.0f;
}
float result = _ps_cephes_PIO2F[0];
if( y < 0.0f ) {
result = -result;
}
return result;
}
if( y == 0.0f ) {
if( x > 0.0f ) {
return 0.0f;
}
return PIF;
}
float offset = 0;
if( x < 0.0f ) {
offset = PIF;
if( y < 0.0f ) {
offset = -offset;
}
}
v4sf val = _mm_set_ps1( y / x );
val = atan_ps( val );
return offset + _mm_cvtss_f32( val );
}
#ifdef _MSC_VER
#pragma warning( pop )
#endif
#endif

View File

@ -2424,12 +2424,37 @@ CreateAllocator_(allocator_allocate* Alloc, allocator_free* Free)
}
#define CreateAllocator(a, f) CreateAllocator_((allocator_allocate*)(a), (allocator_free*)(f))
// Records one allocation in the allocator's debug info for memory
// profiling: appends a (location, size) entry while there is room in the
// fixed-size Allocations array, and always accumulates TotalAllocSize.
// Location is the caller's file:line:function string (see
// FileNameAndLineNumberString); only the part after the last path
// separator is stored.
internal void
AllocatorDebug_PushAlloc(gs_allocator_debug* Debug, u64 Size, char* Location)
{
// NOTE(pjs): I don't want this debug procedure to be the reason the
// application crashes.
// Hence: once the Allocations array is full, silently stop recording
// entries (but keep counting bytes) rather than asserting or growing.
if (Debug->AllocationsCount < Debug->AllocationsCountMax)
{
gs_debug_allocation Allocation = {};
// Trim the path down to just the trailing file name: keep what
// follows the last forward- or back-slash.
gs_const_string L = ConstString(Location);
s64 LastSlash = FindLastFromSet(L, "\\/");
if (LastSlash < 0) LastSlash = 0;
Allocation.Location = GetStringAfter(L, LastSlash);
Allocation.Size = Size;
Debug->Allocations[Debug->AllocationsCount++] = Allocation;
}
// TotalAllocSize counts every allocation, including ones dropped above.
Debug->TotalAllocSize += Size;
}
// Allocates Size bytes through the allocator's Alloc callback and wraps
// the result in a gs_data. SizeResult receives the size the backend
// actually granted (which is what the returned gs_data reports — it may
// differ from the requested Size). When a Debug struct is attached to
// the allocator, the allocation is recorded for memory profiling,
// attributed to Location (the caller's file:line:function string).
internal gs_data
AllocatorAlloc_(gs_allocator Allocator, u64 Size, char* Location)
{
u64 SizeResult = 0;
void* Memory = Allocator.Alloc(Size, &SizeResult);
if (Allocator.Debug)
{
AllocatorDebug_PushAlloc(Allocator.Debug, Size, Location);
}
return CreateData((u8*)Memory, SizeResult);
}
internal void
@ -2439,6 +2464,13 @@ AllocatorFree_(gs_allocator Allocator, void* Base, u64 Size, char* Location)
if (Base != 0 && Size != 0)
{
Allocator.Free(Base, Size);
if (Allocator.Debug)
{
// NOTE(pjs): There's no reason we should be going negative
// ie. Freeing more memory than we allocated
Assert(Allocator.Debug->TotalAllocSize >= Size);
Allocator.Debug->TotalAllocSize -= Size;
}
}
}
@ -2526,30 +2558,37 @@ FreeCursorListEntry(gs_allocator Allocator, gs_memory_cursor_list* CursorEntry)
}
internal gs_memory_arena
CreateMemoryArena_(arena_type ArenaType, gs_allocator Allocator, u64 ChunkSize, u64 Alignment, gs_memory_arena* ParentArena)
CreateMemoryArena_(arena_type ArenaType, gs_allocator Allocator, u64 ChunkSize, u64 Alignment, gs_memory_arena* ParentArena, char* Name)
{
// we only want a parent arena if the type is Arena_SubArena
Assert(((ArenaType == Arena_BaseArena) && (ParentArena == 0)) ||
((ArenaType == Arena_SubArena) && (ParentArena != 0)));
gs_memory_arena Arena = {};
Arena.ArenaName = Name;
Arena.Type = ArenaType;
Arena.Allocator = Allocator;
Arena.Parent = ParentArena;
#if MEMORY_CURSOR_STATIC_ARRAY
Arena.CursorsCountMax = 4096;
Arena.Cursors = AllocatorAllocArray(Allocator, gs_memory_cursor_list, Arena.CursorsCountMax);
#endif
Arena.MemoryChunkSize = ChunkSize;
Arena.MemoryAlignment = Alignment;
return Arena;
}
internal gs_memory_arena
CreateMemoryArena(gs_allocator Allocator, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
CreateMemoryArena(gs_allocator Allocator, char* Name, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
{
return CreateMemoryArena_(Arena_BaseArena, Allocator, ChunkSize, Alignment, 0);
return CreateMemoryArena_(Arena_BaseArena, Allocator, ChunkSize, Alignment, 0, Name);
}
internal gs_memory_arena
CreateMemorySubArena(gs_memory_arena* Parent, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
CreateMemorySubArena(gs_memory_arena* Parent, char* Name, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
{
return CreateMemoryArena_(Arena_SubArena, Parent->Allocator, ChunkSize, Alignment, Parent);
return CreateMemoryArena_(Arena_SubArena, Parent->Allocator, ChunkSize, Alignment, Parent, Name);
}
internal gs_data PushSize_(gs_memory_arena* Arena, u64 Size, char* Location);
@ -2557,6 +2596,7 @@ internal gs_data PushSize_(gs_memory_arena* Arena, u64 Size, char* Location);
internal void
FreeCursorList(gs_memory_cursor_list* List, gs_allocator Allocator)
{
#if !MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* CursorAt = List;
while (CursorAt != 0)
{
@ -2564,13 +2604,18 @@ FreeCursorList(gs_memory_cursor_list* List, gs_allocator Allocator)
FreeCursorListEntry(Allocator, CursorAt);
CursorAt = Prev;
}
#endif
}
internal gs_memory_cursor_list*
MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
{
#if MEMORY_CURSOR_STATIC_ARRAY
u64 AllocSize = Max(MinSize, Arena->MemoryChunkSize);
#else
// Allocate enough spcae for the minimum size needed + sizeo for the cursor list
u64 AllocSize = Max(MinSize, Arena->MemoryChunkSize) + sizeof(gs_memory_cursor_list);
#endif
gs_data Data = {0};
switch (Arena->Type)
@ -2588,6 +2633,11 @@ MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
InvalidDefaultCase;
}
#if MEMORY_CURSOR_STATIC_ARRAY
Assert(Arena->CursorsCount < Arena->CursorsCountMax);
gs_memory_cursor_list* Result = Arena->Cursors + Arena->CursorsCount++;
Result->Cursor = CreateMemoryCursor(Data.Memory, Data.Size);
#else
// Fit the memory cursor into the region allocated
Assert(MinSize + sizeof(gs_memory_cursor_list) <= Data.Size);
gs_memory_cursor_list* Result = (gs_memory_cursor_list*)Data.Memory;
@ -2599,9 +2649,14 @@ MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
Result->Next = 0;
if (Arena->CursorList != 0)
{
if (Arena->CursorList->Next != 0)
{
Result->Next = Arena->CursorList->Next;
}
Arena->CursorList->Next = Result;
}
Arena->CursorList = Result;
#endif
return Result;
}
@ -2611,6 +2666,27 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
gs_data Result = {0};
if (Size > 0)
{
#if MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* CursorEntry = 0;
for (u64 i = 0;
i < Arena->CursorsCount;
i++)
{
gs_memory_cursor_list* At = Arena->Cursors + i;
if (CursorHasRoom(At->Cursor, Size))
{
CursorEntry = At;
break;
}
}
if (!CursorEntry)
{
CursorEntry = MemoryArenaNewCursor(Arena, Size, Location);
}
Assert(CursorEntry);
Assert(CursorHasRoom(CursorEntry->Cursor, Size));
#else
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
if (CursorEntry == 0)
{
@ -2627,6 +2703,7 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
CursorEntry = MemoryArenaNewCursor(Arena, Size, Location);
}
}
#endif
Assert(CursorEntry != 0);
Result = PushSizeOnCursor_(&CursorEntry->Cursor, Size, Location);
Assert(Result.Memory != 0);
@ -2651,44 +2728,19 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
return Result;
}
internal void
PopSize(gs_memory_arena* Arena, u64 Size)
{
gs_allocator Allocator = Arena->Allocator;
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
for (gs_memory_cursor_list* Prev = 0;
CursorEntry != 0 && Size != 0;
CursorEntry = Prev)
{
Prev = CursorEntry->Prev;
if (Size >= CursorEntry->Cursor.Position)
{
Size -= CursorEntry->Cursor.Position;
FreeCursorListEntry(Allocator, CursorEntry);
}
else
{
PopSizeOnCursor(&CursorEntry->Cursor, Size);
break;
}
}
Arena->CursorList = CursorEntry;
}
internal void
FreeMemoryArena(gs_memory_arena* Arena)
{
gs_allocator Allocator = Arena->Allocator;
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
for (gs_memory_cursor_list* Prev = 0;
CursorEntry != 0;
CursorEntry = Prev)
#if MEMORY_CURSOR_STATIC_ARRAY
for (u32 i = 0; i < Arena->CursorsCount; i++)
{
Prev = CursorEntry->Prev;
if (CursorEntry != 0)
{
FreeCursorListEntry(Allocator, CursorEntry);
}
gs_memory_cursor_list E = Arena->Cursors[i];
AllocatorFree(Arena->Allocator, E.Cursor.Data.Memory, E.Cursor.Data.Size);
}
AllocatorFreeArray(Arena->Allocator, Arena->Cursors, gs_memory_cursor_list, Arena->CursorsCountMax);
#else
FreeCursorList(Arena->CursorList, Arena->Allocator);
#endif
}
#define PushSizeToData(arena, size) PushSize_((arena), (size), FileNameAndLineNumberString)
@ -2726,6 +2778,12 @@ PushStringCopy(gs_memory_arena* Arena, gs_const_string String)
internal void
ClearArena(gs_memory_arena* Arena)
{
#if MEMORY_CURSOR_STATIC_ARRAY
for (u32 i = 0; i < Arena->CursorsCount; i++)
{
Arena->Cursors[i].Cursor.Position = 0;
}
#else
gs_memory_cursor_list* First = 0;
for (gs_memory_cursor_list* CursorEntry = Arena->CursorList;
CursorEntry != 0;
@ -2735,12 +2793,13 @@ ClearArena(gs_memory_arena* Arena)
CursorEntry->Cursor.Position = 0;
}
Arena->CursorList = First;
#endif
}
internal void
FreeArena(gs_memory_arena* Arena)
{
FreeCursorList(Arena->CursorList, Arena->Allocator);
FreeMemoryArena(Arena);
}
///////////////////////////
@ -2789,14 +2848,14 @@ CreateDynarrayWithStorage(gs_memory_arena Storage, u32 ElementSize, u32 Elements
internal gs_dynarray
CreateDynarray_(gs_allocator Allocator, u32 ElementSize, u32 ElementsPerBuffer)
{
gs_memory_arena Storage = CreateMemoryArena(Allocator, ElementSize * ElementsPerBuffer);
gs_memory_arena Storage = CreateMemoryArena(Allocator, "Dynarray Arena", ElementSize * ElementsPerBuffer);
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
};
internal gs_dynarray
CreateDynarray_(gs_memory_arena* Arena, u32 ElementSize, u32 ElementsPerBuffer)
{
gs_memory_arena Storage = CreateMemorySubArena(Arena, ElementSize * ElementsPerBuffer);
gs_memory_arena Storage = CreateMemorySubArena(Arena, "Dynarray Sub Arena", ElementSize * ElementsPerBuffer);
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
};

View File

@ -247,7 +247,7 @@ enum { \
#define DontCompile ImAfraidICantDoThat
#define LineNumberString Stringify(__LINE__)
#define FileNameAndLineNumberString_ __FILE__ ":" LineNumberString ":"
#define FileNameAndLineNumberString_ __FILE__ ":" LineNumberString ":" __FUNCTION__
#define FileNameAndLineNumberString (char*)FileNameAndLineNumberString_
//
@ -633,10 +633,27 @@ typedef ALLOCATOR_ALLOC(allocator_allocate);
#define ALLOCATOR_FREE(name) void name(void* Ptr, u64 Size)
typedef ALLOCATOR_FREE(allocator_free);
struct gs_debug_allocation
{
gs_const_string Location;
u64 Size;
};
struct gs_allocator_debug
{
u64 TotalAllocSize;
u64 AllocationsCount;
u64 AllocationsCountMax;
gs_debug_allocation* Allocations;
};
struct gs_allocator
{
allocator_allocate* Alloc;
allocator_free* Free;
gs_allocator_debug* Debug;
};
struct gs_memory_cursor
@ -645,11 +662,26 @@ struct gs_memory_cursor
u64 Position;
};
/* TODO(pjs): Setting MEMORY_CURSOR_STATIC_ARRAY will still compile,
However, it introduces a bug that I haven't fully diagnosed.
The problem seems to occur when trying to push to a cleared memory arena
Where the FirstCursor doesn't have enough room for the allocation, but
also FirstCursor->Next points to a valid cursor. The new cursor is put
in the middle however we seem to continually keep allocating new
cursors forever and losing old ones.
The problem in Lumenarium is found in the OutputData structure
Leaving this in a simplified state for now
*/
#define MEMORY_CURSOR_STATIC_ARRAY 1
struct gs_memory_cursor_list
{
gs_memory_cursor Cursor;
#if !MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* Next;
gs_memory_cursor_list* Prev;
#endif
};
enum arena_type
@ -664,9 +696,18 @@ struct gs_memory_arena
gs_allocator Allocator;
gs_memory_arena* Parent;
#if MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* Cursors;
u64 CursorsCount;
u64 CursorsCountMax;
#else
gs_memory_cursor_list* CursorList;
#endif
u64 MemoryChunkSize;
u64 MemoryAlignment;
char* ArenaName;
};
struct gs_memory_arena_array

View File

@ -31,7 +31,7 @@ bool PathTest (char* In, char* Out) {
int main (int ArgCount, char** Args)
{
Scratch = CreateMemoryArena(CreateAllocator(Alloc, Free));
Scratch = CreateMemoryArena(CreateAllocator(Alloc, Free), "Scratch");
Test("gs_string")
{