Memory profiling & codebase cleanup

This commit is contained in:
PS 2021-03-18 02:19:35 -07:00
parent 4f199ee1c6
commit c054a0e6b6
25 changed files with 272 additions and 4399 deletions

2
compile.bat Normal file
View File

@ -0,0 +1,2 @@
@echo off
build\build_app_msvc_win32_debug.bat

View File

@ -1,3 +0,0 @@
@echo off
remedybg build\win32_foldhaus.rdbg

View File

@ -144,8 +144,8 @@ Editor_Render(app_state* State, context* Context, render_command_buffer* RenderB
// Draw the Interface
if (State->Interface.DrawOrderRoot != 0)
{
ui_widget Widget = *State->Interface.DrawOrderRoot;
Editor_DrawWidget(State, Context, RenderBuffer, Widget, Context->WindowBounds);
ui_widget* Widget = State->Interface.DrawOrderRoot;
Editor_DrawWidgetList(State, Context, RenderBuffer, Widget, Context->WindowBounds);
}
Context->GeneralWorkQueue->CompleteQueueWork(Context->GeneralWorkQueue, Context->ThreadContext);

View File

@ -77,12 +77,11 @@ Editor_GetWidgetFillBounds(ui_widget Widget)
return Result;
}
internal void
Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 ParentClipBounds)
{
rect2 WidgetParentUnion = Widget.Bounds;
WidgetParentUnion = Rect2Union(Widget.Bounds, ParentClipBounds);
internal void Editor_DrawWidgetList(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 ParentClipBounds);
internal void
Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 WidgetParentUnion)
{
bool IsActiveWidget = ui_WidgetIdsEqual(Widget.Id, State->Interface.ActiveWidget);
;
if (!Widget.Parent || (Rect2Area(WidgetParentUnion) > 0))
@ -146,18 +145,27 @@ Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* Ren
PushRenderBoundingBox2D(RenderBuffer, WidgetParentUnion.Min, WidgetParentUnion.Max, Thickness, Color);
}
}
if (Widget.ChildrenRoot)
{
Editor_DrawWidget(State, Context, RenderBuffer, *Widget.ChildrenRoot, WidgetParentUnion);
}
if (Widget.Next)
{
Editor_DrawWidget(State, Context, RenderBuffer, *Widget.Next, ParentClipBounds);
}
}
// Draws every widget in the sibling list beginning at Widget, then recurses
// into each widget's children.
//
// Each widget is clipped to the intersection of its own bounds and
// ParentClipBounds; that same intersection becomes the clip rect for the
// widget's children, so clipping accumulates down the tree.
//
// Iterative over siblings (follows ->Next), recursive over depth
// (->ChildrenRoot), replacing the old fully recursive Editor_DrawWidget walk.
internal void Editor_DrawWidgetList(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget* Widget, rect2 ParentClipBounds)
{
    for (ui_widget* WidgetAt = Widget; WidgetAt != 0; WidgetAt = WidgetAt->Next)
    {
        // FIX: was `rect2 WidgetParentUnion = WidgetAt->Bounds;` followed by an
        // immediate overwrite - the first assignment was a dead store.
        rect2 WidgetParentUnion = Rect2Union(WidgetAt->Bounds, ParentClipBounds);
        Editor_DrawWidget(State, Context, RenderBuffer, *WidgetAt, WidgetParentUnion);
        if (WidgetAt->ChildrenRoot)
        {
            Editor_DrawWidgetList(State, Context, RenderBuffer, WidgetAt->ChildrenRoot, WidgetParentUnion);
        }
    }
}
#define FOLDHAUS_EDITOR_DRAW_H
#endif // FOLDHAUS_EDITOR_DRAW_H

View File

@ -1582,7 +1582,7 @@ ui_InterfaceCreate(context Context, interface_config Style, gs_memory_arena* Per
Result.WidgetsCountMax = 4096;
Result.Widgets = PushArray(Permanent, ui_widget, Result.WidgetsCountMax);
Result.PerFrameMemory = PushStruct(Permanent, gs_memory_arena);
*Result.PerFrameMemory = CreateMemoryArena(Context.ThreadContext.Allocator);
*Result.PerFrameMemory = CreateMemoryArena(Context.ThreadContext.Allocator, "Interface Per Frame Memory Arena", KB(32));
InterfaceAssert(Result.PerFrameMemory);
Result.Permanent = Permanent;

View File

@ -90,7 +90,7 @@ FileView_Init(panel* Panel, app_state* State, context Context)
// TODO: :FreePanelMemory
file_view_state* FileViewState = PushStruct(&State->Permanent, file_view_state);
Panel->StateMemory = StructToData(FileViewState, file_view_state);
FileViewState->FileNamesArena = CreateMemoryArena(Context.ThreadContext.Allocator);
FileViewState->FileNamesArena = CreateMemoryArena(Context.ThreadContext.Allocator, "File View - File Names Arena");
// TODO(pjs): this shouldn't be stored in permanent
FileViewState->DisplayDirectory = PushString(&State->Permanent, 1024);

View File

@ -163,6 +163,44 @@ RenderProfiler_ListVisualization(ui_interface* Interface, ui_widget* Layout, deb
ui_EndList(Interface);
}
// Renders the "Memory" tab of the profiler panel: a headline with the
// allocator's total footprint and allocation count, followed by a scrollable
// two-column list (source location | size in bytes) of recorded allocations.
//
// Reads the gs_allocator_debug record installed by the platform layer
// (see WinMain, which allocates it and points ThreadContext.Allocator.Debug
// at it). TempString comes from State->Transient, so it lives for this frame
// only.
internal void
RenderProfiler_MemoryView(ui_interface* Interface, ui_widget* Layout, app_state* State, context Context, gs_memory_arena* Memory)
{
    // Robustness: Debug is a pointer the platform layer may not have set up
    // (only the win32 layer visible here installs it). Bail rather than
    // dereference null.
    if (Context.ThreadContext.Allocator.Debug == 0) return;
    gs_allocator_debug Debug = *Context.ThreadContext.Allocator.Debug;
    
    gs_string TempString = PushString(State->Transient, 256);
    
    u64 MemFootprint = Debug.TotalAllocSize;
    u64 AllocCount = Debug.AllocationsCount;
    PrintF(&TempString, "Total Memory Size: %lld | Allocations: %lld", MemFootprint, AllocCount);
    ui_Label(Interface, TempString);
    
    ui_column_spec ColumnWidths[] = {
        { UIColumnSize_Fill, 0 },
        { UIColumnSize_Fixed,256 },
    };
    ui_BeginRow(Interface, 2, &ColumnWidths[0]);
    {
        ui_Label(Interface, MakeString("Location"));
        ui_Label(Interface, MakeString("Alloc Size"));
    }
    ui_EndRow(Interface);
    
    ui_BeginList(Interface, MakeString("Alloc List"), 10, Debug.AllocationsCount);
    ui_BeginRow(Interface, 2, &ColumnWidths[0]);
    // FIX: loop index widened from s32 to u64 to match AllocationsCount -
    // avoids a signed/unsigned comparison and truncation for large counts.
    for (u64 n = 0; n < Debug.AllocationsCount; n++)
    {
        gs_debug_allocation A = Debug.Allocations[n];
        
        PrintF(&TempString, "%S", A.Location);
        ui_Label(Interface, TempString);
        
        PrintF(&TempString, "%lld bytes", A.Size);
        ui_Label(Interface, TempString);
    }
    ui_EndRow(Interface);
    ui_EndList(Interface);
}
GSMetaTag(panel_render);
GSMetaTag(panel_type_profiler);
internal void
@ -234,24 +272,39 @@ ProfilerView_Render(panel* Panel, rect2 PanelBounds, render_command_buffer* Rend
ui_BeginRow(&State->Interface, 8);
{
if (ui_Button(&State->Interface, MakeString("Scope View")))
if (ui_Button(&State->Interface, MakeString("Profiler")))
{
GlobalDebugServices->Interface.FrameView = FRAME_VIEW_PROFILER;
GlobalDebugServices->Interface.FrameView = DebugUI_Profiler;
}
if (ui_Button(&State->Interface, MakeString("List View")))
{
GlobalDebugServices->Interface.FrameView = FRAME_VIEW_SCOPE_LIST;
GlobalDebugServices->Interface.FrameView = DebugUI_ScopeList;
}
if (ui_Button(&State->Interface, MakeString("Memory")))
{
GlobalDebugServices->Interface.FrameView = DebugUI_MemoryView;
}
}
ui_EndRow(&State->Interface);
if (GlobalDebugServices->Interface.FrameView == FRAME_VIEW_PROFILER)
switch (GlobalDebugServices->Interface.FrameView)
{
RenderProfiler_ScopeVisualization(&State->Interface, Layout, VisibleFrame, Memory);
}
else
{
RenderProfiler_ListVisualization(&State->Interface, Layout, VisibleFrame, Memory);
case DebugUI_Profiler:
{
RenderProfiler_ScopeVisualization(&State->Interface, Layout, VisibleFrame, Memory);
}break;
case DebugUI_ScopeList:
{
RenderProfiler_ListVisualization(&State->Interface, Layout, VisibleFrame, Memory);
}break;
case DebugUI_MemoryView:
{
RenderProfiler_MemoryView(&State->Interface, Layout, State, Context, Memory);
}break;
InvalidDefaultCase;
}
ui_PopLayout(&State->Interface, MakeString("Profiler Layout"));

View File

@ -206,7 +206,7 @@ LoadAssembly (assembly_array* Assemblies, led_system* LedSystem, gs_memory_arena
gs_const_string FileName = Substring(Path, IndexOfLastSlash + 1, Path.Length);
assembly* NewAssembly = AssemblyArray_Take(Assemblies);
NewAssembly->Arena = CreateMemoryArena(Context.ThreadContext.Allocator);
NewAssembly->Arena = CreateMemoryArena(Context.ThreadContext.Allocator, "Assembly Arena");
parser AssemblyParser = ParseAssemblyFile(NewAssembly, FileName, AssemblyFileText, Scratch);
if (AssemblyParser.Success)

View File

@ -100,7 +100,7 @@ AddressedDataBufferList_Create(gs_thread_context TC)
{
addressed_data_buffer_list Result = {};
Result.Arena = AllocatorAllocStruct(TC.Allocator, gs_memory_arena);
*Result.Arena = CreateMemoryArena(TC.Allocator);
*Result.Arena = CreateMemoryArena(TC.Allocator, "Addressed Data Buffer List Arena");
return Result;
}

View File

@ -24,7 +24,7 @@ INITIALIZE_APPLICATION(InitializeApplication)
app_state* State = (app_state*)Context.MemoryBase;
*State = {};
State->Permanent = CreateMemoryArena(Context.ThreadContext.Allocator);
State->Permanent = CreateMemoryArena(Context.ThreadContext.Allocator, "Permanent");
State->Transient = Context.ThreadContext.Transient;
State->Assemblies = AssemblyArray_Create(8, &State->Permanent);
@ -91,6 +91,8 @@ INITIALIZE_APPLICATION(InitializeApplication)
Panel_SetType(Hierarchy, &State->PanelSystem, PanelType_AssemblyDebug, State, Context);
}
State->RunEditor = true;
}
UPDATE_AND_RENDER(UpdateAndRender)
@ -104,7 +106,10 @@ UPDATE_AND_RENDER(UpdateAndRender)
// incorrect to clear the arena, and then access the memory later.
ClearArena(State->Transient);
Editor_Update(State, Context, InputQueue);
if (State->RunEditor)
{
Editor_Update(State, Context, InputQueue);
}
AnimationSystem_Update(&State->AnimationSystem, Context->DeltaTime);
if (AnimationSystem_NeedsRender(State->AnimationSystem))
@ -123,7 +128,10 @@ UPDATE_AND_RENDER(UpdateAndRender)
State->Assemblies,
State->LedSystem);
Editor_Render(State, Context, RenderBuffer);
if (State->RunEditor)
{
Editor_Render(State, Context, RenderBuffer);
}
// NOTE(pjs): Building data buffers to be sent out to the sculpture
// This array is used on the platform side to actually send the information

View File

@ -13,7 +13,7 @@
#include "../gs_libs/gs_font.h"
#include "foldhaus_log.h"
#include "interface.h"
#include "editor/interface.h"
#include "engine/foldhaus_network_ordering.h"
@ -42,7 +42,7 @@ typedef struct panel panel;
#include "engine/animation/foldhaus_animation_renderer.cpp"
#include "engine/user_space.h"
#include "blumen_lumen.h"
#include "ss_blumen_lumen/blumen_lumen.h"
struct app_state
{
@ -72,6 +72,8 @@ struct app_state
panel* HotPanel;
user_space_desc UserSpaceDesc;
bool RunEditor;
};
internal void OpenColorPicker(app_state* State, v4* Address);
@ -81,7 +83,7 @@ internal void OpenColorPicker(app_state* State, v4* Address);
#include "engine/user_space.cpp"
#include "patterns/blumen_patterns.h"
#include "blumen_lumen.cpp"
#include "ss_blumen_lumen/blumen_lumen.cpp"
internal void
EndCurrentOperationMode(app_state* State)

View File

@ -64,8 +64,14 @@ struct debug_frame
collated_scope_record* CollatedScopes;
};
#define FRAME_VIEW_PROFILER 0
#define FRAME_VIEW_SCOPE_LIST 1
// Which visualization the profiler panel is showing. Stored in
// debug_interface (GlobalDebugServices->Interface.FrameView) and switched by
// the row of buttons in ProfilerView_Render. Replaces the old
// FRAME_VIEW_PROFILER / FRAME_VIEW_SCOPE_LIST #defines.
enum debug_ui_view
{
    DebugUI_Profiler,   // scope timing visualization (RenderProfiler_ScopeVisualization)
    DebugUI_ScopeList,  // flat list of collated scopes (RenderProfiler_ListVisualization)
    DebugUI_MemoryView, // allocator footprint + per-allocation list (RenderProfiler_MemoryView)
    DebugUI_Count,      // number of views; not a valid selection
};
struct debug_interface
{

File diff suppressed because it is too large Load Diff

View File

@ -535,6 +535,12 @@ WinMain (
{
gs_thread_context ThreadContext = Win32CreateThreadContext();
gs_allocator_debug AllocDebug = {};
AllocDebug.AllocationsCountMax = 4096;
AllocDebug.Allocations = (gs_debug_allocation*)Win32Alloc(sizeof(gs_debug_allocation) * AllocDebug.AllocationsCountMax, 0);
ThreadContext.Allocator.Debug = &AllocDebug;
gs_file_info A = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium"));
gs_file_info B = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium\\"));
@ -556,7 +562,7 @@ WinMain (
Context.MemorySize = MB(64);
Context.MemoryBase = (u8*)Win32Alloc(Context.MemorySize, 0);
gs_memory_arena PlatformPermanent = CreateMemoryArena(Context.ThreadContext.Allocator);
gs_memory_arena PlatformPermanent = CreateMemoryArena(Context.ThreadContext.Allocator, "Platform Memory");
s64 PerformanceCountFrequency = GetPerformanceFrequency();
s64 LastFrameEnd = GetWallClock();

View File

@ -204,9 +204,13 @@ Win32SerialArray_Create(gs_thread_context Context)
Win32SerialPortNames = AllocatorAllocArray(Context.Allocator, gs_string, Win32SerialHandlesCountMax);
Win32SerialPortFilled = AllocatorAllocArray(Context.Allocator, s32, Win32SerialHandlesCountMax);
u64 PortNameSize = 256;
u64 PortNameBufferSize = PortNameSize * Win32SerialHandlesCountMax;
char* PortNameBuffer = AllocatorAllocArray(Context.Allocator, char, PortNameBufferSize);
for (u32 i = 0; i < Win32SerialHandlesCountMax; i++)
{
Win32SerialPortNames[i] = AllocatorAllocString(Context.Allocator, 256);
char* NameBase = PortNameBuffer + (PortNameSize * i);
Win32SerialPortNames[i] = MakeString(NameBase, 0, PortNameSize);
Win32SerialPortFilled[i] = 0;
}
}

View File

@ -48,7 +48,7 @@ Win32CreateThreadContext(gs_memory_arena* Transient = 0)
else
{
Result.Transient = (gs_memory_arena*)AllocatorAlloc(Result.Allocator, sizeof(gs_memory_arena)).Memory;
*Result.Transient = CreateMemoryArena(Result.Allocator);
*Result.Transient = CreateMemoryArena(Result.Allocator, "Tctx Transient");
}
Result.FileHandler = CreateFileHandler(Win32GetFileInfo,
Win32ReadEntireFile,

View File

@ -24,10 +24,8 @@ BlumenLumen_MicListenJob(gs_thread_context* Ctx, u8* UserData)
while (*Data->Running)
{
#if 1
if (SocketQueryStatus(Data->SocketManager, Data->ListenSocket))
{
// TODO(pjs): Removing this block for now - nothing is wrong with it except that SocketPeek is still blocking for some reason
if (SocketPeek(Data->SocketManager, Data->ListenSocket))
{
// TODO(pjs): Make this a peek operation
@ -41,7 +39,6 @@ BlumenLumen_MicListenJob(gs_thread_context* Ctx, u8* UserData)
}
}
}
#endif
while (Data->OutgoingMsgQueue->ReadHead != Data->OutgoingMsgQueue->WriteHead)
{

View File

@ -1,711 +0,0 @@
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
The default is to use the SSE1 version. If you define USE_SSE2 the
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
not expect any significant performance improvement with SSE2.
*/
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#include <xmmintrin.h>
/* yes I know, the top of this file is quite ugly */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)
#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
#ifndef USE_SSE2
/* Helper for the SSE1+MMX code path: a union used to shuttle bits between
   one 128-bit XMM register and a pair of 64-bit MMX registers, since SSE1
   alone has no integer vector operations. */
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

/* Split an __m128 into two __m64 halves (low half -> mm0_, high -> mm1_). */
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_; \
    mm0_ = u.mm[0]; \
    mm1_ = u.mm[1]; \
}

/* Recombine two __m64 halves into one __m128. */
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
#endif // USE_SSE2
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
/* log_ps: natural log of 4 floats at once (cephes-style algorithm).
   Decomposes x into mantissa and exponent via bit manipulation of the
   IEEE-754 representation, evaluates a degree-8 polynomial on the mantissa,
   then adds e*ln(2). Inputs <= 0 produce NaN via invalid_mask. */
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  /* shift out the 23 mantissa bits, leaving the biased exponent */
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  /* remove the exponent bias (0x7f) and convert to float */
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  /* Horner evaluation of the degree-8 log polynomial in x */
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  /* correction terms: + e*q1 - z/2 + e*q2 reconstruct log(x) = poly + e*ln(2) */
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
/* exp_ps: e^x for 4 floats at once (cephes-style algorithm).
   Clamps x to the representable range, splits exp(x) as exp(g + n*log(2)),
   evaluates a degree-5 polynomial for exp(g), and builds 2^n by stuffing
   n into the IEEE-754 exponent field. */
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* clamp to avoid overflow/underflow of the float result */
  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 (completes the floor: truncation rounded toward zero) */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  /* reduce x by n*log(2), split into two constants for extra precision */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  /* Horner evaluation of the degree-5 exp polynomial */
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n by writing n+127 into the exponent bits */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
it runs also on old athlons XPs and the pentium III of your grand
mother.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.
Performance is also surprisingly good, 1.33 times faster than the
macos vsinf SSE2 function, and 1.5 times faster than the
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
too bad for an SSE1 function (with no special tuning) !
However the latter libraries probably have a much better handling of NaN,
Inf, denormalized and other special arguments..
On my core 1 duo, the execution of this function takes approximately 95 cycles.
From what I have observed on the experiments with Intel AMath lib, switching to an
SSE2 version would improve the perf by only 10%.
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.
*/
/* sin_ps: sine of 4 floats at once (exact rewrite of cephes sinf).
   Range-reduces |x| to [0, Pi/4] using the quadrant index j = round(x*4/Pi),
   evaluates both the sine and cosine minimax polynomials, then blends the
   two results per lane with poly_mask and restores the sign. */
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- the cosine polynomial */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- the sine polynomial */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
/* almost the same as sin_ps */
/* cos_ps: cosine of 4 floats at once; same structure as sin_ps except the
   quadrant index is shifted by 2 (cos(x) = sin(x + Pi/2)) and the input's
   sign is discarded (cosine is even). */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* shift the quadrant index by 2 for the cosine phase offset */
  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- the cosine polynomial */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- the sine polynomial */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
/* sincos_ps: sine and cosine of 4 floats in one pass.
   Shares the range reduction and both polynomial evaluations between the
   two results, so it costs barely more than sin_ps alone. Writes sine to
   *s and cosine to *c. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2; /* keep the quadrant index; the cosine sign needs it later */

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2; /* keep the quadrant index; the cosine sign needs it later */
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- the cosine polynomial */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- the sine polynomial */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

View File

@ -1,360 +0,0 @@
/*
sse_mathfun_extension.h - zlib license
Written by Tolga Mizrak 2016
Extension of sse_mathfun.h, which is written by Julien Pommier
Based on the corresponding algorithms of the cephes math library
This is written as an extension to sse_mathfun.h instead of modifying it, just because I didn't want
to maintain a modified version of the original library. This way switching to a newer version of the
library won't be a hassle.
Note that non SSE2 implementations of tan_ps, atan_ps, cot_ps and atan2_ps are not implemented yet.
As such, currently you need to #define USE_SSE2 to compile.
With tan_ps, cot_ps you get good precision on input ranges that are further away from the domain
borders (-PI/2, PI/2 for tan and 0, 1 for cot). See the results on the deviations for these
functions on my machine:
checking tan on [-0.25*Pi, 0.25*Pi]
max deviation from tanf(x): 1.19209e-07 at 0.250000006957*Pi, max deviation from cephes_tan(x):
5.96046e-08
->> precision OK for the tan_ps <<-
checking tan on [-0.49*Pi, 0.49*Pi]
max deviation from tanf(x): 3.8147e-06 at -0.490000009841*Pi, max deviation from cephes_tan(x):
9.53674e-07
->> precision OK for the tan_ps <<-
checking cot on [0.2*Pi, 0.7*Pi]
max deviation from cotf(x): 1.19209e-07 at 0.204303119606*Pi, max deviation from cephes_cot(x):
1.19209e-07
->> precision OK for the cot_ps <<-
checking cot on [0.01*Pi, 0.99*Pi]
max deviation from cotf(x): 3.8147e-06 at 0.987876517942*Pi, max deviation from cephes_cot(x):
9.53674e-07
->> precision OK for the cot_ps <<-
With atan_ps and atan2_ps you get pretty good precision, atan_ps max deviation is < 2e-7 and
atan2_ps max deviation is < 2.5e-7
*/
/* Copyright (C) 2016 Tolga Mizrak
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#pragma once
#ifndef _SSE_MATHFUN_EXTENSION_H_INCLUDED_
#define _SSE_MATHFUN_EXTENSION_H_INCLUDED_
#ifndef USE_SSE2
#error sse1 & mmx version not implemented
#endif
#ifdef _MSC_VER
#pragma warning( push )
/* warning C4838: conversion from 'double' to 'const float' requires a narrowing conversion */
#pragma warning( disable : 4838 )
/* warning C4305: 'initializing': truncation from 'double' to 'const float' */
#pragma warning( disable : 4305 )
#endif
#include "sse_mathfun.h"
_PS_CONST( 0, 0 );
_PS_CONST( 2, 2 );
_PI32_CONST( neg1, 1 );
_PS_CONST( tancof_p0, 9.38540185543E-3 );
_PS_CONST( tancof_p1, 3.11992232697E-3 );
_PS_CONST( tancof_p2, 2.44301354525E-2 );
_PS_CONST( tancof_p3, 5.34112807005E-2 );
_PS_CONST( tancof_p4, 1.33387994085E-1 );
_PS_CONST( tancof_p5, 3.33331568548E-1 );
_PS_CONST( tancot_eps, 1.0e-4 );
/* Shared kernel for tan_ps / cot_ps (cephes tanf/cotf ported to SSE2).
   Computes tan(x) per lane when cotFlag == 0, and cot(x) per lane when
   cotFlag != 0. Pipeline: strip the sign, scale by 4/pi and round to get
   the octant, reduce the argument with the three-part "extended precision
   modular arithmetic" pass, evaluate a polynomial in z^2 (Horner form),
   then select per lane between P(z) and -1/P(z) based on the octant
   (poly_mask), finally reapplying the original sign.
   NOTE(review): the scalar cephes version special-cases |x| < tancot_eps
   (declared above) — this SIMD port does not appear to; confirm behavior
   near zero before relying on it there. */
v4sf tancot_ps( v4sf x, int cotFlag )
{
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
v4si emm2;
#else
#endif
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps( sign_bit, *(v4sf*)_ps_sign_mask );
/* scale by 4/Pi */
y = _mm_mul_ps( x, *(v4sf*)_ps_cephes_FOPI );
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32( y );
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32( emm2, *(v4si*)_pi32_1 );
emm2 = _mm_and_si128( emm2, *(v4si*)_pi32_inv1 );
y = _mm_cvtepi32_ps( emm2 );
/* poly_mask: per-lane selector derived from bit 1 of the rounded
   quotient; it decides which of the two result forms (polynomial vs
   negated reciprocal) each lane takes in the select at the bottom */
emm2 = _mm_and_si128( emm2, *(v4si*)_pi32_2 );
emm2 = _mm_cmpeq_epi32( emm2, _mm_setzero_si128() );
v4sf poly_mask = _mm_castsi128_ps( emm2 );
#else
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps( y, xmm1 );
xmm2 = _mm_mul_ps( y, xmm2 );
xmm3 = _mm_mul_ps( y, xmm3 );
v4sf z = _mm_add_ps( x, xmm1 );
z = _mm_add_ps( z, xmm2 );
z = _mm_add_ps( z, xmm3 );
v4sf zz = _mm_mul_ps( z, z );
/* polynomial approximation on the reduced argument, Horner form:
   y = z + z * zz * P(zz) with the tancof_p* coefficients above */
y = *(v4sf*)_ps_tancof_p0;
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p1 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p2 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p3 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p4 );
y = _mm_mul_ps( y, zz );
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p5 );
y = _mm_mul_ps( y, zz );
y = _mm_mul_ps( y, z );
y = _mm_add_ps( y, z );
/* build the second candidate result (the negated reciprocal); for cot
   the roles of y and y2 are swapped relative to tan */
v4sf y2;
if( cotFlag ) {
y2 = _mm_xor_ps( y, *(v4sf*)_ps_sign_mask );
/* y = _mm_rcp_ps( y ); */
/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
y = _mm_div_ps( *(v4sf*)_ps_1, y );
} else {
/* y2 = _mm_rcp_ps( y ); */
/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
y2 = _mm_div_ps( *(v4sf*)_ps_1, y );
y2 = _mm_xor_ps( y2, *(v4sf*)_ps_sign_mask );
}
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y = _mm_and_ps( xmm3, y );
y2 = _mm_andnot_ps( xmm3, y2 );
y = _mm_or_ps( y, y2 );
/* update the sign */
y = _mm_xor_ps( y, sign_bit );
return y;
}
/* Per-lane tangent: thin wrapper over the shared tan/cot kernel above. */
v4sf tan_ps( v4sf x )
{
return tancot_ps( x, 0 );
}
/* Per-lane cotangent: thin wrapper over the shared tan/cot kernel above. */
v4sf cot_ps( v4sf x )
{
return tancot_ps( x, 1 );
}
_PS_CONST( atanrange_hi, 2.414213562373095 );
_PS_CONST( atanrange_lo, 0.4142135623730950 );
const float PIF = 3.141592653589793238;
const float PIO2F = 1.5707963267948966192;
_PS_CONST( cephes_PIF, 3.141592653589793238 );
_PS_CONST( cephes_PIO2F, 1.5707963267948966192 );
_PS_CONST( cephes_PIO4F, 0.7853981633974483096 );
_PS_CONST( atancof_p0, 8.05374449538e-2 );
_PS_CONST( atancof_p1, 1.38776856032E-1 );
_PS_CONST( atancof_p2, 1.99777106478E-1 );
_PS_CONST( atancof_p3, 3.33329491539E-1 );
/* SIMD arctangent (cephes atanf ported to SSE2). Computes atan(x) per
   lane: the sign is stripped up front and reapplied at the end; |x| is
   range-reduced branch-free using
       atan(x) = pi/2 - atan(1/x)           for x > atanrange_hi
       atan(x) = pi/4 + atan((x-1)/(x+1))   for atanrange_lo < x <= atanrange_hi
   then a polynomial in x^2 is evaluated in Horner form with the
   atancof_p* coefficients. Max deviation from atanf is < 2e-7 per the
   file header above. */
v4sf atan_ps( v4sf x )
{
v4sf sign_bit, y;
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps( sign_bit, *(v4sf*)_ps_sign_mask );
/* range reduction, init x and y depending on range */
#ifdef USE_SSE2
/* x > 2.414213562373095 */
v4sf cmp0 = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_hi );
/* x > 0.4142135623730950 */
v4sf cmp1 = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_lo );
/* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
v4sf cmp2 = _mm_andnot_ps( cmp0, cmp1 );
/* y0/y1 are the additive constants (pi/2, pi/4) masked to the lanes
   each reduction applies to; x0/x1 are the corresponding reduced
   arguments. The and/or sequence below merges them with the
   unreduced lanes. */
/* -( 1.0/x ) */
v4sf y0 = _mm_and_ps( cmp0, *(v4sf*)_ps_cephes_PIO2F );
v4sf x0 = _mm_div_ps( *(v4sf*)_ps_1, x );
x0 = _mm_xor_ps( x0, *(v4sf*)_ps_sign_mask );
v4sf y1 = _mm_and_ps( cmp2, *(v4sf*)_ps_cephes_PIO4F );
/* (x-1.0)/(x+1.0) */
v4sf x1_o = _mm_sub_ps( x, *(v4sf*)_ps_1 );
v4sf x1_u = _mm_add_ps( x, *(v4sf*)_ps_1 );
v4sf x1 = _mm_div_ps( x1_o, x1_u );
v4sf x2 = _mm_and_ps( cmp2, x1 );
x0 = _mm_and_ps( cmp0, x0 );
x2 = _mm_or_ps( x2, x0 );
cmp1 = _mm_or_ps( cmp0, cmp2 );
x2 = _mm_and_ps( cmp1, x2 );
x = _mm_andnot_ps( cmp1, x );
x = _mm_or_ps( x2, x );
y = _mm_or_ps( y0, y1 );
#else
#error sse1 & mmx version not implemented
#endif
/* polynomial in zz = x^2, Horner form; signs alternate (+p0, -p1,
   +p2, -p3) per the cephes atanf series */
v4sf zz = _mm_mul_ps( x, x );
v4sf acc = *(v4sf*)_ps_atancof_p0;
acc = _mm_mul_ps( acc, zz );
acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p1 );
acc = _mm_mul_ps( acc, zz );
acc = _mm_add_ps( acc, *(v4sf*)_ps_atancof_p2 );
acc = _mm_mul_ps( acc, zz );
acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p3 );
acc = _mm_mul_ps( acc, zz );
acc = _mm_mul_ps( acc, x );
acc = _mm_add_ps( acc, x );
y = _mm_add_ps( y, acc );
/* update the sign */
y = _mm_xor_ps( y, sign_bit );
return y;
}
/* SIMD atan2: per-lane atan2(y, x) built from atan_ps plus explicit
   branch-free edge-case masks (mirrors atan2_ref below):
     x == 0 && y == 0   -> 0
     y == 0 && x >  0   -> 0
     x == 0 && y != 0   -> +/- pi/2 (sign follows y)
     y == 0 && x <= 0   -> pi
     otherwise          -> atan(y/x), shifted by +/- pi when x < 0
   The partial results are masked and OR'd together at the end.
   NOTE(review): at the origin, y/x is 0/0 = NaN and atan_ps propagates
   it into the merged result (zero_mask only gates pio2_result), so the
   origin lane is not actually forced to 0 — confirm whether callers
   depend on this before changing it. Preserved as-is here. */
v4sf atan2_ps( v4sf y, v4sf x )
{
v4sf x_eq_0 = _mm_cmpeq_ps( x, *(v4sf*)_ps_0 );
v4sf x_gt_0 = _mm_cmpgt_ps( x, *(v4sf*)_ps_0 );
v4sf x_le_0 = _mm_cmple_ps( x, *(v4sf*)_ps_0 );
v4sf y_eq_0 = _mm_cmpeq_ps( y, *(v4sf*)_ps_0 );
v4sf x_lt_0 = _mm_cmplt_ps( x, *(v4sf*)_ps_0 );
v4sf y_lt_0 = _mm_cmplt_ps( y, *(v4sf*)_ps_0 );
/* lanes whose result is exactly zero: the origin, or y == 0 with x > 0 */
v4sf zero_mask = _mm_and_ps( x_eq_0, y_eq_0 );
v4sf zero_mask_other_case = _mm_and_ps( y_eq_0, x_gt_0 );
zero_mask = _mm_or_ps( zero_mask, zero_mask_other_case );
/* lanes on the y axis (x == 0, y != 0): +/- pi/2 with the sign of y */
v4sf pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 );
v4sf pio2_mask_sign = _mm_and_ps( y_lt_0, *(v4sf*)_ps_sign_mask );
v4sf pio2_result = *(v4sf*)_ps_cephes_PIO2F;
pio2_result = _mm_xor_ps( pio2_result, pio2_mask_sign );
pio2_result = _mm_and_ps( pio2_mask, pio2_result );
/* lanes on the negative x axis (y == 0, x <= 0): pi */
v4sf pi_mask = _mm_and_ps( y_eq_0, x_le_0 );
v4sf pi = *(v4sf*)_ps_cephes_PIF;
v4sf pi_result = _mm_and_ps( pi_mask, pi );
/* general case: atan(y/x) needs a +/- pi correction in the left
   half-plane; the correction is negated when y < 0 so results land in
   (-pi, pi] */
v4sf swap_sign_mask_offset = _mm_and_ps( x_lt_0, y_lt_0 );
swap_sign_mask_offset = _mm_and_ps( swap_sign_mask_offset, *(v4sf*)_ps_sign_mask );
v4sf offset1 = *(v4sf*)_ps_cephes_PIF;
offset1 = _mm_xor_ps( offset1, swap_sign_mask_offset );
/* the offset is zero wherever x >= 0, so a single masked AND is the
   whole select (the previous andnot-with-zero into `offset` followed by
   an unconditional overwrite was dead code and has been removed) */
v4sf offset = _mm_and_ps( x_lt_0, offset1 );
v4sf arg = _mm_div_ps( y, x );
v4sf atan_result = atan_ps( arg );
atan_result = _mm_add_ps( atan_result, offset );
/* merge: clear the special-case lanes out of the atan result, then OR
   the masked special-case values back in (duplicate andnot removed) */
v4sf result = _mm_andnot_ps( zero_mask, pio2_result );
atan_result = _mm_andnot_ps( pio2_mask, atan_result );
result = _mm_or_ps( result, atan_result );
result = _mm_or_ps( result, pi_result );
return result;
}
/* Scalar convenience wrapper around the SIMD square root: splats x into
   all four lanes, runs the hardware sqrt, and reads lane 0 back out. */
float sqrt_ps( float x )
{
__m128 packed = _mm_sqrt_ps( _mm_set_ps1( x ) );
return _mm_cvtss_f32( packed );
}
/* Scalar convenience wrapper around the SIMD reciprocal square root
   approximation: splats x into all four lanes, runs the hardware
   rsqrt estimate, and reads lane 0 back out. The result is an
   approximation, not a correctly-rounded 1/sqrt(x). */
float rsqrt_ps( float x )
{
__m128 packed = _mm_rsqrt_ps( _mm_set_ps1( x ) );
return _mm_cvtss_f32( packed );
}
/* atan2 implementation using atan, used as a reference to implement atan2_ps */
float atan2_ref( float y, float x )
{
if( x == 0.0f ) {
if( y == 0.0f ) {
return 0.0f;
}
float result = _ps_cephes_PIO2F[0];
if( y < 0.0f ) {
result = -result;
}
return result;
}
if( y == 0.0f ) {
if( x > 0.0f ) {
return 0.0f;
}
return PIF;
}
float offset = 0;
if( x < 0.0f ) {
offset = PIF;
if( y < 0.0f ) {
offset = -offset;
}
}
v4sf val = _mm_set_ps1( y / x );
val = atan_ps( val );
return offset + _mm_cvtss_f32( val );
}
#ifdef _MSC_VER
#pragma warning( pop )
#endif
#endif

View File

@ -2424,12 +2424,37 @@ CreateAllocator_(allocator_allocate* Alloc, allocator_free* Free)
}
#define CreateAllocator(a, f) CreateAllocator_((allocator_allocate*)(a), (allocator_free*)(f))
// Records one allocation in the allocator's debug info for memory
// profiling: appends a (location, size) entry while there is room in the
// fixed-size Allocations array, and always accumulates TotalAllocSize.
// Location is the caller's file:line:function string (see
// FileNameAndLineNumberString); only the part after the last path
// separator is stored.
internal void
AllocatorDebug_PushAlloc(gs_allocator_debug* Debug, u64 Size, char* Location)
{
// NOTE(pjs): I don't want this debug procedure to be the reason the
// application crashes.
// Hence: once the Allocations array is full, silently stop recording
// entries (but keep counting bytes) rather than asserting or growing.
if (Debug->AllocationsCount < Debug->AllocationsCountMax)
{
gs_debug_allocation Allocation = {};
// Trim the path down to just the trailing file name: keep what
// follows the last forward- or back-slash.
gs_const_string L = ConstString(Location);
s64 LastSlash = FindLastFromSet(L, "\\/");
if (LastSlash < 0) LastSlash = 0;
Allocation.Location = GetStringAfter(L, LastSlash);
Allocation.Size = Size;
Debug->Allocations[Debug->AllocationsCount++] = Allocation;
}
// TotalAllocSize counts every allocation, including ones dropped above.
Debug->TotalAllocSize += Size;
}
// Allocates Size bytes through the allocator's Alloc callback and wraps
// the result in a gs_data. SizeResult receives the size the backend
// actually granted (which is what the returned gs_data reports — it may
// differ from the requested Size). When a Debug struct is attached to
// the allocator, the allocation is recorded for memory profiling,
// attributed to Location (the caller's file:line:function string).
internal gs_data
AllocatorAlloc_(gs_allocator Allocator, u64 Size, char* Location)
{
u64 SizeResult = 0;
void* Memory = Allocator.Alloc(Size, &SizeResult);
if (Allocator.Debug)
{
AllocatorDebug_PushAlloc(Allocator.Debug, Size, Location);
}
return CreateData((u8*)Memory, SizeResult);
}
internal void
@ -2439,6 +2464,13 @@ AllocatorFree_(gs_allocator Allocator, void* Base, u64 Size, char* Location)
if (Base != 0 && Size != 0)
{
Allocator.Free(Base, Size);
if (Allocator.Debug)
{
// NOTE(pjs): There's no reason we should be going negative
// ie. Freeing more memory than we allocated
Assert(Allocator.Debug->TotalAllocSize >= Size);
Allocator.Debug->TotalAllocSize -= Size;
}
}
}
@ -2526,30 +2558,37 @@ FreeCursorListEntry(gs_allocator Allocator, gs_memory_cursor_list* CursorEntry)
}
internal gs_memory_arena
CreateMemoryArena_(arena_type ArenaType, gs_allocator Allocator, u64 ChunkSize, u64 Alignment, gs_memory_arena* ParentArena)
CreateMemoryArena_(arena_type ArenaType, gs_allocator Allocator, u64 ChunkSize, u64 Alignment, gs_memory_arena* ParentArena, char* Name)
{
// we only want a parent arena if the type is Arena_SubArena
Assert(((ArenaType == Arena_BaseArena) && (ParentArena == 0)) ||
((ArenaType == Arena_SubArena) && (ParentArena != 0)));
gs_memory_arena Arena = {};
Arena.ArenaName = Name;
Arena.Type = ArenaType;
Arena.Allocator = Allocator;
Arena.Parent = ParentArena;
#if MEMORY_CURSOR_STATIC_ARRAY
Arena.CursorsCountMax = 4096;
Arena.Cursors = AllocatorAllocArray(Allocator, gs_memory_cursor_list, Arena.CursorsCountMax);
#endif
Arena.MemoryChunkSize = ChunkSize;
Arena.MemoryAlignment = Alignment;
return Arena;
}
internal gs_memory_arena
CreateMemoryArena(gs_allocator Allocator, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
CreateMemoryArena(gs_allocator Allocator, char* Name, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
{
return CreateMemoryArena_(Arena_BaseArena, Allocator, ChunkSize, Alignment, 0);
return CreateMemoryArena_(Arena_BaseArena, Allocator, ChunkSize, Alignment, 0, Name);
}
internal gs_memory_arena
CreateMemorySubArena(gs_memory_arena* Parent, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
CreateMemorySubArena(gs_memory_arena* Parent, char* Name, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
{
return CreateMemoryArena_(Arena_SubArena, Parent->Allocator, ChunkSize, Alignment, Parent);
return CreateMemoryArena_(Arena_SubArena, Parent->Allocator, ChunkSize, Alignment, Parent, Name);
}
internal gs_data PushSize_(gs_memory_arena* Arena, u64 Size, char* Location);
@ -2557,6 +2596,7 @@ internal gs_data PushSize_(gs_memory_arena* Arena, u64 Size, char* Location);
internal void
FreeCursorList(gs_memory_cursor_list* List, gs_allocator Allocator)
{
#if !MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* CursorAt = List;
while (CursorAt != 0)
{
@ -2564,13 +2604,18 @@ FreeCursorList(gs_memory_cursor_list* List, gs_allocator Allocator)
FreeCursorListEntry(Allocator, CursorAt);
CursorAt = Prev;
}
#endif
}
internal gs_memory_cursor_list*
MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
{
#if MEMORY_CURSOR_STATIC_ARRAY
u64 AllocSize = Max(MinSize, Arena->MemoryChunkSize);
#else
// Allocate enough spcae for the minimum size needed + sizeo for the cursor list
u64 AllocSize = Max(MinSize, Arena->MemoryChunkSize) + sizeof(gs_memory_cursor_list);
#endif
gs_data Data = {0};
switch (Arena->Type)
@ -2588,6 +2633,11 @@ MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
InvalidDefaultCase;
}
#if MEMORY_CURSOR_STATIC_ARRAY
Assert(Arena->CursorsCount < Arena->CursorsCountMax);
gs_memory_cursor_list* Result = Arena->Cursors + Arena->CursorsCount++;
Result->Cursor = CreateMemoryCursor(Data.Memory, Data.Size);
#else
// Fit the memory cursor into the region allocated
Assert(MinSize + sizeof(gs_memory_cursor_list) <= Data.Size);
gs_memory_cursor_list* Result = (gs_memory_cursor_list*)Data.Memory;
@ -2599,9 +2649,14 @@ MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
Result->Next = 0;
if (Arena->CursorList != 0)
{
if (Arena->CursorList->Next != 0)
{
Result->Next = Arena->CursorList->Next;
}
Arena->CursorList->Next = Result;
}
Arena->CursorList = Result;
#endif
return Result;
}
@ -2611,6 +2666,27 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
gs_data Result = {0};
if (Size > 0)
{
#if MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* CursorEntry = 0;
for (u64 i = 0;
i < Arena->CursorsCount;
i++)
{
gs_memory_cursor_list* At = Arena->Cursors + i;
if (CursorHasRoom(At->Cursor, Size))
{
CursorEntry = At;
break;
}
}
if (!CursorEntry)
{
CursorEntry = MemoryArenaNewCursor(Arena, Size, Location);
}
Assert(CursorEntry);
Assert(CursorHasRoom(CursorEntry->Cursor, Size));
#else
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
if (CursorEntry == 0)
{
@ -2627,6 +2703,7 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
CursorEntry = MemoryArenaNewCursor(Arena, Size, Location);
}
}
#endif
Assert(CursorEntry != 0);
Result = PushSizeOnCursor_(&CursorEntry->Cursor, Size, Location);
Assert(Result.Memory != 0);
@ -2651,44 +2728,19 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
return Result;
}
internal void
PopSize(gs_memory_arena* Arena, u64 Size)
{
gs_allocator Allocator = Arena->Allocator;
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
for (gs_memory_cursor_list* Prev = 0;
CursorEntry != 0 && Size != 0;
CursorEntry = Prev)
{
Prev = CursorEntry->Prev;
if (Size >= CursorEntry->Cursor.Position)
{
Size -= CursorEntry->Cursor.Position;
FreeCursorListEntry(Allocator, CursorEntry);
}
else
{
PopSizeOnCursor(&CursorEntry->Cursor, Size);
break;
}
}
Arena->CursorList = CursorEntry;
}
internal void
FreeMemoryArena(gs_memory_arena* Arena)
{
gs_allocator Allocator = Arena->Allocator;
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
for (gs_memory_cursor_list* Prev = 0;
CursorEntry != 0;
CursorEntry = Prev)
#if MEMORY_CURSOR_STATIC_ARRAY
for (u32 i = 0; i < Arena->CursorsCount; i++)
{
Prev = CursorEntry->Prev;
if (CursorEntry != 0)
{
FreeCursorListEntry(Allocator, CursorEntry);
}
gs_memory_cursor_list E = Arena->Cursors[i];
AllocatorFree(Arena->Allocator, E.Cursor.Data.Memory, E.Cursor.Data.Size);
}
AllocatorFreeArray(Arena->Allocator, Arena->Cursors, gs_memory_cursor_list, Arena->CursorsCountMax);
#else
FreeCursorList(Arena->CursorList, Arena->Allocator);
#endif
}
#define PushSizeToData(arena, size) PushSize_((arena), (size), FileNameAndLineNumberString)
@ -2726,6 +2778,12 @@ PushStringCopy(gs_memory_arena* Arena, gs_const_string String)
internal void
ClearArena(gs_memory_arena* Arena)
{
#if MEMORY_CURSOR_STATIC_ARRAY
for (u32 i = 0; i < Arena->CursorsCount; i++)
{
Arena->Cursors[i].Cursor.Position = 0;
}
#else
gs_memory_cursor_list* First = 0;
for (gs_memory_cursor_list* CursorEntry = Arena->CursorList;
CursorEntry != 0;
@ -2735,12 +2793,13 @@ ClearArena(gs_memory_arena* Arena)
CursorEntry->Cursor.Position = 0;
}
Arena->CursorList = First;
#endif
}
internal void
FreeArena(gs_memory_arena* Arena)
{
FreeCursorList(Arena->CursorList, Arena->Allocator);
FreeMemoryArena(Arena);
}
///////////////////////////
@ -2789,14 +2848,14 @@ CreateDynarrayWithStorage(gs_memory_arena Storage, u32 ElementSize, u32 Elements
internal gs_dynarray
CreateDynarray_(gs_allocator Allocator, u32 ElementSize, u32 ElementsPerBuffer)
{
gs_memory_arena Storage = CreateMemoryArena(Allocator, ElementSize * ElementsPerBuffer);
gs_memory_arena Storage = CreateMemoryArena(Allocator, "Dynarray Arena", ElementSize * ElementsPerBuffer);
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
};
internal gs_dynarray
CreateDynarray_(gs_memory_arena* Arena, u32 ElementSize, u32 ElementsPerBuffer)
{
gs_memory_arena Storage = CreateMemorySubArena(Arena, ElementSize * ElementsPerBuffer);
gs_memory_arena Storage = CreateMemorySubArena(Arena, "Dynarray Sub Arena", ElementSize * ElementsPerBuffer);
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
};

View File

@ -247,7 +247,7 @@ enum { \
#define DontCompile ImAfraidICantDoThat
#define LineNumberString Stringify(__LINE__)
#define FileNameAndLineNumberString_ __FILE__ ":" LineNumberString ":"
#define FileNameAndLineNumberString_ __FILE__ ":" LineNumberString ":" __FUNCTION__
#define FileNameAndLineNumberString (char*)FileNameAndLineNumberString_
//
@ -633,10 +633,27 @@ typedef ALLOCATOR_ALLOC(allocator_allocate);
#define ALLOCATOR_FREE(name) void name(void* Ptr, u64 Size)
typedef ALLOCATOR_FREE(allocator_free);
struct gs_debug_allocation
{
gs_const_string Location;
u64 Size;
};
struct gs_allocator_debug
{
u64 TotalAllocSize;
u64 AllocationsCount;
u64 AllocationsCountMax;
gs_debug_allocation* Allocations;
};
struct gs_allocator
{
allocator_allocate* Alloc;
allocator_free* Free;
gs_allocator_debug* Debug;
};
struct gs_memory_cursor
@ -645,11 +662,26 @@ struct gs_memory_cursor
u64 Position;
};
/* TODO(pjs): Setting MEMORY_CURSOR_STATIC_ARRAY will still compile,
However, it introduces a bug that I haven't fully diagnosed.
The problem seems to occur when trying to push to a cleared memory arena
Where the FirstCursor doesn't have enough room for the allocation, but
also FirstCursor->Next points to a valid cursor. The new cursor is put
in the middle however we seem to continually keep allocating new
cursors forever and losing old ones.
The problem in Lumenarium is found in the OutputData structure
Leaving this in a simplified state for now
*/
#define MEMORY_CURSOR_STATIC_ARRAY 1
struct gs_memory_cursor_list
{
gs_memory_cursor Cursor;
#if !MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* Next;
gs_memory_cursor_list* Prev;
#endif
};
enum arena_type
@ -664,9 +696,18 @@ struct gs_memory_arena
gs_allocator Allocator;
gs_memory_arena* Parent;
#if MEMORY_CURSOR_STATIC_ARRAY
gs_memory_cursor_list* Cursors;
u64 CursorsCount;
u64 CursorsCountMax;
#else
gs_memory_cursor_list* CursorList;
#endif
u64 MemoryChunkSize;
u64 MemoryAlignment;
char* ArenaName;
};
struct gs_memory_arena_array

View File

@ -31,7 +31,7 @@ bool PathTest (char* In, char* Out) {
int main (int ArgCount, char** Args)
{
Scratch = CreateMemoryArena(CreateAllocator(Alloc, Free));
Scratch = CreateMemoryArena(CreateAllocator(Alloc, Free), "Scratch");
Test("gs_string")
{