Memory profiling & codebase cleanup
This commit is contained in:
parent
4f199ee1c6
commit
c054a0e6b6
|
@ -0,0 +1,2 @@
|
||||||
|
@echo off
|
||||||
|
build\build_app_msvc_win32_debug.bat
|
|
@ -144,8 +144,8 @@ Editor_Render(app_state* State, context* Context, render_command_buffer* RenderB
|
||||||
// Draw the Interface
|
// Draw the Interface
|
||||||
if (State->Interface.DrawOrderRoot != 0)
|
if (State->Interface.DrawOrderRoot != 0)
|
||||||
{
|
{
|
||||||
ui_widget Widget = *State->Interface.DrawOrderRoot;
|
ui_widget* Widget = State->Interface.DrawOrderRoot;
|
||||||
Editor_DrawWidget(State, Context, RenderBuffer, Widget, Context->WindowBounds);
|
Editor_DrawWidgetList(State, Context, RenderBuffer, Widget, Context->WindowBounds);
|
||||||
}
|
}
|
||||||
|
|
||||||
Context->GeneralWorkQueue->CompleteQueueWork(Context->GeneralWorkQueue, Context->ThreadContext);
|
Context->GeneralWorkQueue->CompleteQueueWork(Context->GeneralWorkQueue, Context->ThreadContext);
|
||||||
|
|
|
@ -77,12 +77,11 @@ Editor_GetWidgetFillBounds(ui_widget Widget)
|
||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
internal void
|
internal void Editor_DrawWidgetList(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 ParentClipBounds);
|
||||||
Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 ParentClipBounds)
|
|
||||||
{
|
|
||||||
rect2 WidgetParentUnion = Widget.Bounds;
|
|
||||||
WidgetParentUnion = Rect2Union(Widget.Bounds, ParentClipBounds);
|
|
||||||
|
|
||||||
|
internal void
|
||||||
|
Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget Widget, rect2 WidgetParentUnion)
|
||||||
|
{
|
||||||
bool IsActiveWidget = ui_WidgetIdsEqual(Widget.Id, State->Interface.ActiveWidget);
|
bool IsActiveWidget = ui_WidgetIdsEqual(Widget.Id, State->Interface.ActiveWidget);
|
||||||
;
|
;
|
||||||
if (!Widget.Parent || (Rect2Area(WidgetParentUnion) > 0))
|
if (!Widget.Parent || (Rect2Area(WidgetParentUnion) > 0))
|
||||||
|
@ -146,18 +145,27 @@ Editor_DrawWidget(app_state* State, context* Context, render_command_buffer* Ren
|
||||||
PushRenderBoundingBox2D(RenderBuffer, WidgetParentUnion.Min, WidgetParentUnion.Max, Thickness, Color);
|
PushRenderBoundingBox2D(RenderBuffer, WidgetParentUnion.Min, WidgetParentUnion.Max, Thickness, Color);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (Widget.ChildrenRoot)
|
|
||||||
|
internal void Editor_DrawWidgetList(app_state* State, context* Context, render_command_buffer* RenderBuffer, ui_widget* Widget, rect2 ParentClipBounds)
|
||||||
{
|
{
|
||||||
Editor_DrawWidget(State, Context, RenderBuffer, *Widget.ChildrenRoot, WidgetParentUnion);
|
ui_widget* WidgetAt = Widget;
|
||||||
}
|
while (WidgetAt)
|
||||||
|
|
||||||
if (Widget.Next)
|
|
||||||
{
|
{
|
||||||
Editor_DrawWidget(State, Context, RenderBuffer, *Widget.Next, ParentClipBounds);
|
rect2 WidgetParentUnion = WidgetAt->Bounds;
|
||||||
}
|
WidgetParentUnion = Rect2Union(WidgetAt->Bounds, ParentClipBounds);
|
||||||
|
|
||||||
|
Editor_DrawWidget(State, Context, RenderBuffer, *WidgetAt, WidgetParentUnion);
|
||||||
|
|
||||||
|
if (WidgetAt->ChildrenRoot)
|
||||||
|
{
|
||||||
|
Editor_DrawWidgetList(State, Context, RenderBuffer, WidgetAt->ChildrenRoot, WidgetParentUnion);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
WidgetAt = WidgetAt->Next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#define FOLDHAUS_EDITOR_DRAW_H
|
#define FOLDHAUS_EDITOR_DRAW_H
|
||||||
#endif // FOLDHAUS_EDITOR_DRAW_H
|
#endif // FOLDHAUS_EDITOR_DRAW_H
|
|
@ -1582,7 +1582,7 @@ ui_InterfaceCreate(context Context, interface_config Style, gs_memory_arena* Per
|
||||||
Result.WidgetsCountMax = 4096;
|
Result.WidgetsCountMax = 4096;
|
||||||
Result.Widgets = PushArray(Permanent, ui_widget, Result.WidgetsCountMax);
|
Result.Widgets = PushArray(Permanent, ui_widget, Result.WidgetsCountMax);
|
||||||
Result.PerFrameMemory = PushStruct(Permanent, gs_memory_arena);
|
Result.PerFrameMemory = PushStruct(Permanent, gs_memory_arena);
|
||||||
*Result.PerFrameMemory = CreateMemoryArena(Context.ThreadContext.Allocator);
|
*Result.PerFrameMemory = CreateMemoryArena(Context.ThreadContext.Allocator, "Interface Per Frame Memory Arena", KB(32));
|
||||||
InterfaceAssert(Result.PerFrameMemory);
|
InterfaceAssert(Result.PerFrameMemory);
|
||||||
|
|
||||||
Result.Permanent = Permanent;
|
Result.Permanent = Permanent;
|
|
@ -90,7 +90,7 @@ FileView_Init(panel* Panel, app_state* State, context Context)
|
||||||
// TODO: :FreePanelMemory
|
// TODO: :FreePanelMemory
|
||||||
file_view_state* FileViewState = PushStruct(&State->Permanent, file_view_state);
|
file_view_state* FileViewState = PushStruct(&State->Permanent, file_view_state);
|
||||||
Panel->StateMemory = StructToData(FileViewState, file_view_state);
|
Panel->StateMemory = StructToData(FileViewState, file_view_state);
|
||||||
FileViewState->FileNamesArena = CreateMemoryArena(Context.ThreadContext.Allocator);
|
FileViewState->FileNamesArena = CreateMemoryArena(Context.ThreadContext.Allocator, "File View - File Names Arena");
|
||||||
|
|
||||||
// TODO(pjs): this shouldn't be stored in permanent
|
// TODO(pjs): this shouldn't be stored in permanent
|
||||||
FileViewState->DisplayDirectory = PushString(&State->Permanent, 1024);
|
FileViewState->DisplayDirectory = PushString(&State->Permanent, 1024);
|
||||||
|
|
|
@ -163,6 +163,44 @@ RenderProfiler_ListVisualization(ui_interface* Interface, ui_widget* Layout, deb
|
||||||
ui_EndList(Interface);
|
ui_EndList(Interface);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
internal void
|
||||||
|
RenderProfiler_MemoryView(ui_interface* Interface, ui_widget* Layout, app_state* State, context Context, gs_memory_arena* Memory)
|
||||||
|
{
|
||||||
|
gs_allocator_debug Debug = *Context.ThreadContext.Allocator.Debug;
|
||||||
|
gs_string TempString = PushString(State->Transient, 256);
|
||||||
|
|
||||||
|
u64 MemFootprint = Debug.TotalAllocSize;
|
||||||
|
u64 AllocCount = Debug.AllocationsCount;
|
||||||
|
PrintF(&TempString, "Total Memory Size: %lld | Allocations: %lld", MemFootprint, AllocCount);
|
||||||
|
ui_Label(Interface, TempString);
|
||||||
|
|
||||||
|
ui_column_spec ColumnWidths[] = {
|
||||||
|
{ UIColumnSize_Fill, 0 },
|
||||||
|
{ UIColumnSize_Fixed,256 },
|
||||||
|
};
|
||||||
|
ui_BeginRow(Interface, 2, &ColumnWidths[0]);
|
||||||
|
{
|
||||||
|
ui_Label(Interface, MakeString("Location"));
|
||||||
|
ui_Label(Interface, MakeString("Alloc Size"));
|
||||||
|
}
|
||||||
|
ui_EndRow(Interface);
|
||||||
|
|
||||||
|
ui_BeginList(Interface, MakeString("Alloc List"), 10, Debug.AllocationsCount);
|
||||||
|
ui_BeginRow(Interface, 2, &ColumnWidths[0]);
|
||||||
|
for (s32 n = 0; n < Debug.AllocationsCount; n++)
|
||||||
|
{
|
||||||
|
gs_debug_allocation A = Debug.Allocations[n];
|
||||||
|
|
||||||
|
PrintF(&TempString, "%S", A.Location);
|
||||||
|
ui_Label(Interface, TempString);
|
||||||
|
|
||||||
|
PrintF(&TempString, "%lld bytes", A.Size);
|
||||||
|
ui_Label(Interface, TempString);
|
||||||
|
}
|
||||||
|
ui_EndRow(Interface);
|
||||||
|
ui_EndList(Interface);
|
||||||
|
}
|
||||||
|
|
||||||
GSMetaTag(panel_render);
|
GSMetaTag(panel_render);
|
||||||
GSMetaTag(panel_type_profiler);
|
GSMetaTag(panel_type_profiler);
|
||||||
internal void
|
internal void
|
||||||
|
@ -234,24 +272,39 @@ ProfilerView_Render(panel* Panel, rect2 PanelBounds, render_command_buffer* Rend
|
||||||
|
|
||||||
ui_BeginRow(&State->Interface, 8);
|
ui_BeginRow(&State->Interface, 8);
|
||||||
{
|
{
|
||||||
if (ui_Button(&State->Interface, MakeString("Scope View")))
|
if (ui_Button(&State->Interface, MakeString("Profiler")))
|
||||||
{
|
{
|
||||||
GlobalDebugServices->Interface.FrameView = FRAME_VIEW_PROFILER;
|
GlobalDebugServices->Interface.FrameView = DebugUI_Profiler;
|
||||||
}
|
}
|
||||||
if (ui_Button(&State->Interface, MakeString("List View")))
|
if (ui_Button(&State->Interface, MakeString("List View")))
|
||||||
{
|
{
|
||||||
GlobalDebugServices->Interface.FrameView = FRAME_VIEW_SCOPE_LIST;
|
GlobalDebugServices->Interface.FrameView = DebugUI_ScopeList;
|
||||||
|
}
|
||||||
|
if (ui_Button(&State->Interface, MakeString("Memory")))
|
||||||
|
{
|
||||||
|
GlobalDebugServices->Interface.FrameView = DebugUI_MemoryView;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ui_EndRow(&State->Interface);
|
ui_EndRow(&State->Interface);
|
||||||
|
|
||||||
if (GlobalDebugServices->Interface.FrameView == FRAME_VIEW_PROFILER)
|
switch (GlobalDebugServices->Interface.FrameView)
|
||||||
|
{
|
||||||
|
case DebugUI_Profiler:
|
||||||
{
|
{
|
||||||
RenderProfiler_ScopeVisualization(&State->Interface, Layout, VisibleFrame, Memory);
|
RenderProfiler_ScopeVisualization(&State->Interface, Layout, VisibleFrame, Memory);
|
||||||
}
|
}break;
|
||||||
else
|
|
||||||
|
case DebugUI_ScopeList:
|
||||||
{
|
{
|
||||||
RenderProfiler_ListVisualization(&State->Interface, Layout, VisibleFrame, Memory);
|
RenderProfiler_ListVisualization(&State->Interface, Layout, VisibleFrame, Memory);
|
||||||
|
}break;
|
||||||
|
|
||||||
|
case DebugUI_MemoryView:
|
||||||
|
{
|
||||||
|
RenderProfiler_MemoryView(&State->Interface, Layout, State, Context, Memory);
|
||||||
|
}break;
|
||||||
|
|
||||||
|
InvalidDefaultCase;
|
||||||
}
|
}
|
||||||
|
|
||||||
ui_PopLayout(&State->Interface, MakeString("Profiler Layout"));
|
ui_PopLayout(&State->Interface, MakeString("Profiler Layout"));
|
||||||
|
|
|
@ -206,7 +206,7 @@ LoadAssembly (assembly_array* Assemblies, led_system* LedSystem, gs_memory_arena
|
||||||
gs_const_string FileName = Substring(Path, IndexOfLastSlash + 1, Path.Length);
|
gs_const_string FileName = Substring(Path, IndexOfLastSlash + 1, Path.Length);
|
||||||
|
|
||||||
assembly* NewAssembly = AssemblyArray_Take(Assemblies);
|
assembly* NewAssembly = AssemblyArray_Take(Assemblies);
|
||||||
NewAssembly->Arena = CreateMemoryArena(Context.ThreadContext.Allocator);
|
NewAssembly->Arena = CreateMemoryArena(Context.ThreadContext.Allocator, "Assembly Arena");
|
||||||
|
|
||||||
parser AssemblyParser = ParseAssemblyFile(NewAssembly, FileName, AssemblyFileText, Scratch);
|
parser AssemblyParser = ParseAssemblyFile(NewAssembly, FileName, AssemblyFileText, Scratch);
|
||||||
if (AssemblyParser.Success)
|
if (AssemblyParser.Success)
|
||||||
|
|
|
@ -100,7 +100,7 @@ AddressedDataBufferList_Create(gs_thread_context TC)
|
||||||
{
|
{
|
||||||
addressed_data_buffer_list Result = {};
|
addressed_data_buffer_list Result = {};
|
||||||
Result.Arena = AllocatorAllocStruct(TC.Allocator, gs_memory_arena);
|
Result.Arena = AllocatorAllocStruct(TC.Allocator, gs_memory_arena);
|
||||||
*Result.Arena = CreateMemoryArena(TC.Allocator);
|
*Result.Arena = CreateMemoryArena(TC.Allocator, "Addressed Data Buffer List Arena");
|
||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ INITIALIZE_APPLICATION(InitializeApplication)
|
||||||
app_state* State = (app_state*)Context.MemoryBase;
|
app_state* State = (app_state*)Context.MemoryBase;
|
||||||
*State = {};
|
*State = {};
|
||||||
|
|
||||||
State->Permanent = CreateMemoryArena(Context.ThreadContext.Allocator);
|
State->Permanent = CreateMemoryArena(Context.ThreadContext.Allocator, "Permanent");
|
||||||
State->Transient = Context.ThreadContext.Transient;
|
State->Transient = Context.ThreadContext.Transient;
|
||||||
State->Assemblies = AssemblyArray_Create(8, &State->Permanent);
|
State->Assemblies = AssemblyArray_Create(8, &State->Permanent);
|
||||||
|
|
||||||
|
@ -91,6 +91,8 @@ INITIALIZE_APPLICATION(InitializeApplication)
|
||||||
Panel_SetType(Hierarchy, &State->PanelSystem, PanelType_AssemblyDebug, State, Context);
|
Panel_SetType(Hierarchy, &State->PanelSystem, PanelType_AssemblyDebug, State, Context);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
State->RunEditor = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
UPDATE_AND_RENDER(UpdateAndRender)
|
UPDATE_AND_RENDER(UpdateAndRender)
|
||||||
|
@ -104,7 +106,10 @@ UPDATE_AND_RENDER(UpdateAndRender)
|
||||||
// incorrect to clear the arena, and then access the memory later.
|
// incorrect to clear the arena, and then access the memory later.
|
||||||
ClearArena(State->Transient);
|
ClearArena(State->Transient);
|
||||||
|
|
||||||
|
if (State->RunEditor)
|
||||||
|
{
|
||||||
Editor_Update(State, Context, InputQueue);
|
Editor_Update(State, Context, InputQueue);
|
||||||
|
}
|
||||||
|
|
||||||
AnimationSystem_Update(&State->AnimationSystem, Context->DeltaTime);
|
AnimationSystem_Update(&State->AnimationSystem, Context->DeltaTime);
|
||||||
if (AnimationSystem_NeedsRender(State->AnimationSystem))
|
if (AnimationSystem_NeedsRender(State->AnimationSystem))
|
||||||
|
@ -123,7 +128,10 @@ UPDATE_AND_RENDER(UpdateAndRender)
|
||||||
State->Assemblies,
|
State->Assemblies,
|
||||||
State->LedSystem);
|
State->LedSystem);
|
||||||
|
|
||||||
|
if (State->RunEditor)
|
||||||
|
{
|
||||||
Editor_Render(State, Context, RenderBuffer);
|
Editor_Render(State, Context, RenderBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
// NOTE(pjs): Building data buffers to be sent out to the sculpture
|
// NOTE(pjs): Building data buffers to be sent out to the sculpture
|
||||||
// This array is used on the platform side to actually send the information
|
// This array is used on the platform side to actually send the information
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
#include "../gs_libs/gs_font.h"
|
#include "../gs_libs/gs_font.h"
|
||||||
#include "foldhaus_log.h"
|
#include "foldhaus_log.h"
|
||||||
|
|
||||||
#include "interface.h"
|
#include "editor/interface.h"
|
||||||
|
|
||||||
#include "engine/foldhaus_network_ordering.h"
|
#include "engine/foldhaus_network_ordering.h"
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@ typedef struct panel panel;
|
||||||
#include "engine/animation/foldhaus_animation_renderer.cpp"
|
#include "engine/animation/foldhaus_animation_renderer.cpp"
|
||||||
|
|
||||||
#include "engine/user_space.h"
|
#include "engine/user_space.h"
|
||||||
#include "blumen_lumen.h"
|
#include "ss_blumen_lumen/blumen_lumen.h"
|
||||||
|
|
||||||
struct app_state
|
struct app_state
|
||||||
{
|
{
|
||||||
|
@ -72,6 +72,8 @@ struct app_state
|
||||||
panel* HotPanel;
|
panel* HotPanel;
|
||||||
|
|
||||||
user_space_desc UserSpaceDesc;
|
user_space_desc UserSpaceDesc;
|
||||||
|
|
||||||
|
bool RunEditor;
|
||||||
};
|
};
|
||||||
|
|
||||||
internal void OpenColorPicker(app_state* State, v4* Address);
|
internal void OpenColorPicker(app_state* State, v4* Address);
|
||||||
|
@ -81,7 +83,7 @@ internal void OpenColorPicker(app_state* State, v4* Address);
|
||||||
#include "engine/user_space.cpp"
|
#include "engine/user_space.cpp"
|
||||||
|
|
||||||
#include "patterns/blumen_patterns.h"
|
#include "patterns/blumen_patterns.h"
|
||||||
#include "blumen_lumen.cpp"
|
#include "ss_blumen_lumen/blumen_lumen.cpp"
|
||||||
|
|
||||||
internal void
|
internal void
|
||||||
EndCurrentOperationMode(app_state* State)
|
EndCurrentOperationMode(app_state* State)
|
||||||
|
|
|
@ -64,8 +64,14 @@ struct debug_frame
|
||||||
collated_scope_record* CollatedScopes;
|
collated_scope_record* CollatedScopes;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define FRAME_VIEW_PROFILER 0
|
enum debug_ui_view
|
||||||
#define FRAME_VIEW_SCOPE_LIST 1
|
{
|
||||||
|
DebugUI_Profiler,
|
||||||
|
DebugUI_ScopeList,
|
||||||
|
DebugUI_MemoryView,
|
||||||
|
|
||||||
|
DebugUI_Count,
|
||||||
|
};
|
||||||
|
|
||||||
struct debug_interface
|
struct debug_interface
|
||||||
{
|
{
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -535,6 +535,12 @@ WinMain (
|
||||||
{
|
{
|
||||||
gs_thread_context ThreadContext = Win32CreateThreadContext();
|
gs_thread_context ThreadContext = Win32CreateThreadContext();
|
||||||
|
|
||||||
|
gs_allocator_debug AllocDebug = {};
|
||||||
|
AllocDebug.AllocationsCountMax = 4096;
|
||||||
|
AllocDebug.Allocations = (gs_debug_allocation*)Win32Alloc(sizeof(gs_debug_allocation) * AllocDebug.AllocationsCountMax, 0);
|
||||||
|
|
||||||
|
ThreadContext.Allocator.Debug = &AllocDebug;
|
||||||
|
|
||||||
gs_file_info A = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium"));
|
gs_file_info A = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium"));
|
||||||
|
|
||||||
gs_file_info B = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium\\"));
|
gs_file_info B = GetFileInfo(ThreadContext.FileHandler, ConstString("C:\\projects\\Lumenarium\\"));
|
||||||
|
@ -556,7 +562,7 @@ WinMain (
|
||||||
Context.MemorySize = MB(64);
|
Context.MemorySize = MB(64);
|
||||||
Context.MemoryBase = (u8*)Win32Alloc(Context.MemorySize, 0);
|
Context.MemoryBase = (u8*)Win32Alloc(Context.MemorySize, 0);
|
||||||
|
|
||||||
gs_memory_arena PlatformPermanent = CreateMemoryArena(Context.ThreadContext.Allocator);
|
gs_memory_arena PlatformPermanent = CreateMemoryArena(Context.ThreadContext.Allocator, "Platform Memory");
|
||||||
|
|
||||||
s64 PerformanceCountFrequency = GetPerformanceFrequency();
|
s64 PerformanceCountFrequency = GetPerformanceFrequency();
|
||||||
s64 LastFrameEnd = GetWallClock();
|
s64 LastFrameEnd = GetWallClock();
|
||||||
|
|
|
@ -204,9 +204,13 @@ Win32SerialArray_Create(gs_thread_context Context)
|
||||||
Win32SerialPortNames = AllocatorAllocArray(Context.Allocator, gs_string, Win32SerialHandlesCountMax);
|
Win32SerialPortNames = AllocatorAllocArray(Context.Allocator, gs_string, Win32SerialHandlesCountMax);
|
||||||
Win32SerialPortFilled = AllocatorAllocArray(Context.Allocator, s32, Win32SerialHandlesCountMax);
|
Win32SerialPortFilled = AllocatorAllocArray(Context.Allocator, s32, Win32SerialHandlesCountMax);
|
||||||
|
|
||||||
|
u64 PortNameSize = 256;
|
||||||
|
u64 PortNameBufferSize = PortNameSize * Win32SerialHandlesCountMax;
|
||||||
|
char* PortNameBuffer = AllocatorAllocArray(Context.Allocator, char, PortNameBufferSize);
|
||||||
for (u32 i = 0; i < Win32SerialHandlesCountMax; i++)
|
for (u32 i = 0; i < Win32SerialHandlesCountMax; i++)
|
||||||
{
|
{
|
||||||
Win32SerialPortNames[i] = AllocatorAllocString(Context.Allocator, 256);
|
char* NameBase = PortNameBuffer + (PortNameSize * i);
|
||||||
|
Win32SerialPortNames[i] = MakeString(NameBase, 0, PortNameSize);
|
||||||
Win32SerialPortFilled[i] = 0;
|
Win32SerialPortFilled[i] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,7 +48,7 @@ Win32CreateThreadContext(gs_memory_arena* Transient = 0)
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Result.Transient = (gs_memory_arena*)AllocatorAlloc(Result.Allocator, sizeof(gs_memory_arena)).Memory;
|
Result.Transient = (gs_memory_arena*)AllocatorAlloc(Result.Allocator, sizeof(gs_memory_arena)).Memory;
|
||||||
*Result.Transient = CreateMemoryArena(Result.Allocator);
|
*Result.Transient = CreateMemoryArena(Result.Allocator, "Tctx Transient");
|
||||||
}
|
}
|
||||||
Result.FileHandler = CreateFileHandler(Win32GetFileInfo,
|
Result.FileHandler = CreateFileHandler(Win32GetFileInfo,
|
||||||
Win32ReadEntireFile,
|
Win32ReadEntireFile,
|
||||||
|
|
|
@ -24,10 +24,8 @@ BlumenLumen_MicListenJob(gs_thread_context* Ctx, u8* UserData)
|
||||||
|
|
||||||
while (*Data->Running)
|
while (*Data->Running)
|
||||||
{
|
{
|
||||||
#if 1
|
|
||||||
if (SocketQueryStatus(Data->SocketManager, Data->ListenSocket))
|
if (SocketQueryStatus(Data->SocketManager, Data->ListenSocket))
|
||||||
{
|
{
|
||||||
// TODO(pjs): Removing this block for now - nothing is wrong with it except that SocketPeek is still blocking for some reason
|
|
||||||
if (SocketPeek(Data->SocketManager, Data->ListenSocket))
|
if (SocketPeek(Data->SocketManager, Data->ListenSocket))
|
||||||
{
|
{
|
||||||
// TODO(pjs): Make this a peek operation
|
// TODO(pjs): Make this a peek operation
|
||||||
|
@ -41,7 +39,6 @@ BlumenLumen_MicListenJob(gs_thread_context* Ctx, u8* UserData)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
while (Data->OutgoingMsgQueue->ReadHead != Data->OutgoingMsgQueue->WriteHead)
|
while (Data->OutgoingMsgQueue->ReadHead != Data->OutgoingMsgQueue->WriteHead)
|
||||||
{
|
{
|
|
@ -1,711 +0,0 @@
|
||||||
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
|
|
||||||
|
|
||||||
Inspired by Intel Approximate Math library, and based on the
|
|
||||||
corresponding algorithms of the cephes math library
|
|
||||||
|
|
||||||
The default is to use the SSE1 version. If you define USE_SSE2 the
|
|
||||||
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
|
|
||||||
not expect any significant performance improvement with SSE2.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Copyright (C) 2007 Julien Pommier
|
|
||||||
|
|
||||||
This software is provided 'as-is', without any express or implied
|
|
||||||
warranty. In no event will the authors be held liable for any damages
|
|
||||||
arising from the use of this software.
|
|
||||||
|
|
||||||
Permission is granted to anyone to use this software for any purpose,
|
|
||||||
including commercial applications, and to alter it and redistribute it
|
|
||||||
freely, subject to the following restrictions:
|
|
||||||
|
|
||||||
1. The origin of this software must not be misrepresented; you must not
|
|
||||||
claim that you wrote the original software. If you use this software
|
|
||||||
in a product, an acknowledgment in the product documentation would be
|
|
||||||
appreciated but is not required.
|
|
||||||
2. Altered source versions must be plainly marked as such, and must not be
|
|
||||||
misrepresented as being the original software.
|
|
||||||
3. This notice may not be removed or altered from any source distribution.
|
|
||||||
|
|
||||||
(this is the zlib license)
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <xmmintrin.h>
|
|
||||||
|
|
||||||
/* yes I know, the top of this file is quite ugly */
|
|
||||||
|
|
||||||
#ifdef _MSC_VER /* visual c++ */
|
|
||||||
# define ALIGN16_BEG __declspec(align(16))
|
|
||||||
# define ALIGN16_END
|
|
||||||
#else /* gcc or icc */
|
|
||||||
# define ALIGN16_BEG
|
|
||||||
# define ALIGN16_END __attribute__((aligned(16)))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* __m128 is ugly to write */
|
|
||||||
typedef __m128 v4sf; // vector of 4 float (sse1)
|
|
||||||
|
|
||||||
#ifdef USE_SSE2
|
|
||||||
# include <emmintrin.h>
|
|
||||||
typedef __m128i v4si; // vector of 4 int (sse2)
|
|
||||||
#else
|
|
||||||
typedef __m64 v2si; // vector of 2 int (mmx)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* declare some SSE constants -- why can't I figure a better way to do that? */
|
|
||||||
#define _PS_CONST(Name, Val) \
|
|
||||||
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
|
|
||||||
#define _PI32_CONST(Name, Val) \
|
|
||||||
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
|
|
||||||
#define _PS_CONST_TYPE(Name, Type, Val) \
|
|
||||||
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
|
|
||||||
|
|
||||||
_PS_CONST(1 , 1.0f);
|
|
||||||
_PS_CONST(0p5, 0.5f);
|
|
||||||
/* the smallest non denormalized float number */
|
|
||||||
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
|
|
||||||
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
|
|
||||||
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
|
|
||||||
|
|
||||||
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
|
|
||||||
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
|
|
||||||
|
|
||||||
_PI32_CONST(1, 1);
|
|
||||||
_PI32_CONST(inv1, ~1);
|
|
||||||
_PI32_CONST(2, 2);
|
|
||||||
_PI32_CONST(4, 4);
|
|
||||||
_PI32_CONST(0x7f, 0x7f);
|
|
||||||
|
|
||||||
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
|
|
||||||
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
|
|
||||||
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
|
|
||||||
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
|
|
||||||
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
|
|
||||||
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
|
|
||||||
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
|
|
||||||
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
|
|
||||||
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
|
|
||||||
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
|
|
||||||
_PS_CONST(cephes_log_q1, -2.12194440e-4);
|
|
||||||
_PS_CONST(cephes_log_q2, 0.693359375);
|
|
||||||
|
|
||||||
#ifndef USE_SSE2
|
|
||||||
typedef union xmm_mm_union {
|
|
||||||
__m128 xmm;
|
|
||||||
__m64 mm[2];
|
|
||||||
} xmm_mm_union;
|
|
||||||
|
|
||||||
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
|
|
||||||
xmm_mm_union u; u.xmm = xmm_; \
|
|
||||||
mm0_ = u.mm[0]; \
|
|
||||||
mm1_ = u.mm[1]; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
|
|
||||||
xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // USE_SSE2
|
|
||||||
|
|
||||||
/* natural logarithm computed for 4 simultaneous float
|
|
||||||
return NaN for x <= 0
|
|
||||||
*/
|
|
||||||
v4sf log_ps(v4sf x) {
|
|
||||||
#ifdef USE_SSE2
|
|
||||||
v4si emm0;
|
|
||||||
#else
|
|
||||||
v2si mm0, mm1;
|
|
||||||
#endif
|
|
||||||
v4sf one = *(v4sf*)_ps_1;
|
|
||||||
|
|
||||||
v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
|
|
||||||
|
|
||||||
x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
|
|
||||||
|
|
||||||
#ifndef USE_SSE2
|
|
||||||
/* part 1: x = frexpf(x, &e); */
|
|
||||||
COPY_XMM_TO_MM(x, mm0, mm1);
|
|
||||||
mm0 = _mm_srli_pi32(mm0, 23);
|
|
||||||
mm1 = _mm_srli_pi32(mm1, 23);
|
|
||||||
#else
|
|
||||||
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
|
|
||||||
#endif
|
|
||||||
/* keep only the fractional part */
|
|
||||||
x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
|
|
||||||
x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
|
|
||||||
|
|
||||||
#ifndef USE_SSE2
|
|
||||||
/* now e=mm0:mm1 contain the really base-2 exponent */
|
|
||||||
mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
|
|
||||||
mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
|
|
||||||
v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
|
|
||||||
_mm_empty(); /* bye bye mmx */
|
|
||||||
#else
|
|
||||||
emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
|
|
||||||
v4sf e = _mm_cvtepi32_ps(emm0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
e = _mm_add_ps(e, one);
|
|
||||||
|
|
||||||
/* part2:
|
|
||||||
if( x < SQRTHF ) {
|
|
||||||
e -= 1;
|
|
||||||
x = x + x - 1.0;
|
|
||||||
} else { x = x - 1.0; }
|
|
||||||
*/
|
|
||||||
v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
|
|
||||||
v4sf tmp = _mm_and_ps(x, mask);
|
|
||||||
x = _mm_sub_ps(x, one);
|
|
||||||
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
|
|
||||||
x = _mm_add_ps(x, tmp);
|
|
||||||
|
|
||||||
|
|
||||||
v4sf z = _mm_mul_ps(x,x);
|
|
||||||
|
|
||||||
v4sf y = *(v4sf*)_ps_cephes_log_p0;
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
|
|
||||||
y = _mm_mul_ps(y, z);
|
|
||||||
|
|
||||||
|
|
||||||
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
|
|
||||||
y = _mm_add_ps(y, tmp);
|
|
||||||
|
|
||||||
|
|
||||||
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
|
|
||||||
y = _mm_sub_ps(y, tmp);
|
|
||||||
|
|
||||||
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
|
|
||||||
x = _mm_add_ps(x, y);
|
|
||||||
x = _mm_add_ps(x, tmp);
|
|
||||||
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
|
|
||||||
_PS_CONST(exp_hi, 88.3762626647949f);
|
|
||||||
_PS_CONST(exp_lo, -88.3762626647949f);
|
|
||||||
|
|
||||||
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
|
|
||||||
_PS_CONST(cephes_exp_C1, 0.693359375);
|
|
||||||
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
|
|
||||||
|
|
||||||
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
|
|
||||||
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
|
|
||||||
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
|
|
||||||
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
|
|
||||||
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
|
|
||||||
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
|
|
||||||
|
|
||||||
v4sf exp_ps(v4sf x) {
|
|
||||||
v4sf tmp = _mm_setzero_ps(), fx;
|
|
||||||
#ifdef USE_SSE2
|
|
||||||
v4si emm0;
|
|
||||||
#else
|
|
||||||
v2si mm0, mm1;
|
|
||||||
#endif
|
|
||||||
v4sf one = *(v4sf*)_ps_1;
|
|
||||||
|
|
||||||
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
|
|
||||||
x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
|
|
||||||
|
|
||||||
/* express exp(x) as exp(g + n*log(2)) */
|
|
||||||
fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
|
|
||||||
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
|
|
||||||
|
|
||||||
/* how to perform a floorf with SSE: just below */
|
|
||||||
#ifndef USE_SSE2
|
|
||||||
/* step 1 : cast to int */
|
|
||||||
tmp = _mm_movehl_ps(tmp, fx);
|
|
||||||
mm0 = _mm_cvttps_pi32(fx);
|
|
||||||
mm1 = _mm_cvttps_pi32(tmp);
|
|
||||||
/* step 2 : cast back to float */
|
|
||||||
tmp = _mm_cvtpi32x2_ps(mm0, mm1);
|
|
||||||
#else
|
|
||||||
emm0 = _mm_cvttps_epi32(fx);
|
|
||||||
tmp = _mm_cvtepi32_ps(emm0);
|
|
||||||
#endif
|
|
||||||
/* if greater, substract 1 */
|
|
||||||
v4sf mask = _mm_cmpgt_ps(tmp, fx);
|
|
||||||
mask = _mm_and_ps(mask, one);
|
|
||||||
fx = _mm_sub_ps(tmp, mask);
|
|
||||||
|
|
||||||
tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
|
|
||||||
v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
|
|
||||||
x = _mm_sub_ps(x, tmp);
|
|
||||||
x = _mm_sub_ps(x, z);
|
|
||||||
|
|
||||||
z = _mm_mul_ps(x,x);
|
|
||||||
|
|
||||||
v4sf y = *(v4sf*)_ps_cephes_exp_p0;
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
|
|
||||||
y = _mm_mul_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
|
|
||||||
y = _mm_mul_ps(y, z);
|
|
||||||
y = _mm_add_ps(y, x);
|
|
||||||
y = _mm_add_ps(y, one);
|
|
||||||
|
|
||||||
/* build 2^n */
|
|
||||||
#ifndef USE_SSE2
|
|
||||||
z = _mm_movehl_ps(z, fx);
|
|
||||||
mm0 = _mm_cvttps_pi32(fx);
|
|
||||||
mm1 = _mm_cvttps_pi32(z);
|
|
||||||
mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
|
|
||||||
mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
|
|
||||||
mm0 = _mm_slli_pi32(mm0, 23);
|
|
||||||
mm1 = _mm_slli_pi32(mm1, 23);
|
|
||||||
|
|
||||||
v4sf pow2n;
|
|
||||||
COPY_MM_TO_XMM(mm0, mm1, pow2n);
|
|
||||||
_mm_empty();
|
|
||||||
#else
|
|
||||||
emm0 = _mm_cvttps_epi32(fx);
|
|
||||||
emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
|
|
||||||
emm0 = _mm_slli_epi32(emm0, 23);
|
|
||||||
v4sf pow2n = _mm_castsi128_ps(emm0);
|
|
||||||
#endif
|
|
||||||
y = _mm_mul_ps(y, pow2n);
|
|
||||||
return y;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Constants shared by sin_ps, cos_ps and sincos_ps.

   minus_cephes_DP1..DP3 are the negated terms of the "extended precision
   modular arithmetic" step x = ((x - y*DP1) - y*DP2) - y*DP3; because they
   are stored negated, that step is written with additions. Their magnitudes
   sum to approximately Pi/4 (0.78515625 + 2.4187e-4 + 3.77e-8). */
_PS_CONST(minus_cephes_DP1, -0.78515625);
|
|
||||||
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
|
|
||||||
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
|
|
||||||
/* Coefficients of the sine polynomial, highest order first. */
_PS_CONST(sincof_p0, -1.9515295891E-4);
|
|
||||||
_PS_CONST(sincof_p1, 8.3321608736E-3);
|
|
||||||
_PS_CONST(sincof_p2, -1.6666654611E-1);
|
|
||||||
/* Coefficients of the cosine polynomial, highest order first. */
_PS_CONST(coscof_p0, 2.443315711809948E-005);
|
|
||||||
_PS_CONST(coscof_p1, -1.388731625493765E-003);
|
|
||||||
_PS_CONST(coscof_p2, 4.166664568298827E-002);
|
|
||||||
/* Scale factor applied to |x| before the integer part is extracted. */
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
|
|
||||||
|
|
||||||
|
|
||||||
/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
|
|
||||||
it runs also on old athlons XPs and the pentium III of your grand
|
|
||||||
mother.
|
|
||||||
|
|
||||||
The code is the exact rewriting of the cephes sinf function.
|
|
||||||
Precision is excellent as long as x < 8192 (I did not bother to
|
|
||||||
take into account the special handling they have for greater values
|
|
||||||
-- it does not return garbage for arguments over 8192, though, but
|
|
||||||
the extra precision is missing).
|
|
||||||
|
|
||||||
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
|
|
||||||
surprising but correct result.
|
|
||||||
|
|
||||||
Performance is also surprisingly good, 1.33 times faster than the
|
|
||||||
macos vsinf SSE2 function, and 1.5 times faster than the
|
|
||||||
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
|
|
||||||
too bad for an SSE1 function (with no special tuning) !
|
|
||||||
However the latter libraries probably have a much better handling of NaN,
|
|
||||||
Inf, denormalized and other special arguments..
|
|
||||||
|
|
||||||
On my core 1 duo, the execution of this function takes approximately 95 cycles.
|
|
||||||
|
|
||||||
From what I have observed on the experiments with Intel AMath lib, switching to an
|
|
||||||
SSE2 version would improve the perf by only 10%.
|
|
||||||
|
|
||||||
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
|
|
||||||
deliver full speed.
|
|
||||||
*/
|
|
||||||
v4sf sin_ps(v4sf x) { // any x
  /* Split the input into its sign bit and its magnitude. The magnitude is
     range-reduced below; the sign is xor-ed back onto the result at the end. */
  v4sf sign_bit = _mm_and_ps(x, *(v4sf*)_ps_sign_mask);
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* y = |x| * 4/Pi */
  v4sf y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  v4sf swap_sign_bit, poly_mask;
#ifdef USE_SSE2
  /* j = (trunc(y) + 1) & ~1 (see the cephes sources); y becomes (float)j */
  v4si j = _mm_cvttps_epi32(y);
  j = _mm_add_epi32(j, *(v4si*)_pi32_1);
  j = _mm_and_si128(j, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(j);

  /* bit 2 of j, moved up to bit 31, is the "swap sign" flag */
  swap_sign_bit = _mm_castsi128_ps(
      _mm_slli_epi32(_mm_and_si128(j, *(v4si*)_pi32_4), 29));

  /* polynomial selection mask: all ones in the lanes where (j & 2) == 0.
     Both polynomials are evaluated for every lane; the mask picks one. */
  poly_mask = _mm_castsi128_ps(
      _mm_cmpeq_epi32(_mm_and_si128(j, *(v4si*)_pi32_2), _mm_setzero_si128()));
#else
  /* Same computation with MMX, two lanes at a time (low pair / high pair). */
  v2si j_lo, j_hi, flip_lo, flip_hi;
  v4sf y_high = _mm_movehl_ps(_mm_setzero_ps(), y);
  j_lo = _mm_cvttps_pi32(y);
  j_hi = _mm_cvttps_pi32(y_high);

  /* j = (j + 1) & ~1 (see the cephes sources) */
  j_lo = _mm_and_si64(_mm_add_pi32(j_lo, *(v2si*)_pi32_1), *(v2si*)_pi32_inv1);
  j_hi = _mm_and_si64(_mm_add_pi32(j_hi, *(v2si*)_pi32_1), *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(j_lo, j_hi);

  /* swap sign flag: bit 2 of j shifted up to bit 31 */
  flip_lo = _mm_slli_pi32(_mm_and_si64(j_lo, *(v2si*)_pi32_4), 29);
  flip_hi = _mm_slli_pi32(_mm_and_si64(j_hi, *(v2si*)_pi32_4), 29);

  /* polynomial selection mask: all ones where (j & 2) == 0 */
  j_lo = _mm_cmpeq_pi32(_mm_and_si64(j_lo, *(v2si*)_pi32_2), _mm_setzero_si64());
  j_hi = _mm_cmpeq_pi32(_mm_and_si64(j_hi, *(v2si*)_pi32_2), _mm_setzero_si64());

  COPY_MM_TO_XMM(flip_lo, flip_hi, swap_sign_bit);
  COPY_MM_TO_XMM(j_lo, j_hi, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3;
     (the constants are stored negated, hence the additions) */
  v4sf step1 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP1);
  v4sf step2 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP2);
  v4sf step3 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP3);
  x = _mm_add_ps(x, step1);
  x = _mm_add_ps(x, step2);
  x = _mm_add_ps(x, step3);

  v4sf z = _mm_mul_ps(x, x);

  /* Cosine polynomial:
     ((coscof_p0*z + coscof_p1)*z + coscof_p2)*z*z - 0.5*z + 1 */
  v4sf cos_poly = _mm_mul_ps(*(v4sf*)_ps_coscof_p0, z);
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_coscof_p1);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_coscof_p2);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_sub_ps(cos_poly, _mm_mul_ps(z, *(v4sf*)_ps_0p5));
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_1);

  /* Sine polynomial:
     ((sincof_p0*z + sincof_p1)*z + sincof_p2)*z*x + x */
  v4sf sin_poly = _mm_mul_ps(*(v4sf*)_ps_sincof_p0, z);
  sin_poly = _mm_add_ps(sin_poly, *(v4sf*)_ps_sincof_p1);
  sin_poly = _mm_mul_ps(sin_poly, z);
  sin_poly = _mm_add_ps(sin_poly, *(v4sf*)_ps_sincof_p2);
  sin_poly = _mm_mul_ps(sin_poly, z);
  sin_poly = _mm_mul_ps(sin_poly, x);
  sin_poly = _mm_add_ps(sin_poly, x);

  /* Per lane, keep the sine polynomial where poly_mask is set and the
     cosine polynomial elsewhere; the unselected one is zeroed before the add. */
  v4sf picked_sin = _mm_and_ps(poly_mask, sin_poly);
  v4sf picked_cos = _mm_andnot_ps(poly_mask, cos_poly);
  v4sf result = _mm_add_ps(picked_cos, picked_sin);

  /* update the sign */
  return _mm_xor_ps(result, sign_bit);
}
|
|
||||||
|
|
||||||
/* almost the same as sin_ps */
|
|
||||||
v4sf cos_ps(v4sf x) { // any x
  /* Only the magnitude of x is used; the input sign is discarded. */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* y = |x| * 4/Pi */
  v4sf y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  v4sf sign_bit, poly_mask;
#ifdef USE_SSE2
  /* j = (trunc(y) + 1) & ~1 (see the cephes sources); y becomes (float)j */
  v4si j = _mm_cvttps_epi32(y);
  j = _mm_add_epi32(j, *(v4si*)_pi32_1);
  j = _mm_and_si128(j, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(j);

  /* everything below works on j - 2 */
  j = _mm_sub_epi32(j, *(v4si*)_pi32_2);

  /* sign flag: (~j & 4) shifted up to bit 31 */
  sign_bit = _mm_castsi128_ps(
      _mm_slli_epi32(_mm_andnot_si128(j, *(v4si*)_pi32_4), 29));

  /* polynomial selection mask: all ones in the lanes where (j & 2) == 0 */
  poly_mask = _mm_castsi128_ps(
      _mm_cmpeq_epi32(_mm_and_si128(j, *(v4si*)_pi32_2), _mm_setzero_si128()));
#else
  /* Same computation with MMX, two lanes at a time (low pair / high pair). */
  v2si j_lo, j_hi, sign_lo, sign_hi;
  v4sf y_high = _mm_movehl_ps(_mm_setzero_ps(), y);
  j_lo = _mm_cvttps_pi32(y);
  j_hi = _mm_cvttps_pi32(y_high);

  /* j = (j + 1) & ~1 (see the cephes sources) */
  j_lo = _mm_and_si64(_mm_add_pi32(j_lo, *(v2si*)_pi32_1), *(v2si*)_pi32_inv1);
  j_hi = _mm_and_si64(_mm_add_pi32(j_hi, *(v2si*)_pi32_1), *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(j_lo, j_hi);

  /* everything below works on j - 2 */
  j_lo = _mm_sub_pi32(j_lo, *(v2si*)_pi32_2);
  j_hi = _mm_sub_pi32(j_hi, *(v2si*)_pi32_2);

  /* sign flag: (~j & 4) shifted up to bit 31 */
  sign_lo = _mm_slli_pi32(_mm_andnot_si64(j_lo, *(v2si*)_pi32_4), 29);
  sign_hi = _mm_slli_pi32(_mm_andnot_si64(j_hi, *(v2si*)_pi32_4), 29);

  /* polynomial selection mask: all ones where (j & 2) == 0 */
  j_lo = _mm_cmpeq_pi32(_mm_and_si64(j_lo, *(v2si*)_pi32_2), _mm_setzero_si64());
  j_hi = _mm_cmpeq_pi32(_mm_and_si64(j_hi, *(v2si*)_pi32_2), _mm_setzero_si64());

  COPY_MM_TO_XMM(sign_lo, sign_hi, sign_bit);
  COPY_MM_TO_XMM(j_lo, j_hi, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3;
     (the constants are stored negated, hence the additions) */
  v4sf step1 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP1);
  v4sf step2 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP2);
  v4sf step3 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP3);
  x = _mm_add_ps(x, step1);
  x = _mm_add_ps(x, step2);
  x = _mm_add_ps(x, step3);

  v4sf z = _mm_mul_ps(x, x);

  /* Cosine polynomial:
     ((coscof_p0*z + coscof_p1)*z + coscof_p2)*z*z - 0.5*z + 1 */
  v4sf cos_poly = _mm_mul_ps(*(v4sf*)_ps_coscof_p0, z);
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_coscof_p1);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_coscof_p2);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_sub_ps(cos_poly, _mm_mul_ps(z, *(v4sf*)_ps_0p5));
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_1);

  /* Sine polynomial:
     ((sincof_p0*z + sincof_p1)*z + sincof_p2)*z*x + x */
  v4sf sin_poly = _mm_mul_ps(*(v4sf*)_ps_sincof_p0, z);
  sin_poly = _mm_add_ps(sin_poly, *(v4sf*)_ps_sincof_p1);
  sin_poly = _mm_mul_ps(sin_poly, z);
  sin_poly = _mm_add_ps(sin_poly, *(v4sf*)_ps_sincof_p2);
  sin_poly = _mm_mul_ps(sin_poly, z);
  sin_poly = _mm_mul_ps(sin_poly, x);
  sin_poly = _mm_add_ps(sin_poly, x);

  /* Per lane, keep the sine polynomial where poly_mask is set and the
     cosine polynomial elsewhere; the unselected one is zeroed before the add. */
  v4sf picked_sin = _mm_and_ps(poly_mask, sin_poly);
  v4sf picked_cos = _mm_andnot_ps(poly_mask, cos_poly);
  v4sf result = _mm_add_ps(picked_cos, picked_sin);

  /* update the sign */
  return _mm_xor_ps(result, sign_bit);
}
|
|
||||||
|
|
||||||
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
|
|
||||||
it is almost as fast, and gives you a free cosine with your sine */
|
|
||||||
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  /* Sine keeps the sign of the input (xor-ed back at the end); both results
     are computed from the magnitude. */
  v4sf sign_bit_sin = _mm_and_ps(x, *(v4sf*)_ps_sign_mask);
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* y = |x| * 4/Pi */
  v4sf y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

  v4sf swap_sign_bit_sin, sign_bit_cos, poly_mask;
#ifdef USE_SSE2
  /* j = (trunc(y) + 1) & ~1 (see the cephes sources); y becomes (float)j */
  v4si j = _mm_cvttps_epi32(y);
  j = _mm_add_epi32(j, *(v4si*)_pi32_1);
  j = _mm_and_si128(j, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(j);

  /* swap sign flag for the sine: bit 2 of j shifted up to bit 31 */
  swap_sign_bit_sin = _mm_castsi128_ps(
      _mm_slli_epi32(_mm_and_si128(j, *(v4si*)_pi32_4), 29));

  /* polynomial selection mask for the sine: all ones where (j & 2) == 0 */
  poly_mask = _mm_castsi128_ps(
      _mm_cmpeq_epi32(_mm_and_si128(j, *(v4si*)_pi32_2), _mm_setzero_si128()));

  /* sign flag for the cosine: (~(j - 2) & 4) shifted up to bit 31 */
  v4si j_cos = _mm_sub_epi32(j, *(v4si*)_pi32_2);
  sign_bit_cos = _mm_castsi128_ps(
      _mm_slli_epi32(_mm_andnot_si128(j_cos, *(v4si*)_pi32_4), 29));
#else
  /* Same computation with MMX, two lanes at a time (low pair / high pair). */
  v2si j_lo, j_hi, flip_lo, flip_hi, mask_lo, mask_hi, cos_lo, cos_hi;
  v4sf y_high = _mm_movehl_ps(_mm_setzero_ps(), y);
  j_lo = _mm_cvttps_pi32(y);
  j_hi = _mm_cvttps_pi32(y_high);

  /* j = (j + 1) & ~1 (see the cephes sources) */
  j_lo = _mm_and_si64(_mm_add_pi32(j_lo, *(v2si*)_pi32_1), *(v2si*)_pi32_inv1);
  j_hi = _mm_and_si64(_mm_add_pi32(j_hi, *(v2si*)_pi32_1), *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(j_lo, j_hi);

  /* swap sign flag for the sine: bit 2 of j shifted up to bit 31 */
  flip_lo = _mm_slli_pi32(_mm_and_si64(j_lo, *(v2si*)_pi32_4), 29);
  flip_hi = _mm_slli_pi32(_mm_and_si64(j_hi, *(v2si*)_pi32_4), 29);

  /* polynomial selection mask for the sine: all ones where (j & 2) == 0 */
  mask_lo = _mm_cmpeq_pi32(_mm_and_si64(j_lo, *(v2si*)_pi32_2), _mm_setzero_si64());
  mask_hi = _mm_cmpeq_pi32(_mm_and_si64(j_hi, *(v2si*)_pi32_2), _mm_setzero_si64());

  /* sign flag for the cosine: (~(j - 2) & 4) shifted up to bit 31 */
  cos_lo = _mm_sub_pi32(j_lo, *(v2si*)_pi32_2);
  cos_hi = _mm_sub_pi32(j_hi, *(v2si*)_pi32_2);
  cos_lo = _mm_slli_pi32(_mm_andnot_si64(cos_lo, *(v2si*)_pi32_4), 29);
  cos_hi = _mm_slli_pi32(_mm_andnot_si64(cos_hi, *(v2si*)_pi32_4), 29);

  COPY_MM_TO_XMM(flip_lo, flip_hi, swap_sign_bit_sin);
  COPY_MM_TO_XMM(mask_lo, mask_hi, poly_mask);
  COPY_MM_TO_XMM(cos_lo, cos_hi, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif
  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3;
     (the constants are stored negated, hence the additions) */
  v4sf step1 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP1);
  v4sf step2 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP2);
  v4sf step3 = _mm_mul_ps(y, *(v4sf*)_ps_minus_cephes_DP3);
  x = _mm_add_ps(x, step1);
  x = _mm_add_ps(x, step2);
  x = _mm_add_ps(x, step3);

  v4sf z = _mm_mul_ps(x, x);

  /* Cosine polynomial:
     ((coscof_p0*z + coscof_p1)*z + coscof_p2)*z*z - 0.5*z + 1 */
  v4sf cos_poly = _mm_mul_ps(*(v4sf*)_ps_coscof_p0, z);
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_coscof_p1);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_coscof_p2);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_mul_ps(cos_poly, z);
  cos_poly = _mm_sub_ps(cos_poly, _mm_mul_ps(z, *(v4sf*)_ps_0p5));
  cos_poly = _mm_add_ps(cos_poly, *(v4sf*)_ps_1);

  /* Sine polynomial:
     ((sincof_p0*z + sincof_p1)*z + sincof_p2)*z*x + x */
  v4sf sin_poly = _mm_mul_ps(*(v4sf*)_ps_sincof_p0, z);
  sin_poly = _mm_add_ps(sin_poly, *(v4sf*)_ps_sincof_p1);
  sin_poly = _mm_mul_ps(sin_poly, z);
  sin_poly = _mm_add_ps(sin_poly, *(v4sf*)_ps_sincof_p2);
  sin_poly = _mm_mul_ps(sin_poly, z);
  sin_poly = _mm_mul_ps(sin_poly, x);
  sin_poly = _mm_add_ps(sin_poly, x);

  /* For the sine output each lane takes the polynomial chosen by poly_mask.
     Subtracting that selected part from each polynomial leaves the other
     choice, which is what the cosine output takes. */
  v4sf sin_from_sinpoly = _mm_and_ps(poly_mask, sin_poly);
  v4sf sin_from_cospoly = _mm_andnot_ps(poly_mask, cos_poly);
  v4sf cos_from_sinpoly = _mm_sub_ps(sin_poly, sin_from_sinpoly);
  v4sf cos_from_cospoly = _mm_sub_ps(cos_poly, sin_from_cospoly);

  v4sf sin_abs = _mm_add_ps(sin_from_cospoly, sin_from_sinpoly);
  v4sf cos_abs = _mm_add_ps(cos_from_cospoly, cos_from_sinpoly);

  /* update the sign */
  *s = _mm_xor_ps(sin_abs, sign_bit_sin);
  *c = _mm_xor_ps(cos_abs, sign_bit_cos);
}
|
|
||||||
|
|
|
@ -1,360 +0,0 @@
|
||||||
/*
|
|
||||||
sse_mathfun_extension.h - zlib license
|
|
||||||
Written by Tolga Mizrak 2016
|
|
||||||
Extension of sse_mathfun.h, which is written by Julien Pommier
|
|
||||||
|
|
||||||
Based on the corresponding algorithms of the cephes math library
|
|
||||||
|
|
||||||
This is written as an extension to sse_mathfun.h instead of modifying it, just because I didn't want
|
|
||||||
to maintain a modified version of the original library. This way switching to a newer version of the
|
|
||||||
library won't be a hassle.
|
|
||||||
|
|
||||||
Note that non SSE2 implementations of tan_ps, atan_ps, cot_ps and atan2_ps are not implemented yet.
|
|
||||||
As such, currently you need to #define USE_SSE2 to compile.
|
|
||||||
|
|
||||||
With tan_ps, cot_ps you get good precision on input ranges that are further away from the domain
|
|
||||||
borders (-PI/2, PI/2 for tan and 0, PI for cot). See the results on the deviations for these
|
|
||||||
functions on my machine:
|
|
||||||
checking tan on [-0.25*Pi, 0.25*Pi]
|
|
||||||
max deviation from tanf(x): 1.19209e-07 at 0.250000006957*Pi, max deviation from cephes_tan(x):
|
|
||||||
5.96046e-08
|
|
||||||
->> precision OK for the tan_ps <<-
|
|
||||||
|
|
||||||
checking tan on [-0.49*Pi, 0.49*Pi]
|
|
||||||
max deviation from tanf(x): 3.8147e-06 at -0.490000009841*Pi, max deviation from cephes_tan(x):
|
|
||||||
9.53674e-07
|
|
||||||
->> precision OK for the tan_ps <<-
|
|
||||||
|
|
||||||
checking cot on [0.2*Pi, 0.7*Pi]
|
|
||||||
max deviation from cotf(x): 1.19209e-07 at 0.204303119606*Pi, max deviation from cephes_cot(x):
|
|
||||||
1.19209e-07
|
|
||||||
->> precision OK for the cot_ps <<-
|
|
||||||
|
|
||||||
checking cot on [0.01*Pi, 0.99*Pi]
|
|
||||||
max deviation from cotf(x): 3.8147e-06 at 0.987876517942*Pi, max deviation from cephes_cot(x):
|
|
||||||
9.53674e-07
|
|
||||||
->> precision OK for the cot_ps <<-
|
|
||||||
|
|
||||||
With atan_ps and atan2_ps you get pretty good precision, atan_ps max deviation is < 2e-7 and
|
|
||||||
atan2_ps max deviation is < 2.5e-7
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Copyright (C) 2016 Tolga Mizrak
|
|
||||||
|
|
||||||
This software is provided 'as-is', without any express or implied
|
|
||||||
warranty. In no event will the authors be held liable for any damages
|
|
||||||
arising from the use of this software.
|
|
||||||
|
|
||||||
Permission is granted to anyone to use this software for any purpose,
|
|
||||||
including commercial applications, and to alter it and redistribute it
|
|
||||||
freely, subject to the following restrictions:
|
|
||||||
|
|
||||||
1. The origin of this software must not be misrepresented; you must not
|
|
||||||
claim that you wrote the original software. If you use this software
|
|
||||||
in a product, an acknowledgment in the product documentation would be
|
|
||||||
appreciated but is not required.
|
|
||||||
2. Altered source versions must be plainly marked as such, and must not be
|
|
||||||
misrepresented as being the original software.
|
|
||||||
3. This notice may not be removed or altered from any source distribution.
|
|
||||||
|
|
||||||
(this is the zlib license)
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#ifndef _SSE_MATHFUN_EXTENSION_H_INCLUDED_
|
|
||||||
#define _SSE_MATHFUN_EXTENSION_H_INCLUDED_
|
|
||||||
|
|
||||||
#ifndef USE_SSE2
|
|
||||||
#error sse1 & mmx version not implemented
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#pragma warning( push )
|
|
||||||
/* warning C4838: conversion from 'double' to 'const float' requires a narrowing conversion */
|
|
||||||
#pragma warning( disable : 4838 )
|
|
||||||
/* warning C4305: 'initializing': truncation from 'double' to 'const float' */
|
|
||||||
#pragma warning( disable : 4305 )
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "sse_mathfun.h"
|
|
||||||
|
|
||||||
/* Extra packed constants for the functions in this header.
   _ps_0 is the comparison operand in atan2_ps; _ps_2 is not referenced in
   the visible part of this file. */
_PS_CONST( 0, 0 );
|
|
||||||
_PS_CONST( 2, 2 );
|
|
||||||
/* NOTE(review): the name says "neg1" but the value passed is 1 (not -1), and
   nothing in the visible part of this file uses it -- confirm what was
   intended before relying on it. */
_PI32_CONST( neg1, 1 );
|
|
||||||
|
|
||||||
/* Coefficients of the polynomial evaluated in tancot_ps, highest order
   first; the polynomial is in zz = z*z and is finished as poly*zz*z + z. */
_PS_CONST( tancof_p0, 9.38540185543E-3 );
|
|
||||||
_PS_CONST( tancof_p1, 3.11992232697E-3 );
|
|
||||||
_PS_CONST( tancof_p2, 2.44301354525E-2 );
|
|
||||||
_PS_CONST( tancof_p3, 5.34112807005E-2 );
|
|
||||||
_PS_CONST( tancof_p4, 1.33387994085E-1 );
|
|
||||||
_PS_CONST( tancof_p5, 3.33331568548E-1 );
|
|
||||||
|
|
||||||
/* Not referenced by tancot_ps as written in this file. */
_PS_CONST( tancot_eps, 1.0e-4 );
|
|
||||||
|
|
||||||
/* Shared implementation of tan_ps (cotFlag == 0) and cot_ps (cotFlag != 0)
   for four packed floats. Only an SSE2 code path exists. */
v4sf tancot_ps( v4sf x, int cotFlag )
{
    /* Split the input into sign bit and magnitude; the sign is xor-ed back
       onto the result at the end. */
    v4sf sign_bit = _mm_and_ps( x, *(v4sf*)_ps_sign_mask );
    x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );

    /* y = |x| * 4/Pi */
    v4sf y = _mm_mul_ps( x, *(v4sf*)_ps_cephes_FOPI );

#ifdef USE_SSE2
    /* j = (trunc(y) + 1) & ~1 (see the cephes sources); y becomes (float)j */
    v4si j = _mm_cvttps_epi32( y );
    j = _mm_add_epi32( j, *(v4si*)_pi32_1 );
    j = _mm_and_si128( j, *(v4si*)_pi32_inv1 );
    y = _mm_cvtepi32_ps( j );

    /* all ones in the lanes where (j & 2) == 0 */
    v4sf poly_mask = _mm_castsi128_ps(
        _mm_cmpeq_epi32( _mm_and_si128( j, *(v4si*)_pi32_2 ), _mm_setzero_si128() ) );
#endif

    /* The magic pass: "Extended precision modular arithmetic"
       z = ((x - y * DP1) - y * DP2) - y * DP3;
       (the constants are stored negated, hence the additions) */
    v4sf step1 = _mm_mul_ps( y, *(v4sf*)_ps_minus_cephes_DP1 );
    v4sf step2 = _mm_mul_ps( y, *(v4sf*)_ps_minus_cephes_DP2 );
    v4sf step3 = _mm_mul_ps( y, *(v4sf*)_ps_minus_cephes_DP3 );
    v4sf z = _mm_add_ps( x, step1 );
    z = _mm_add_ps( z, step2 );
    z = _mm_add_ps( z, step3 );

    v4sf zz = _mm_mul_ps( z, z );

    /* Horner evaluation in zz, then poly * zz * z + z */
    v4sf poly = _mm_mul_ps( *(v4sf*)_ps_tancof_p0, zz );
    poly = _mm_add_ps( poly, *(v4sf*)_ps_tancof_p1 );
    poly = _mm_mul_ps( poly, zz );
    poly = _mm_add_ps( poly, *(v4sf*)_ps_tancof_p2 );
    poly = _mm_mul_ps( poly, zz );
    poly = _mm_add_ps( poly, *(v4sf*)_ps_tancof_p3 );
    poly = _mm_mul_ps( poly, zz );
    poly = _mm_add_ps( poly, *(v4sf*)_ps_tancof_p4 );
    poly = _mm_mul_ps( poly, zz );
    poly = _mm_add_ps( poly, *(v4sf*)_ps_tancof_p5 );
    poly = _mm_mul_ps( poly, zz );
    poly = _mm_mul_ps( poly, z );
    poly = _mm_add_ps( poly, z );

    /* when_even is used in lanes where (j & 2) == 0, when_odd elsewhere.
       A true division is used rather than _mm_rcp_ps, which loses too much
       precision here. */
    v4sf when_even;
    v4sf when_odd;
    if( cotFlag ) {
        when_odd  = _mm_xor_ps( poly, *(v4sf*)_ps_sign_mask );   /* -poly   */
        when_even = _mm_div_ps( *(v4sf*)_ps_1, poly );           /* 1/poly  */
    } else {
        when_even = poly;                                        /* poly    */
        when_odd  = _mm_div_ps( *(v4sf*)_ps_1, poly );
        when_odd  = _mm_xor_ps( when_odd, *(v4sf*)_ps_sign_mask ); /* -1/poly */
    }

    /* select the correct result per lane */
    v4sf result = _mm_or_ps( _mm_and_ps( poly_mask, when_even ),
                             _mm_andnot_ps( poly_mask, when_odd ) );

    /* update the sign */
    return _mm_xor_ps( result, sign_bit );
}
|
|
||||||
|
|
||||||
/* Tangent of four packed floats: tancot_ps with cotFlag == 0. */
v4sf tan_ps( v4sf x ) { return tancot_ps( x, 0 ); }
|
|
||||||
|
|
||||||
/* Cotangent of four packed floats: tancot_ps with cotFlag == 1. */
v4sf cot_ps( v4sf x ) { return tancot_ps( x, 1 ); }
|
|
||||||
|
|
||||||
/* Range-reduction thresholds that atan_ps compares |x| against. */
_PS_CONST( atanrange_hi, 2.414213562373095 );
|
|
||||||
_PS_CONST( atanrange_lo, 0.4142135623730950 );
|
|
||||||
/* Scalar Pi and Pi/2; PIF is returned by the scalar atan2_ref. */
const float PIF = 3.141592653589793238;
|
|
||||||
const float PIO2F = 1.5707963267948966192;
|
|
||||||
/* Packed Pi, Pi/2 and Pi/4 used by atan_ps and atan2_ps. */
_PS_CONST( cephes_PIF, 3.141592653589793238 );
|
|
||||||
_PS_CONST( cephes_PIO2F, 1.5707963267948966192 );
|
|
||||||
_PS_CONST( cephes_PIO4F, 0.7853981633974483096 );
|
|
||||||
|
|
||||||
/* Magnitudes of the atan_ps polynomial coefficients, highest order first;
   atan_ps applies them with alternating signs (+p0, -p1, +p2, -p3). */
_PS_CONST( atancof_p0, 8.05374449538e-2 );
|
|
||||||
_PS_CONST( atancof_p1, 1.38776856032E-1 );
|
|
||||||
_PS_CONST( atancof_p2, 1.99777106478E-1 );
|
|
||||||
_PS_CONST( atancof_p3, 3.33329491539E-1 );
|
|
||||||
|
|
||||||
/* Arctangent of four packed floats. |x| is reduced to a small argument by
   one of three cases (selected per lane with masks, all computed for every
   lane), a polynomial is evaluated on the reduced argument, and the sign of
   the input is restored at the end. */
v4sf atan_ps( v4sf x )
{
    /* Split the input into sign bit and magnitude. */
    v4sf sign_bit = _mm_and_ps( x, *(v4sf*)_ps_sign_mask );
    x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );

    v4sf y;
    /* range reduction, init x and y depending on range */
#ifdef USE_SSE2
    /* is_large: x > 2.414213562373095
       is_mid:   x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
    v4sf is_large  = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_hi );
    v4sf above_lo  = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_lo );
    v4sf is_mid    = _mm_andnot_ps( is_large, above_lo );
    v4sf is_reduced = _mm_or_ps( is_large, is_mid );

    /* large lanes: argument -( 1.0/x ), base angle Pi/2 */
    v4sf arg_large = _mm_div_ps( *(v4sf*)_ps_1, x );
    arg_large = _mm_xor_ps( arg_large, *(v4sf*)_ps_sign_mask );
    v4sf base_large = _mm_and_ps( is_large, *(v4sf*)_ps_cephes_PIO2F );

    /* mid lanes: argument (x-1.0)/(x+1.0), base angle Pi/4 */
    v4sf numer = _mm_sub_ps( x, *(v4sf*)_ps_1 );
    v4sf denom = _mm_add_ps( x, *(v4sf*)_ps_1 );
    v4sf arg_mid = _mm_div_ps( numer, denom );
    v4sf base_mid = _mm_and_ps( is_mid, *(v4sf*)_ps_cephes_PIO4F );

    /* Blend: the two masks are disjoint, so the or-ed value is non-zero only
       in reduced lanes; the remaining lanes keep x unchanged. */
    v4sf reduced = _mm_or_ps( _mm_and_ps( is_mid, arg_mid ),
                              _mm_and_ps( is_large, arg_large ) );
    x = _mm_or_ps( reduced, _mm_andnot_ps( is_reduced, x ) );

    y = _mm_or_ps( base_large, base_mid );
#else
#error sse1 & mmx version not implemented
#endif

    /* (((p0*zz - p1)*zz + p2)*zz - p3) * zz * x + x */
    v4sf zz = _mm_mul_ps( x, x );
    v4sf acc = _mm_mul_ps( *(v4sf*)_ps_atancof_p0, zz );
    acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p1 );
    acc = _mm_mul_ps( acc, zz );
    acc = _mm_add_ps( acc, *(v4sf*)_ps_atancof_p2 );
    acc = _mm_mul_ps( acc, zz );
    acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p3 );
    acc = _mm_mul_ps( acc, zz );
    acc = _mm_mul_ps( acc, x );
    acc = _mm_add_ps( acc, x );

    /* base angle + polynomial, then restore the sign */
    y = _mm_add_ps( y, acc );
    return _mm_xor_ps( y, sign_bit );
}
|
|
||||||
|
|
||||||
/* Four-lane atan2( y, x ) built on atan_ps. Per lane:

     x == 0, y == 0  ->  0
     x == 0, y != 0  ->  +Pi/2, or -Pi/2 when y < 0
     y == 0, x <  0  ->  Pi
     otherwise       ->  atan( y / x ), plus Pi when x < 0 and y >= 0,
                         minus Pi when x < 0 and y < 0

   Every candidate result is computed for all lanes and the final value is
   assembled with bit masks, so each candidate must be zero in the lanes it
   does not own.

   Fix relative to the previous version: the x == 0 && y == 0 lane used to
   receive both Pi (the Pi mask tested x <= 0) and the bits of atan( 0/0 )
   (the atan result was only cleared where y != 0), so it returned garbage
   instead of the 0 that the scalar atan2_ref returns. All other lanes
   produce the same bits as before. */
v4sf atan2_ps( v4sf y, v4sf x )
{
    v4sf zero   = *(v4sf*)_ps_0;
    v4sf x_eq_0 = _mm_cmpeq_ps( x, zero );
    v4sf x_lt_0 = _mm_cmplt_ps( x, zero );
    v4sf y_eq_0 = _mm_cmpeq_ps( y, zero );
    v4sf y_lt_0 = _mm_cmplt_ps( y, zero );

    /* x == 0 && y != 0: +-Pi/2 carrying the sign of y */
    v4sf pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 );
    v4sf pio2_sign = _mm_and_ps( y_lt_0, *(v4sf*)_ps_sign_mask );
    v4sf pio2_result = _mm_xor_ps( *(v4sf*)_ps_cephes_PIO2F, pio2_sign );
    pio2_result = _mm_and_ps( pio2_mask, pio2_result );

    /* y == 0 && x < 0: Pi. Strictly x < 0, so that (0, 0) is excluded. */
    v4sf pi_mask = _mm_and_ps( y_eq_0, x_lt_0 );
    v4sf pi_result = _mm_and_ps( pi_mask, *(v4sf*)_ps_cephes_PIF );

    /* quadrant offset for the general case: +-Pi when x < 0, else 0 */
    v4sf offset_sign = _mm_and_ps( x_lt_0, y_lt_0 );
    offset_sign = _mm_and_ps( offset_sign, *(v4sf*)_ps_sign_mask );
    v4sf offset = _mm_xor_ps( *(v4sf*)_ps_cephes_PIF, offset_sign );
    offset = _mm_and_ps( x_lt_0, offset );

    v4sf arg = _mm_div_ps( y, x );
    v4sf atan_result = atan_ps( arg );
    atan_result = _mm_add_ps( atan_result, offset );

    /* y / x is +-inf or NaN wherever x == 0; drop the atan result in all of
       those lanes (this covers both the Pi/2 lanes and the (0, 0) lane). */
    atan_result = _mm_andnot_ps( x_eq_0, atan_result );

    /* In the y == 0 && x < 0 lanes atan_result is itself Pi (atan of a zero
       plus the Pi offset), so or-ing it with pi_result is harmless. */
    v4sf result = _mm_or_ps( pio2_result, atan_result );
    result = _mm_or_ps( result, pi_result );

    return result;
}
|
|
||||||
|
|
||||||
/* for convenience of calling simd sqrt */
|
|
||||||
float sqrt_ps( float x )
{
    /* broadcast x to all four lanes, take the packed square root, and
       return lane 0 */
    return _mm_cvtss_f32( _mm_sqrt_ps( _mm_set1_ps( x ) ) );
}
|
|
||||||
/* Scalar convenience wrapper around the packed reciprocal square root
   instruction, which returns an approximation rather than an exactly
   rounded result. */
float rsqrt_ps( float x )
{
    /* broadcast x to all four lanes, apply _mm_rsqrt_ps, return lane 0 */
    return _mm_cvtss_f32( _mm_rsqrt_ps( _mm_set1_ps( x ) ) );
}
|
|
||||||
|
|
||||||
/* atan2 implementation using atan, used as a reference to implement atan2_ps */
|
|
||||||
float atan2_ref( float y, float x )
|
|
||||||
{
|
|
||||||
if( x == 0.0f ) {
|
|
||||||
if( y == 0.0f ) {
|
|
||||||
return 0.0f;
|
|
||||||
}
|
|
||||||
float result = _ps_cephes_PIO2F[0];
|
|
||||||
if( y < 0.0f ) {
|
|
||||||
result = -result;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( y == 0.0f ) {
|
|
||||||
if( x > 0.0f ) {
|
|
||||||
return 0.0f;
|
|
||||||
}
|
|
||||||
return PIF;
|
|
||||||
}
|
|
||||||
|
|
||||||
float offset = 0;
|
|
||||||
if( x < 0.0f ) {
|
|
||||||
offset = PIF;
|
|
||||||
if( y < 0.0f ) {
|
|
||||||
offset = -offset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
v4sf val = _mm_set_ps1( y / x );
|
|
||||||
val = atan_ps( val );
|
|
||||||
return offset + _mm_cvtss_f32( val );
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#pragma warning( pop )
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -2424,12 +2424,37 @@ CreateAllocator_(allocator_allocate* Alloc, allocator_free* Free)
|
||||||
}
|
}
|
||||||
#define CreateAllocator(a, f) CreateAllocator_((allocator_allocate*)(a), (allocator_free*)(f))
|
#define CreateAllocator(a, f) CreateAllocator_((allocator_allocate*)(a), (allocator_free*)(f))
|
||||||
|
|
||||||
|
// Records one allocation of Size bytes in the allocator's debug state.
//   Debug    - debug bookkeeping to update
//   Size     - number of bytes to account for
//   Location - call-site string; only the part selected by GetStringAfter
//              from the last '\\' or '/' is stored (intended to drop the
//              directory portion of the path)
internal void
AllocatorDebug_PushAlloc(gs_allocator_debug* Debug, u64 Size, char* Location)
{
    // The running total is updated for every call, whether or not the
    // allocation also fits in the log below.
    Debug->TotalAllocSize += Size;
    
    // NOTE(pjs): I don't want this debug procedure to be the reason the
    // application crashes, so once the log is full new entries are dropped.
    if (Debug->AllocationsCount >= Debug->AllocationsCountMax)
    {
        return;
    }
    
    gs_const_string FullLocation = ConstString(Location);
    s64 SlashIndex = FindLastFromSet(FullLocation, "\\/");
    if (SlashIndex < 0)
    {
        // No separator found: start from the beginning of the string
        SlashIndex = 0;
    }
    
    gs_debug_allocation Entry = {};
    Entry.Location = GetStringAfter(FullLocation, SlashIndex);
    Entry.Size = Size;
    
    Debug->Allocations[Debug->AllocationsCount] = Entry;
    Debug->AllocationsCount += 1;
}
|
||||||
|
|
||||||
internal gs_data
|
internal gs_data
|
||||||
AllocatorAlloc_(gs_allocator Allocator, u64 Size, char* Location)
|
AllocatorAlloc_(gs_allocator Allocator, u64 Size, char* Location)
|
||||||
{
|
{
|
||||||
// TODO(Peter): Memory Profiling with Location
|
// TODO(Peter): Memory Profiling with Location
|
||||||
u64 SizeResult = 0;
|
u64 SizeResult = 0;
|
||||||
void* Memory = Allocator.Alloc(Size, &SizeResult);
|
void* Memory = Allocator.Alloc(Size, &SizeResult);
|
||||||
|
if (Allocator.Debug)
|
||||||
|
{
|
||||||
|
AllocatorDebug_PushAlloc(Allocator.Debug, Size, Location);
|
||||||
|
}
|
||||||
return CreateData((u8*)Memory, SizeResult);
|
return CreateData((u8*)Memory, SizeResult);
|
||||||
}
|
}
|
||||||
internal void
|
internal void
|
||||||
|
@ -2439,6 +2464,13 @@ AllocatorFree_(gs_allocator Allocator, void* Base, u64 Size, char* Location)
|
||||||
if (Base != 0 && Size != 0)
|
if (Base != 0 && Size != 0)
|
||||||
{
|
{
|
||||||
Allocator.Free(Base, Size);
|
Allocator.Free(Base, Size);
|
||||||
|
if (Allocator.Debug)
|
||||||
|
{
|
||||||
|
// NOTE(pjs): There's no reason we should be going negative
|
||||||
|
// ie. Freeing more memory than we allocated
|
||||||
|
Assert(Allocator.Debug->TotalAllocSize >= Size);
|
||||||
|
Allocator.Debug->TotalAllocSize -= Size;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2526,30 +2558,37 @@ FreeCursorListEntry(gs_allocator Allocator, gs_memory_cursor_list* CursorEntry)
|
||||||
}
|
}
|
||||||
|
|
||||||
internal gs_memory_arena
|
internal gs_memory_arena
|
||||||
CreateMemoryArena_(arena_type ArenaType, gs_allocator Allocator, u64 ChunkSize, u64 Alignment, gs_memory_arena* ParentArena)
|
CreateMemoryArena_(arena_type ArenaType, gs_allocator Allocator, u64 ChunkSize, u64 Alignment, gs_memory_arena* ParentArena, char* Name)
|
||||||
{
|
{
|
||||||
// we only want a parent arena if the type is Arena_SubArena
|
// we only want a parent arena if the type is Arena_SubArena
|
||||||
Assert(((ArenaType == Arena_BaseArena) && (ParentArena == 0)) ||
|
Assert(((ArenaType == Arena_BaseArena) && (ParentArena == 0)) ||
|
||||||
((ArenaType == Arena_SubArena) && (ParentArena != 0)));
|
((ArenaType == Arena_SubArena) && (ParentArena != 0)));
|
||||||
|
|
||||||
gs_memory_arena Arena = {};
|
gs_memory_arena Arena = {};
|
||||||
|
Arena.ArenaName = Name;
|
||||||
Arena.Type = ArenaType;
|
Arena.Type = ArenaType;
|
||||||
Arena.Allocator = Allocator;
|
Arena.Allocator = Allocator;
|
||||||
Arena.Parent = ParentArena;
|
Arena.Parent = ParentArena;
|
||||||
|
|
||||||
|
#if MEMORY_CURSOR_STATIC_ARRAY
|
||||||
|
Arena.CursorsCountMax = 4096;
|
||||||
|
Arena.Cursors = AllocatorAllocArray(Allocator, gs_memory_cursor_list, Arena.CursorsCountMax);
|
||||||
|
#endif
|
||||||
|
|
||||||
Arena.MemoryChunkSize = ChunkSize;
|
Arena.MemoryChunkSize = ChunkSize;
|
||||||
Arena.MemoryAlignment = Alignment;
|
Arena.MemoryAlignment = Alignment;
|
||||||
return Arena;
|
return Arena;
|
||||||
}
|
}
|
||||||
|
|
||||||
internal gs_memory_arena
|
internal gs_memory_arena
|
||||||
CreateMemoryArena(gs_allocator Allocator, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
|
CreateMemoryArena(gs_allocator Allocator, char* Name, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
|
||||||
{
|
{
|
||||||
return CreateMemoryArena_(Arena_BaseArena, Allocator, ChunkSize, Alignment, 0);
|
return CreateMemoryArena_(Arena_BaseArena, Allocator, ChunkSize, Alignment, 0, Name);
|
||||||
}
|
}
|
||||||
internal gs_memory_arena
|
internal gs_memory_arena
|
||||||
CreateMemorySubArena(gs_memory_arena* Parent, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
|
CreateMemorySubArena(gs_memory_arena* Parent, char* Name, u64 ChunkSize = KB(32), u64 Alignment = Bytes(8))
|
||||||
{
|
{
|
||||||
return CreateMemoryArena_(Arena_SubArena, Parent->Allocator, ChunkSize, Alignment, Parent);
|
return CreateMemoryArena_(Arena_SubArena, Parent->Allocator, ChunkSize, Alignment, Parent, Name);
|
||||||
}
|
}
|
||||||
|
|
||||||
internal gs_data PushSize_(gs_memory_arena* Arena, u64 Size, char* Location);
|
internal gs_data PushSize_(gs_memory_arena* Arena, u64 Size, char* Location);
|
||||||
|
@ -2557,6 +2596,7 @@ internal gs_data PushSize_(gs_memory_arena* Arena, u64 Size, char* Location);
|
||||||
internal void
|
internal void
|
||||||
FreeCursorList(gs_memory_cursor_list* List, gs_allocator Allocator)
|
FreeCursorList(gs_memory_cursor_list* List, gs_allocator Allocator)
|
||||||
{
|
{
|
||||||
|
#if !MEMORY_CURSOR_STATIC_ARRAY
|
||||||
gs_memory_cursor_list* CursorAt = List;
|
gs_memory_cursor_list* CursorAt = List;
|
||||||
while (CursorAt != 0)
|
while (CursorAt != 0)
|
||||||
{
|
{
|
||||||
|
@ -2564,13 +2604,18 @@ FreeCursorList(gs_memory_cursor_list* List, gs_allocator Allocator)
|
||||||
FreeCursorListEntry(Allocator, CursorAt);
|
FreeCursorListEntry(Allocator, CursorAt);
|
||||||
CursorAt = Prev;
|
CursorAt = Prev;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
internal gs_memory_cursor_list*
|
internal gs_memory_cursor_list*
|
||||||
MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
|
MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
|
||||||
{
|
{
|
||||||
|
#if MEMORY_CURSOR_STATIC_ARRAY
|
||||||
|
u64 AllocSize = Max(MinSize, Arena->MemoryChunkSize);
|
||||||
|
#else
|
||||||
// Allocate enough space for the minimum size needed + sizeof the cursor list
|
// Allocate enough space for the minimum size needed + sizeof the cursor list
|
||||||
u64 AllocSize = Max(MinSize, Arena->MemoryChunkSize) + sizeof(gs_memory_cursor_list);
|
u64 AllocSize = Max(MinSize, Arena->MemoryChunkSize) + sizeof(gs_memory_cursor_list);
|
||||||
|
#endif
|
||||||
|
|
||||||
gs_data Data = {0};
|
gs_data Data = {0};
|
||||||
switch (Arena->Type)
|
switch (Arena->Type)
|
||||||
|
@ -2588,6 +2633,11 @@ MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
|
||||||
InvalidDefaultCase;
|
InvalidDefaultCase;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MEMORY_CURSOR_STATIC_ARRAY
|
||||||
|
Assert(Arena->CursorsCount < Arena->CursorsCountMax);
|
||||||
|
gs_memory_cursor_list* Result = Arena->Cursors + Arena->CursorsCount++;
|
||||||
|
Result->Cursor = CreateMemoryCursor(Data.Memory, Data.Size);
|
||||||
|
#else
|
||||||
// Fit the memory cursor into the region allocated
|
// Fit the memory cursor into the region allocated
|
||||||
Assert(MinSize + sizeof(gs_memory_cursor_list) <= Data.Size);
|
Assert(MinSize + sizeof(gs_memory_cursor_list) <= Data.Size);
|
||||||
gs_memory_cursor_list* Result = (gs_memory_cursor_list*)Data.Memory;
|
gs_memory_cursor_list* Result = (gs_memory_cursor_list*)Data.Memory;
|
||||||
|
@ -2599,9 +2649,14 @@ MemoryArenaNewCursor(gs_memory_arena* Arena, u64 MinSize, char* Location)
|
||||||
Result->Next = 0;
|
Result->Next = 0;
|
||||||
if (Arena->CursorList != 0)
|
if (Arena->CursorList != 0)
|
||||||
{
|
{
|
||||||
|
if (Arena->CursorList->Next != 0)
|
||||||
|
{
|
||||||
|
Result->Next = Arena->CursorList->Next;
|
||||||
|
}
|
||||||
Arena->CursorList->Next = Result;
|
Arena->CursorList->Next = Result;
|
||||||
}
|
}
|
||||||
Arena->CursorList = Result;
|
Arena->CursorList = Result;
|
||||||
|
#endif
|
||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2611,6 +2666,27 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
|
||||||
gs_data Result = {0};
|
gs_data Result = {0};
|
||||||
if (Size > 0)
|
if (Size > 0)
|
||||||
{
|
{
|
||||||
|
#if MEMORY_CURSOR_STATIC_ARRAY
|
||||||
|
gs_memory_cursor_list* CursorEntry = 0;
|
||||||
|
for (u64 i = 0;
|
||||||
|
i < Arena->CursorsCount;
|
||||||
|
i++)
|
||||||
|
{
|
||||||
|
gs_memory_cursor_list* At = Arena->Cursors + i;
|
||||||
|
if (CursorHasRoom(At->Cursor, Size))
|
||||||
|
{
|
||||||
|
CursorEntry = At;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!CursorEntry)
|
||||||
|
{
|
||||||
|
CursorEntry = MemoryArenaNewCursor(Arena, Size, Location);
|
||||||
|
}
|
||||||
|
Assert(CursorEntry);
|
||||||
|
Assert(CursorHasRoom(CursorEntry->Cursor, Size));
|
||||||
|
#else
|
||||||
|
|
||||||
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
|
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
|
||||||
if (CursorEntry == 0)
|
if (CursorEntry == 0)
|
||||||
{
|
{
|
||||||
|
@ -2627,6 +2703,7 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
|
||||||
CursorEntry = MemoryArenaNewCursor(Arena, Size, Location);
|
CursorEntry = MemoryArenaNewCursor(Arena, Size, Location);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
Assert(CursorEntry != 0);
|
Assert(CursorEntry != 0);
|
||||||
Result = PushSizeOnCursor_(&CursorEntry->Cursor, Size, Location);
|
Result = PushSizeOnCursor_(&CursorEntry->Cursor, Size, Location);
|
||||||
Assert(Result.Memory != 0);
|
Assert(Result.Memory != 0);
|
||||||
|
@ -2651,44 +2728,19 @@ PushSize_(gs_memory_arena* Arena, u64 Size, char* Location)
|
||||||
return Result;
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
internal void
|
|
||||||
PopSize(gs_memory_arena* Arena, u64 Size)
|
|
||||||
{
|
|
||||||
gs_allocator Allocator = Arena->Allocator;
|
|
||||||
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
|
|
||||||
for (gs_memory_cursor_list* Prev = 0;
|
|
||||||
CursorEntry != 0 && Size != 0;
|
|
||||||
CursorEntry = Prev)
|
|
||||||
{
|
|
||||||
Prev = CursorEntry->Prev;
|
|
||||||
if (Size >= CursorEntry->Cursor.Position)
|
|
||||||
{
|
|
||||||
Size -= CursorEntry->Cursor.Position;
|
|
||||||
FreeCursorListEntry(Allocator, CursorEntry);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PopSizeOnCursor(&CursorEntry->Cursor, Size);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Arena->CursorList = CursorEntry;
|
|
||||||
}
|
|
||||||
internal void
|
internal void
|
||||||
FreeMemoryArena(gs_memory_arena* Arena)
|
FreeMemoryArena(gs_memory_arena* Arena)
|
||||||
{
|
{
|
||||||
gs_allocator Allocator = Arena->Allocator;
|
#if MEMORY_CURSOR_STATIC_ARRAY
|
||||||
gs_memory_cursor_list* CursorEntry = Arena->CursorList;
|
for (u32 i = 0; i < Arena->CursorsCount; i++)
|
||||||
for (gs_memory_cursor_list* Prev = 0;
|
|
||||||
CursorEntry != 0;
|
|
||||||
CursorEntry = Prev)
|
|
||||||
{
|
{
|
||||||
Prev = CursorEntry->Prev;
|
gs_memory_cursor_list E = Arena->Cursors[i];
|
||||||
if (CursorEntry != 0)
|
AllocatorFree(Arena->Allocator, E.Cursor.Data.Memory, E.Cursor.Data.Size);
|
||||||
{
|
|
||||||
FreeCursorListEntry(Allocator, CursorEntry);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
AllocatorFreeArray(Arena->Allocator, Arena->Cursors, gs_memory_cursor_list, Arena->CursorsCountMax);
|
||||||
|
#else
|
||||||
|
FreeCursorList(Arena->CursorList, Arena->Allocator);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#define PushSizeToData(arena, size) PushSize_((arena), (size), FileNameAndLineNumberString)
|
#define PushSizeToData(arena, size) PushSize_((arena), (size), FileNameAndLineNumberString)
|
||||||
|
@ -2726,6 +2778,12 @@ PushStringCopy(gs_memory_arena* Arena, gs_const_string String)
|
||||||
internal void
|
internal void
|
||||||
ClearArena(gs_memory_arena* Arena)
|
ClearArena(gs_memory_arena* Arena)
|
||||||
{
|
{
|
||||||
|
#if MEMORY_CURSOR_STATIC_ARRAY
|
||||||
|
for (u32 i = 0; i < Arena->CursorsCount; i++)
|
||||||
|
{
|
||||||
|
Arena->Cursors[i].Cursor.Position = 0;
|
||||||
|
}
|
||||||
|
#else
|
||||||
gs_memory_cursor_list* First = 0;
|
gs_memory_cursor_list* First = 0;
|
||||||
for (gs_memory_cursor_list* CursorEntry = Arena->CursorList;
|
for (gs_memory_cursor_list* CursorEntry = Arena->CursorList;
|
||||||
CursorEntry != 0;
|
CursorEntry != 0;
|
||||||
|
@ -2735,12 +2793,13 @@ ClearArena(gs_memory_arena* Arena)
|
||||||
CursorEntry->Cursor.Position = 0;
|
CursorEntry->Cursor.Position = 0;
|
||||||
}
|
}
|
||||||
Arena->CursorList = First;
|
Arena->CursorList = First;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
internal void
|
internal void
|
||||||
FreeArena(gs_memory_arena* Arena)
|
FreeArena(gs_memory_arena* Arena)
|
||||||
{
|
{
|
||||||
FreeCursorList(Arena->CursorList, Arena->Allocator);
|
FreeMemoryArena(Arena);
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////
|
///////////////////////////
|
||||||
|
@ -2789,14 +2848,14 @@ CreateDynarrayWithStorage(gs_memory_arena Storage, u32 ElementSize, u32 Elements
|
||||||
internal gs_dynarray
|
internal gs_dynarray
|
||||||
CreateDynarray_(gs_allocator Allocator, u32 ElementSize, u32 ElementsPerBuffer)
|
CreateDynarray_(gs_allocator Allocator, u32 ElementSize, u32 ElementsPerBuffer)
|
||||||
{
|
{
|
||||||
gs_memory_arena Storage = CreateMemoryArena(Allocator, ElementSize * ElementsPerBuffer);
|
gs_memory_arena Storage = CreateMemoryArena(Allocator, "Dynarray Arena", ElementSize * ElementsPerBuffer);
|
||||||
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
|
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
|
||||||
};
|
};
|
||||||
|
|
||||||
internal gs_dynarray
|
internal gs_dynarray
|
||||||
CreateDynarray_(gs_memory_arena* Arena, u32 ElementSize, u32 ElementsPerBuffer)
|
CreateDynarray_(gs_memory_arena* Arena, u32 ElementSize, u32 ElementsPerBuffer)
|
||||||
{
|
{
|
||||||
gs_memory_arena Storage = CreateMemorySubArena(Arena, ElementSize * ElementsPerBuffer);
|
gs_memory_arena Storage = CreateMemorySubArena(Arena, "Dynarray Sub Arena", ElementSize * ElementsPerBuffer);
|
||||||
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
|
return CreateDynarrayWithStorage(Storage, ElementSize, ElementsPerBuffer);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -247,7 +247,7 @@ enum { \
|
||||||
#define DontCompile ImAfraidICantDoThat
|
#define DontCompile ImAfraidICantDoThat
|
||||||
|
|
||||||
#define LineNumberString Stringify(__LINE__)
|
#define LineNumberString Stringify(__LINE__)
|
||||||
#define FileNameAndLineNumberString_ __FILE__ ":" LineNumberString ":"
|
#define FileNameAndLineNumberString_ __FILE__ ":" LineNumberString ":" __FUNCTION__
|
||||||
#define FileNameAndLineNumberString (char*)FileNameAndLineNumberString_
|
#define FileNameAndLineNumberString (char*)FileNameAndLineNumberString_
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -633,10 +633,27 @@ typedef ALLOCATOR_ALLOC(allocator_allocate);
|
||||||
#define ALLOCATOR_FREE(name) void name(void* Ptr, u64 Size)
|
#define ALLOCATOR_FREE(name) void name(void* Ptr, u64 Size)
|
||||||
typedef ALLOCATOR_FREE(allocator_free);
|
typedef ALLOCATOR_FREE(allocator_free);
|
||||||
|
|
||||||
|
struct gs_debug_allocation
|
||||||
|
{
|
||||||
|
gs_const_string Location;
|
||||||
|
u64 Size;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gs_allocator_debug
|
||||||
|
{
|
||||||
|
u64 TotalAllocSize;
|
||||||
|
|
||||||
|
u64 AllocationsCount;
|
||||||
|
u64 AllocationsCountMax;
|
||||||
|
gs_debug_allocation* Allocations;
|
||||||
|
};
|
||||||
|
|
||||||
struct gs_allocator
|
struct gs_allocator
|
||||||
{
|
{
|
||||||
allocator_allocate* Alloc;
|
allocator_allocate* Alloc;
|
||||||
allocator_free* Free;
|
allocator_free* Free;
|
||||||
|
|
||||||
|
gs_allocator_debug* Debug;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gs_memory_cursor
|
struct gs_memory_cursor
|
||||||
|
@ -645,11 +662,26 @@ struct gs_memory_cursor
|
||||||
u64 Position;
|
u64 Position;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* TODO(pjs): Setting MEMORY_CURSOR_STATIC_ARRAY to 0 (the linked-list
   path) will still compile; however, it introduces a bug that I haven't
   fully diagnosed.
   The problem seems to occur when trying to push to a cleared memory arena
   where the FirstCursor doesn't have enough room for the allocation, but
   FirstCursor->Next also points to a valid cursor. The new cursor is put
   in the middle of the list; however, we seem to keep allocating new
   cursors forever and losing the old ones.
   In Lumenarium, the problem is found in the OutputData structure.

   Leaving this in a simplified state (the static array) for now.
*/
|
||||||
|
#define MEMORY_CURSOR_STATIC_ARRAY 1
|
||||||
|
|
||||||
struct gs_memory_cursor_list
|
struct gs_memory_cursor_list
|
||||||
{
|
{
|
||||||
gs_memory_cursor Cursor;
|
gs_memory_cursor Cursor;
|
||||||
|
#if !MEMORY_CURSOR_STATIC_ARRAY
|
||||||
gs_memory_cursor_list* Next;
|
gs_memory_cursor_list* Next;
|
||||||
gs_memory_cursor_list* Prev;
|
gs_memory_cursor_list* Prev;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
enum arena_type
|
enum arena_type
|
||||||
|
@ -664,9 +696,18 @@ struct gs_memory_arena
|
||||||
gs_allocator Allocator;
|
gs_allocator Allocator;
|
||||||
gs_memory_arena* Parent;
|
gs_memory_arena* Parent;
|
||||||
|
|
||||||
|
#if MEMORY_CURSOR_STATIC_ARRAY
|
||||||
|
gs_memory_cursor_list* Cursors;
|
||||||
|
u64 CursorsCount;
|
||||||
|
u64 CursorsCountMax;
|
||||||
|
#else
|
||||||
gs_memory_cursor_list* CursorList;
|
gs_memory_cursor_list* CursorList;
|
||||||
|
#endif
|
||||||
|
|
||||||
u64 MemoryChunkSize;
|
u64 MemoryChunkSize;
|
||||||
u64 MemoryAlignment;
|
u64 MemoryAlignment;
|
||||||
|
|
||||||
|
char* ArenaName;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gs_memory_arena_array
|
struct gs_memory_arena_array
|
||||||
|
|
|
@ -31,7 +31,7 @@ bool PathTest (char* In, char* Out) {
|
||||||
|
|
||||||
int main (int ArgCount, char** Args)
|
int main (int ArgCount, char** Args)
|
||||||
{
|
{
|
||||||
Scratch = CreateMemoryArena(CreateAllocator(Alloc, Free));
|
Scratch = CreateMemoryArena(CreateAllocator(Alloc, Free), "Scratch");
|
||||||
|
|
||||||
Test("gs_string")
|
Test("gs_string")
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue