259 lines
9.4 KiB
C
259 lines
9.4 KiB
C
|
/***
|
||
|
*** Copyright (C) 1985-2007 Intel Corporation. All rights reserved.
|
||
|
***
|
||
|
*** The information and source code contained herein is the exclusive
|
||
|
*** property of Intel Corporation and may not be disclosed, examined
|
||
|
*** or reproduced in whole or in part without explicit written authorization
|
||
|
*** from the company.
|
||
|
***
|
||
|
****/
|
||
|
|
||
|
/*
|
||
|
* smmintrin.h
|
||
|
*
|
||
|
* Principal header file for Intel(R) Core(TM) 2 Duo processor
|
||
|
* SSE4.1 intrinsics
|
||
|
*/
|
||
|
|
||
|
#pragma once
|
||
|
#ifndef __midl
|
||
|
#ifndef _INCLUDED_SMM
|
||
|
#define _INCLUDED_SMM
|
||
|
|
||
|
#if defined (_M_CEE_PURE)
|
||
|
#error ERROR: EMM intrinsics not supported in the pure mode!
|
||
|
#else /* defined (_M_CEE_PURE) */
|
||
|
|
||
|
#include <tmmintrin.h>
|
||
|
|
||
|
|
||
|
/*
|
||
|
* Rounding mode macros
|
||
|
*/
|
||
|
|
||
|
#define _MM_FROUND_TO_NEAREST_INT 0x00
|
||
|
#define _MM_FROUND_TO_NEG_INF 0x01
|
||
|
#define _MM_FROUND_TO_POS_INF 0x02
|
||
|
#define _MM_FROUND_TO_ZERO 0x03
|
||
|
#define _MM_FROUND_CUR_DIRECTION 0x04
|
||
|
|
||
|
#define _MM_FROUND_RAISE_EXC 0x00
|
||
|
#define _MM_FROUND_NO_EXC 0x08
|
||
|
|
||
|
#define _MM_FROUND_NINT _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC
|
||
|
#define _MM_FROUND_FLOOR _MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC
|
||
|
#define _MM_FROUND_CEIL _MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC
|
||
|
#define _MM_FROUND_TRUNC _MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC
|
||
|
#define _MM_FROUND_RINT _MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC
|
||
|
#define _MM_FROUND_NEARBYINT _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC
|
||
|
|
||
|
/*
|
||
|
* MACRO functions for ceil/floor intrinsics
|
||
|
*/
|
||
|
|
||
|
#define _mm_ceil_pd(val) _mm_round_pd((val), _MM_FROUND_CEIL)
|
||
|
#define _mm_ceil_sd(dst, val) _mm_round_sd((dst), (val), _MM_FROUND_CEIL)
|
||
|
|
||
|
#define _mm_floor_pd(val) _mm_round_pd((val), _MM_FROUND_FLOOR)
|
||
|
#define _mm_floor_sd(dst, val) _mm_round_sd((dst), (val), _MM_FROUND_FLOOR)
|
||
|
|
||
|
#define _mm_ceil_ps(val) _mm_round_ps((val), _MM_FROUND_CEIL)
|
||
|
#define _mm_ceil_ss(dst, val) _mm_round_ss((dst), (val), _MM_FROUND_CEIL)
|
||
|
|
||
|
#define _mm_floor_ps(val) _mm_round_ps((val), _MM_FROUND_FLOOR)
|
||
|
#define _mm_floor_ss(dst, val) _mm_round_ss((dst), (val), _MM_FROUND_FLOOR)
|
||
|
|
||
|
#define _mm_test_all_zeros(mask, val) _mm_testz_si128((mask), (val))
|
||
|
|
||
|
/*
|
||
|
* MACRO functions for packed integer 128-bit comparison intrinsics.
|
||
|
*/
|
||
|
|
||
|
#define _mm_test_all_ones(val) \
|
||
|
_mm_testc_si128((val), _mm_cmpeq_epi32((val),(val)))
|
||
|
|
||
|
#define _mm_test_mix_ones_zeros(mask, val) _mm_testnzc_si128((mask), (val))
|
||
|
|
||
|
#if __cplusplus
|
||
|
extern "C" {
|
||
|
#endif /* __cplusplus */
|
||
|
|
||
|
// Integer blend instructions - select data from 2 sources
|
||
|
// using constant/variable mask
|
||
|
|
||
|
extern __m128i _mm_blend_epi16 (__m128i v1, __m128i v2,
|
||
|
const int mask);
|
||
|
extern __m128i _mm_blendv_epi8 (__m128i v1, __m128i v2, __m128i mask);
|
||
|
|
||
|
// Float single precision blend instructions - select data
|
||
|
// from 2 sources using constant/variable mask
|
||
|
|
||
|
extern __m128 _mm_blend_ps (__m128 v1, __m128 v2, const int mask);
|
||
|
extern __m128 _mm_blendv_ps(__m128 v1, __m128 v2, __m128 v3);
|
||
|
|
||
|
// Float double precision blend instructions - select data
|
||
|
// from 2 sources using constant/variable mask
|
||
|
|
||
|
extern __m128d _mm_blend_pd (__m128d v1, __m128d v2, const int mask);
|
||
|
extern __m128d _mm_blendv_pd(__m128d v1, __m128d v2, __m128d v3);
|
||
|
|
||
|
// Dot product instructions with mask-defined summing and zeroing
|
||
|
// of result's parts
|
||
|
|
||
|
extern __m128 _mm_dp_ps(__m128 val1, __m128 val2, const int mask);
|
||
|
extern __m128d _mm_dp_pd(__m128d val1, __m128d val2, const int mask);
|
||
|
|
||
|
// Packed integer 64-bit comparison, zeroing or filling with ones
|
||
|
// corresponding parts of result
|
||
|
|
||
|
extern __m128i _mm_cmpeq_epi64(__m128i val1, __m128i val2);
|
||
|
|
||
|
// Min/max packed integer instructions
|
||
|
|
||
|
extern __m128i _mm_min_epi8 (__m128i val1, __m128i val2);
|
||
|
extern __m128i _mm_max_epi8 (__m128i val1, __m128i val2);
|
||
|
|
||
|
extern __m128i _mm_min_epu16(__m128i val1, __m128i val2);
|
||
|
extern __m128i _mm_max_epu16(__m128i val1, __m128i val2);
|
||
|
|
||
|
extern __m128i _mm_min_epi32(__m128i val1, __m128i val2);
|
||
|
extern __m128i _mm_max_epi32(__m128i val1, __m128i val2);
|
||
|
extern __m128i _mm_min_epu32(__m128i val1, __m128i val2);
|
||
|
extern __m128i _mm_max_epu32(__m128i val1, __m128i val2);
|
||
|
|
||
|
// Packed integer 32-bit multiplication with truncation
|
||
|
// of upper halves of results
|
||
|
|
||
|
extern __m128i _mm_mullo_epi32(__m128i a, __m128i b);
|
||
|
|
||
|
// Packed integer 32-bit multiplication of 2 pairs of operands
|
||
|
// producing two 64-bit results
|
||
|
|
||
|
extern __m128i _mm_mul_epi32(__m128i a, __m128i b);
|
||
|
|
||
|
// Packed integer 128-bit bitwise comparison.
|
||
|
// return 1 if (val 'and' mask) == 0
|
||
|
|
||
|
extern int _mm_testz_si128(__m128i mask, __m128i val);
|
||
|
|
||
|
// Packed integer 128-bit bitwise comparison.
|
||
|
// return 1 if (val 'and_not' mask) == 0
|
||
|
|
||
|
extern int _mm_testc_si128(__m128i mask, __m128i val);
|
||
|
|
||
|
// Packed integer 128-bit bitwise comparison
|
||
|
// ZF = ((val 'and' mask) == 0) CF = ((val 'and_not' mask) == 0)
|
||
|
// return 1 if both ZF and CF are 0
|
||
|
|
||
|
extern int _mm_testnzc_si128(__m128i mask, __m128i s2);
|
||
|
|
||
|
// Insert single precision float into packed single precision
|
||
|
// array element selected by index.
|
||
|
// The bits [7-6] of the 3d parameter define src index,
|
||
|
// the bits [5-4] define dst index, and bits [3-0] define zeroing
|
||
|
// mask for dst
|
||
|
|
||
|
extern __m128 _mm_insert_ps(__m128 dst, __m128 src, const int ndx);
|
||
|
|
||
|
// Helper macro to create ndx-parameter value for _mm_insert_ps
|
||
|
|
||
|
#define _MM_MK_INSERTPS_NDX(srcField, dstField, zeroMask) \
|
||
|
(((srcField)<<6) | ((dstField)<<4) | (zeroMask))
|
||
|
|
||
|
// Extract binary representation of single precision float from
|
||
|
// packed single precision array element selected by index
|
||
|
|
||
|
extern int _mm_extract_ps(__m128 src, const int ndx);
|
||
|
|
||
|
// Extract single precision float from packed single precision
|
||
|
// array element selected by index into dest
|
||
|
|
||
|
#define _MM_EXTRACT_FLOAT(dest, src, ndx) \
|
||
|
*((int*)&(dest)) = _mm_extract_ps((src), (ndx))
|
||
|
|
||
|
// Extract specified single precision float element
|
||
|
// into the lower part of __m128
|
||
|
|
||
|
#define _MM_PICK_OUT_PS(src, num) \
|
||
|
_mm_insert_ps(_mm_setzero_ps(), (src), \
|
||
|
_MM_MK_INSERTPS_NDX((num), 0, 0x0e))
|
||
|
|
||
|
// Insert integer into packed integer array element
|
||
|
// selected by index
|
||
|
|
||
|
extern __m128i _mm_insert_epi8 (__m128i dst, int s, const int ndx);
|
||
|
extern __m128i _mm_insert_epi32(__m128i dst, int s, const int ndx);
|
||
|
|
||
|
#if defined (_M_X64)
|
||
|
extern __m128i _mm_insert_epi64(__m128i dst, __int64 s, const int ndx);
|
||
|
#endif /* defined (_M_X64) */
|
||
|
// Extract integer from packed integer array element
|
||
|
// selected by index
|
||
|
|
||
|
extern int _mm_extract_epi8 (__m128i src, const int ndx);
|
||
|
extern int _mm_extract_epi32(__m128i src, const int ndx);
|
||
|
|
||
|
#if defined (_M_X64)
|
||
|
extern __int64 _mm_extract_epi64(__m128i src, const int ndx);
|
||
|
#endif /* defined (_M_X64) */
|
||
|
|
||
|
// Horizontal packed word minimum and its index in
|
||
|
// result[15:0] and result[18:16] respectively
|
||
|
|
||
|
extern __m128i _mm_minpos_epu16(__m128i shortValues);
|
||
|
|
||
|
// Packed/single float double precision rounding
|
||
|
|
||
|
extern __m128d _mm_round_pd(__m128d val, int iRoundMode);
|
||
|
extern __m128d _mm_round_sd(__m128d dst, __m128d val, int iRoundMode);
|
||
|
|
||
|
// Packed/single float single precision rounding
|
||
|
|
||
|
extern __m128 _mm_round_ps(__m128 val, int iRoundMode);
|
||
|
extern __m128 _mm_round_ss(__m128 dst, __m128 val, int iRoundMode);
|
||
|
|
||
|
// Packed integer sign-extension
|
||
|
|
||
|
extern __m128i _mm_cvtepi8_epi32 (__m128i byteValues);
|
||
|
extern __m128i _mm_cvtepi16_epi32(__m128i shortValues);
|
||
|
extern __m128i _mm_cvtepi8_epi64 (__m128i byteValues);
|
||
|
extern __m128i _mm_cvtepi32_epi64(__m128i intValues);
|
||
|
extern __m128i _mm_cvtepi16_epi64(__m128i shortValues);
|
||
|
extern __m128i _mm_cvtepi8_epi16 (__m128i byteValues);
|
||
|
|
||
|
// Packed integer zero-extension
|
||
|
|
||
|
extern __m128i _mm_cvtepu8_epi32 (__m128i byteValues);
|
||
|
extern __m128i _mm_cvtepu16_epi32(__m128i shortValues);
|
||
|
extern __m128i _mm_cvtepu8_epi64 (__m128i shortValues);
|
||
|
extern __m128i _mm_cvtepu32_epi64(__m128i intValues);
|
||
|
extern __m128i _mm_cvtepu16_epi64(__m128i shortValues);
|
||
|
extern __m128i _mm_cvtepu8_epi16 (__m128i byteValues);
|
||
|
|
||
|
|
||
|
// Pack 8 double words from 2 operands into 8 words of result
|
||
|
// with unsigned saturation
|
||
|
|
||
|
extern __m128i _mm_packus_epi32(__m128i val1, __m128i val2);
|
||
|
|
||
|
// Sum absolute 8-bit integer difference of adjacent groups of 4 byte
|
||
|
// integers in operands. Starting offsets within operands are
|
||
|
// determined by mask
|
||
|
|
||
|
extern __m128i _mm_mpsadbw_epu8(__m128i s1, __m128i s2, const int msk);
|
||
|
|
||
|
/*
|
||
|
* Load double quadword using non-temporal aligned hint
|
||
|
*/
|
||
|
|
||
|
extern __m128i _mm_stream_load_si128(__m128i* v1);
|
||
|
|
||
|
#if defined __cplusplus
|
||
|
}; /* End "C" */
|
||
|
#endif /* defined __cplusplus */
|
||
|
|
||
|
#endif /* defined (_M_CEE_PURE) */
|
||
|
|
||
|
#endif /* _INCLUDED_SMM */
|
||
|
#endif /* __midl */
|