416 lines
15 KiB
C
416 lines
15 KiB
C
/***
|
|
*** Copyright (C) 1985-1999 Intel Corporation. All rights reserved.
|
|
***
|
|
*** The information and source code contained herein is the exclusive
|
|
*** property of Intel Corporation and may not be disclosed, examined
|
|
*** or reproduced in whole or in part without explicit written authorization
|
|
*** from the company.
|
|
***
|
|
****/
|
|
|
|
/*
|
|
* emmintrin.h
|
|
*
|
|
* Principal header file for Willamette New Instruction intrinsics
|
|
*
|
|
* The intrinsics package can be used in 2 ways, based on whether or not
|
|
* _EMM_FUNCTIONALITY is defined; if it is, the C implementation
|
|
* will be used (the "functional intrinsics").
|
|
*/
|
|
|
|
#pragma once
|
|
#ifndef __midl
|
|
#ifndef _INCLUDED_EMM
|
|
#define _INCLUDED_EMM
|
|
|
|
#if defined (_M_CEE_PURE)
|
|
#error ERROR: EMM intrinsics not supported in the pure mode!
|
|
#else /* defined (_M_CEE_PURE) */
|
|
|
|
/*
|
|
* the __m128 & __m64 types are required for the intrinsics
|
|
*/
|
|
#include <xmmintrin.h>
|
|
|
|
/*
 * __m128i: union of eight array views over the same storage.  Every member
 * is 16 bytes in total (16 x 8-bit, 8 x 16-bit, 4 x 32-bit, 2 x 64-bit),
 * in signed and unsigned flavours.  _CRT_ALIGN(16) is defined elsewhere;
 * presumably it requests 16-byte alignment -- confirm against its definition.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    /* signed element views */
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];
    __int64             m128i_i64[2];
    /* unsigned element views */
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;
|
|
|
|
/*
 * __m128d: wrapper around an array of two doubles, declared with the same
 * intrin_type / _CRT_ALIGN(16) decoration as __m128i.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
    double              m128d_f64[2];   /* the two double elements */
} __m128d;
|
|
|
|
/*
 * Macro function for shuffle
 *
 * Packs two selector values into a single integer: the first argument is
 * shifted into bit 1 and the second is OR-ed into bit 0.  Both arguments
 * are fully parenthesized, so arbitrary expressions may be passed.
 * Presumably intended as the immediate operand of _mm_shuffle_pd -- confirm
 * against the intrinsic's documentation.
 */
#define _MM_SHUFFLE2(hi, lo) (((hi) << 1) | (lo))
|
|
|
|
/*****************************************************/
|
|
/* INTRINSICS FUNCTION PROTOTYPES START HERE */
|
|
/*****************************************************/
|
|
|
|
#if defined __cplusplus
|
|
extern "C" { /* Begin "C" */
|
|
/* Intrinsics use C name-mangling. */
|
|
#endif /* defined __cplusplus */
|
|
|
|
/*
|
|
* DP, arithmetic
|
|
*/
|
|
|
|
extern __m128d _mm_add_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_add_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_sub_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_sub_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_mul_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_mul_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_sqrt_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_sqrt_pd(__m128d _A);
|
|
extern __m128d _mm_div_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_div_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_min_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_min_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_max_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_max_pd(__m128d _A, __m128d _B);
|
|
|
|
/*
|
|
* DP, logicals
|
|
*/
|
|
|
|
extern __m128d _mm_and_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_andnot_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_or_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_xor_pd(__m128d _A, __m128d _B);
|
|
|
|
/*
|
|
* DP, comparisons
|
|
*/
|
|
|
|
extern __m128d _mm_cmpeq_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpeq_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmplt_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmplt_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmple_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmple_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpgt_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpgt_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpge_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpge_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpneq_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpneq_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpnlt_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpnlt_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpnle_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpnle_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpngt_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpngt_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpnge_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpnge_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpord_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpord_sd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpunord_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_cmpunord_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_comieq_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_comilt_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_comile_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_comigt_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_comige_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_comineq_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_ucomieq_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_ucomilt_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_ucomile_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_ucomigt_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_ucomige_sd(__m128d _A, __m128d _B);
|
|
extern int _mm_ucomineq_sd(__m128d _A, __m128d _B);
|
|
|
|
/*
|
|
* DP, converts
|
|
*/
|
|
|
|
extern __m128d _mm_cvtepi32_pd(__m128i _A);
|
|
extern __m128i _mm_cvtpd_epi32(__m128d _A);
|
|
extern __m128i _mm_cvttpd_epi32(__m128d _A);
|
|
extern __m128 _mm_cvtepi32_ps(__m128i _A);
|
|
extern __m128i _mm_cvtps_epi32(__m128 _A);
|
|
extern __m128i _mm_cvttps_epi32(__m128 _A);
|
|
extern __m128 _mm_cvtpd_ps(__m128d _A);
|
|
extern __m128d _mm_cvtps_pd(__m128 _A);
|
|
extern __m128 _mm_cvtsd_ss(__m128 _A, __m128d _B);
|
|
extern __m128d _mm_cvtss_sd(__m128d _A, __m128 _B);
|
|
|
|
extern int _mm_cvtsd_si32(__m128d _A);
|
|
extern int _mm_cvttsd_si32(__m128d _A);
|
|
extern __m128d _mm_cvtsi32_sd(__m128d _A, int _B);
|
|
|
|
extern __m64 _mm_cvtpd_pi32(__m128d _A);
|
|
extern __m64 _mm_cvttpd_pi32(__m128d _A);
|
|
extern __m128d _mm_cvtpi32_pd(__m64 _A);
|
|
|
|
/*
|
|
* DP, misc
|
|
*/
|
|
|
|
extern __m128d _mm_unpackhi_pd(__m128d _A, __m128d _B);
|
|
extern __m128d _mm_unpacklo_pd(__m128d _A, __m128d _B);
|
|
extern int _mm_movemask_pd(__m128d _A);
|
|
extern __m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I);
|
|
|
|
/*
|
|
* DP, loads
|
|
*/
|
|
|
|
extern __m128d _mm_load_pd(double const*_Dp);
|
|
extern __m128d _mm_load1_pd(double const*_Dp);
|
|
extern __m128d _mm_loadr_pd(double const*_Dp);
|
|
extern __m128d _mm_loadu_pd(double const*_Dp);
|
|
extern __m128d _mm_load_sd(double const*_Dp);
|
|
extern __m128d _mm_loadh_pd(__m128d _A, double const*_Dp);
|
|
extern __m128d _mm_loadl_pd(__m128d _A, double const*_Dp);
|
|
|
|
/*
|
|
* DP, sets
|
|
*/
|
|
|
|
extern __m128d _mm_set_sd(double _W);
|
|
extern __m128d _mm_set1_pd(double _A);
|
|
extern __m128d _mm_set_pd(double _Z, double _Y);
|
|
extern __m128d _mm_setr_pd(double _Y, double _Z);
|
|
extern __m128d _mm_setzero_pd(void);
|
|
extern __m128d _mm_move_sd(__m128d _A, __m128d _B);
|
|
|
|
/*
|
|
* DP, stores
|
|
*/
|
|
|
|
extern void _mm_store_sd(double *_Dp, __m128d _A);
|
|
extern void _mm_store1_pd(double *_Dp, __m128d _A);
|
|
extern void _mm_store_pd(double *_Dp, __m128d _A);
|
|
extern void _mm_storeu_pd(double *_Dp, __m128d _A);
|
|
extern void _mm_storer_pd(double *_Dp, __m128d _A);
|
|
extern void _mm_storeh_pd(double *_Dp, __m128d _A);
|
|
extern void _mm_storel_pd(double *_Dp, __m128d _A);
|
|
|
|
/*
|
|
* Integer, arithmetic
|
|
*/
|
|
|
|
extern __m128i _mm_add_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_add_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_add_epi32(__m128i _A, __m128i _B);
|
|
extern __m64 _mm_add_si64(__m64 _A, __m64 _B);
|
|
extern __m128i _mm_add_epi64(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_adds_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_adds_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_adds_epu8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_adds_epu16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_avg_epu8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_avg_epu16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_madd_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_max_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_max_epu8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_min_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_min_epu8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_mulhi_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_mulhi_epu16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_mullo_epi16(__m128i _A, __m128i _B);
|
|
extern __m64 _mm_mul_su32(__m64 _A, __m64 _B);
|
|
extern __m128i _mm_mul_epu32(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_sad_epu8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_sub_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_sub_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B);
|
|
extern __m64 _mm_sub_si64(__m64 _A, __m64 _B);
|
|
extern __m128i _mm_sub_epi64(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_subs_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_subs_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_subs_epu8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_subs_epu16(__m128i _A, __m128i _B);
|
|
|
|
/*
|
|
* Integer, logicals
|
|
*/
|
|
|
|
extern __m128i _mm_and_si128(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_or_si128(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_xor_si128(__m128i _A, __m128i _B);
|
|
|
|
/*
|
|
* Integer, shifts
|
|
*/
|
|
|
|
extern __m128i _mm_slli_si128(__m128i _A, int _Imm);
|
|
extern __m128i _mm_slli_epi16(__m128i _A, int _Count);
|
|
extern __m128i _mm_sll_epi16(__m128i _A, __m128i _Count);
|
|
extern __m128i _mm_slli_epi32(__m128i _A, int _Count);
|
|
extern __m128i _mm_sll_epi32(__m128i _A, __m128i _Count);
|
|
extern __m128i _mm_slli_epi64(__m128i _A, int _Count);
|
|
extern __m128i _mm_sll_epi64(__m128i _A, __m128i _Count);
|
|
extern __m128i _mm_srai_epi16(__m128i _A, int _Count);
|
|
extern __m128i _mm_sra_epi16(__m128i _A, __m128i _Count);
|
|
extern __m128i _mm_srai_epi32(__m128i _A, int _Count);
|
|
extern __m128i _mm_sra_epi32(__m128i _A, __m128i _Count);
|
|
extern __m128i _mm_srli_si128(__m128i _A, int _Imm);
|
|
extern __m128i _mm_srli_epi16(__m128i _A, int _Count);
|
|
extern __m128i _mm_srl_epi16(__m128i _A, __m128i _Count);
|
|
extern __m128i _mm_srli_epi32(__m128i _A, int _Count);
|
|
extern __m128i _mm_srl_epi32(__m128i _A, __m128i _Count);
|
|
extern __m128i _mm_srli_epi64(__m128i _A, int _Count);
|
|
extern __m128i _mm_srl_epi64(__m128i _A, __m128i _Count);
|
|
|
|
/*
|
|
* Integer, comparisons
|
|
*/
|
|
|
|
extern __m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmplt_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmplt_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_cmplt_epi32(__m128i _A, __m128i _B);
|
|
|
|
/*
|
|
* Integer, converts
|
|
*/
|
|
|
|
extern __m128i _mm_cvtsi32_si128(int _A);
|
|
extern int _mm_cvtsi128_si32(__m128i _A);
|
|
|
|
/*
|
|
* Integer, misc
|
|
*/
|
|
|
|
extern __m128i _mm_packs_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_packs_epi32(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_packus_epi16(__m128i _A, __m128i _B);
|
|
extern int _mm_extract_epi16(__m128i _A, int _Imm);
|
|
extern __m128i _mm_insert_epi16(__m128i _A, int _B, int _Imm);
|
|
extern int _mm_movemask_epi8(__m128i _A);
|
|
extern __m128i _mm_shuffle_epi32(__m128i _A, int _Imm);
|
|
extern __m128i _mm_shufflehi_epi16(__m128i _A, int _Imm);
|
|
extern __m128i _mm_shufflelo_epi16(__m128i _A, int _Imm);
|
|
extern __m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B);
|
|
extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B);
|
|
|
|
/*
|
|
* Integer, loads
|
|
*/
|
|
|
|
extern __m128i _mm_load_si128(__m128i const*_P);
|
|
extern __m128i _mm_loadu_si128(__m128i const*_P);
|
|
extern __m128i _mm_loadl_epi64(__m128i const*_P);
|
|
|
|
/*
|
|
* Integer, sets
|
|
*/
|
|
|
|
extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0);
|
|
extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0);
|
|
extern __m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4,
|
|
short _W3, short _W2, short _W1, short _W0);
|
|
extern __m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12,
|
|
char _B11, char _B10, char _B9, char _B8,
|
|
char _B7, char _B6, char _B5, char _B4,
|
|
char _B3, char _B2, char _B1, char _B0);
|
|
extern __m128i _mm_set1_epi64(__m64 _Q);
|
|
extern __m128i _mm_set1_epi32(int _I);
|
|
extern __m128i _mm_set1_epi16(short _W);
|
|
extern __m128i _mm_set1_epi8(char _B);
|
|
extern __m128i _mm_setl_epi64(__m128i _Q);
|
|
extern __m128i _mm_setr_epi64(__m64 _Q0, __m64 _Q1);
|
|
extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3);
|
|
extern __m128i _mm_setr_epi16(short _W0, short _W1, short _W2, short _W3,
|
|
short _W4, short _W5, short _W6, short _W7);
|
|
extern __m128i _mm_setr_epi8(char _B15, char _B14, char _B13, char _B12,
|
|
char _B11, char _B10, char _B9, char _B8,
|
|
char _B7, char _B6, char _B5, char _B4,
|
|
char _B3, char _B2, char _B1, char _B0);
|
|
extern __m128i _mm_setzero_si128(void);
|
|
|
|
/*
|
|
* Integer, stores
|
|
*/
|
|
|
|
extern void _mm_store_si128(__m128i *_P, __m128i _B);
|
|
extern void _mm_storeu_si128(__m128i *_P, __m128i _B);
|
|
extern void _mm_storel_epi64(__m128i *_P, __m128i _Q);
|
|
extern void _mm_maskmoveu_si128(__m128i _D, __m128i _N, char *_P);
|
|
|
|
/*
|
|
* Integer, moves
|
|
*/
|
|
|
|
extern __m128i _mm_move_epi64(__m128i _Q);
|
|
extern __m128i _mm_movpi64_epi64(__m64 _Q);
|
|
extern __m64 _mm_movepi64_pi64(__m128i _Q);
|
|
|
|
/*
|
|
* Cacheability support
|
|
*/
|
|
|
|
extern void _mm_stream_pd(double *_Dp, __m128d _A);
|
|
extern void _mm_stream_si128(__m128i *_P, __m128i _A);
|
|
extern void _mm_clflush(void const*_P);
|
|
extern void _mm_lfence(void);
|
|
extern void _mm_mfence(void);
|
|
extern void _mm_stream_si32(int *_P, int _I);
|
|
extern void _mm_pause(void);
|
|
|
|
/*
|
|
* New convert to float
|
|
*/
|
|
|
|
extern double _mm_cvtsd_f64(__m128d _A);
|
|
|
|
/*
|
|
* Support for casting between various SP, DP, INT vector types.
|
|
* Note that these do no conversion of values, they just change
|
|
* the type.
|
|
*/
|
|
|
|
extern __m128 _mm_castpd_ps(__m128d);
|
|
extern __m128i _mm_castpd_si128(__m128d);
|
|
extern __m128d _mm_castps_pd(__m128);
|
|
extern __m128i _mm_castps_si128(__m128);
|
|
extern __m128 _mm_castsi128_ps(__m128i);
|
|
extern __m128d _mm_castsi128_pd(__m128i);
|
|
|
|
/*
 * Support for 64-bit extension intrinsics
 *
 * Declared only when _M_X64 is defined.  Parameters are named _A/_B,
 * matching the 32-bit counterparts declared earlier in this header
 * (e.g. _mm_cvtsi32_sd(__m128d _A, int _B)).
 */

#if defined (_M_X64)
extern __int64 _mm_cvtsd_si64(__m128d _A);
extern __int64 _mm_cvttsd_si64(__m128d _A);
extern __m128d _mm_cvtsi64_sd(__m128d _A, __int64 _B);
extern __m128i _mm_cvtsi64_si128(__int64 _A);
extern __int64 _mm_cvtsi128_si64(__m128i _A);

/*
 * Alternate intrinsic name definitions
 *
 * NOTE(review): _mm_stream_si64x, the target of this alias, is not declared
 * in this header as shown; presumably it is declared elsewhere (e.g. in
 * <intrin.h>) -- confirm before relying on _mm_stream_si64 from this header
 * alone.
 */
#define _mm_stream_si64 _mm_stream_si64x
#endif /* defined (_M_X64) */
|
|
|
|
#if defined __cplusplus
/*
 * Closes the extern "C" linkage block opened before the prototypes.
 * No ';' follows the brace: a linkage-specification is not terminated by a
 * semicolon, and a stray one is an empty declaration that pedantic
 * (pre-C++11) builds diagnose.
 */
} /* End "C" */
#endif /* defined __cplusplus */
|
|
|
|
#endif /* defined (_M_CEE_PURE) */
|
|
|
|
#endif /* _INCLUDED_EMM */
|
|
#endif /* __midl */
|