// ppc_simd.h - written and placed in public domain by Jeffrey Walton

/// \file ppc_simd.h
/// \brief Support functions for PowerPC and vector operations
/// \details This header provides an agnostic interface into Clang, GCC
/// and IBM XL C/C++ compilers modulo their different built-in functions
/// for accessing vector instructions.
/// \details The abstractions are necessary to support back to GCC 4.8 and
/// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
/// default compiler for GCC112, GCC119 and others on the compile farm.
/// Older IBM XL C/C++ compilers also need the hacks due to lack of
/// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
/// compilers provide the best support and don't need many of the hacks
/// below.
/// \details The library is tested with the following PowerPC machines and
/// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
/// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
/// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
/// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
/// - GCC110, Linux, POWER7, GCC 4.8.5
/// - GCC110, Linux, POWER7, XLC 12.01
/// - GCC111, AIX, POWER7, GCC 4.8.1
/// - GCC111, AIX, POWER7, XLC 12.01
/// - GCC112, Linux, POWER8, GCC 4.8.5
/// - GCC112, Linux, POWER8, XLC 13.01
/// - GCC112, Linux, POWER8, Clang 7.0
/// - GCC119, AIX, POWER8, GCC 7.2.0
/// - GCC119, AIX, POWER8, XLC 13.01
/// - GCC135, Linux, POWER9, GCC 7.0
/// \details 12 machines are used for testing because the three compilers form
/// six profiles. The profiles are listed below.
/// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
/// - XLC 13.0 and earlier (all IBM components)
/// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
/// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
/// - early LLVM Clang (traditional Clang compiler)
/// - late LLVM Clang (traditional Clang compiler)
/// \details The LLVM front-end makes it tricky to write portable code because
/// LLVM pretends to be other compilers but cannot consume other compilers'
/// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
/// be GCC, Clang and XLC all at once, but it can only consume its own
/// variety of builtins.
/// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
/// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
/// changed to <tt>VecAnd</tt>. The name change helped consolidate two
/// slightly different implementations.
/// \details At Crypto++ 8.3 the library added select 64-bit functions for
/// 32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>
/// take 32-bit vectors and add or subtract them as if they were vectors
/// with two 64-bit elements. The functions dramatically improve performance
/// for some algorithms on some platforms, like SIMON128 and SPECK128 on
/// Power6 and earlier. For example, SPECK128 improved from 70 cpb to
/// 10 cpb on an old PowerMac. Use the functions as shown below.
/// <pre>
/// \#if defined(_ARCH_PWR8)
/// \# define speck128_t uint64x2_p
/// \#else
/// \# define speck128_t uint32x4_p
/// \#endif
///
/// speck128_t rk, x1, x2, y1, y2;
/// rk = (speck128_t)VecLoadAligned(ptr);
/// x1 = VecRotateRight64<8>(x1);
/// x1 = VecAdd64(x1, y1);
/// ...</pre>
/// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0

// Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
// actual availability of the feature for the source file being compiled.
// The preprocessor macros depend on compiler options like -maltivec, and
// not compiler versions.

// For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
// For XLC see the Compiler Reference manual. For Clang you have to experiment.
// Clang does not document the compiler options, does not reject options it does
// not understand, and pretends to be other compilers even though it cannot
// process the builtins and intrinsics. Clang will waste hours of your time.

// DO NOT USE this pattern in VecLoad and VecStore. We have to use the
// code paths guarded by preprocessor macros because XLC 12 generates
// bad code in some places. To verify the bad code generation, test on
// GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
//
//   inline uint32x4_p VecLoad(const byte src[16])
//   {
//   #if defined(__VSX__) || defined(_ARCH_PWR8)
//       return (uint32x4_p) *(uint8x16_p*)((byte*)src);
//   #else
//       return VecLoad_ALTIVEC(src);
//   #endif
//   }

// We should be able to perform the load using inline asm on Power7 with
// VSX or Power8. The inline asm will avoid C undefined behavior due to
// casting from byte* to word32*. We are safe because our byte* are
// 16-byte aligned for Altivec. Below is the big endian load. Little
// endian would need to follow with xxpermdi for the reversal.
//
//   __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );

// GCC and XLC use integer math for the address (D-form or byte-offset
// in the ISA manual). LLVM uses pointer math for the address (DS-form
// or indexed in the ISA manual). To keep them consistent we calculate
// the address from the offset and pass to a load or store function
// using a 0 offset.
#ifndef CRYPTOPP_PPC_CRYPTO_H
#define CRYPTOPP_PPC_CRYPTO_H

#include "config.h"
#include "misc.h"

#if defined(__ALTIVEC__)
# include <altivec.h>
# undef vector
# undef pixel
# undef bool
#endif

// XL C++ on AIX does not define VSX and does not
// provide an option to set it. We have to set it
// for the code below. This define must stay in
// sync with the define in test_ppc_power7.cpp.
#ifndef CRYPTOPP_DISABLE_POWER7
# if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
#  define __VSX__ 1
# endif
#endif

// XL C++ on AIX does not define CRYPTO and does not
// provide an option to set it. We have to set it
// for the code below. This define must stay in
// sync with the define in test_ppc_power8.cpp.
#ifndef CRYPTOPP_DISABLE_POWER8
# if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
#  define __CRYPTO__ 1
# endif
#endif

/// \brief Cast array to vector pointer
/// \details CONST_V8_CAST casts a const array to a vector
/// pointer for a byte array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define CONST_V8_CAST(x) ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V32_CAST casts a const array to a vector
/// pointer for a word array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define CONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V64_CAST casts a const array to a vector
/// pointer for a double word array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define CONST_V64_CAST(x) ((unsigned long long*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V8_CAST casts a non-const array to a vector
/// pointer for a byte array. The Power ABI says source arrays
/// are non-const, so there is no const to remove for this cast.
#define NCONST_V8_CAST(x) ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V32_CAST casts a non-const array to a vector
/// pointer for a word array. The Power ABI says source arrays
/// are non-const, so there is no const to remove for this cast.
#define NCONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V64_CAST casts a non-const array to a vector
/// pointer for a double word array. The Power ABI says source arrays
/// are non-const, so there is no const to remove for this cast.
#define NCONST_V64_CAST(x) ((unsigned long long*)(x))
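
// A minimal sketch of how the cast macros are used. VecLoad_ALTIVEC()
// below uses the same pattern: vec_ld takes a non-const pointer, so a
// const byte array must pass through CONST_V8_CAST first. The buffer
// name is hypothetical.
//
//   const byte buffer[16] = { /* 16-byte aligned data */ };
//   uint32x4_p v = (uint32x4_p)vec_ld(0, CONST_V8_CAST(buffer));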

// VecLoad_ALTIVEC and VecStore_ALTIVEC are
// too noisy on modern compilers
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated"
#endif

NAMESPACE_BEGIN(CryptoPP)

#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Vector of 8-bit elements
/// \par Wraps
/// __vector unsigned char
/// \since Crypto++ 6.0
typedef __vector unsigned char uint8x16_p;
/// \brief Vector of 16-bit elements
/// \par Wraps
/// __vector unsigned short
/// \since Crypto++ 6.0
typedef __vector unsigned short uint16x8_p;
/// \brief Vector of 32-bit elements
/// \par Wraps
/// __vector unsigned int
/// \since Crypto++ 6.0
typedef __vector unsigned int uint32x4_p;

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Vector of 64-bit elements
/// \details uint64x2_p is available on POWER7 with VSX and above. Most
/// supporting functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>)
/// and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not arrive until POWER8.
/// \par Wraps
/// __vector unsigned long long
/// \since Crypto++ 6.0
typedef __vector unsigned long long uint64x2_p;
#endif // VSX or ARCH_PWR8

/// \brief The 0 vector
/// \return a 32-bit vector of 0's
/// \since Crypto++ 8.0
inline uint32x4_p VecZero()
{
    const uint32x4_p v = {0,0,0,0};
    return v;
}

/// \brief The 1 vector
/// \return a 32-bit vector of 1's
/// \since Crypto++ 8.0
inline uint32x4_p VecOne()
{
    const uint32x4_p v = {1,1,1,1};
    return v;
}
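
// A small usage sketch, assuming VecAdd() which appears later in this
// header. A typical use is initializing a counter or accumulator:
//
//   uint32x4_p ctr = VecZero();
//   ctr = VecAdd(ctr, VecOne());  // each 32-bit lane is now 1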

/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverse() reverses the bytes in a vector
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverse(const T data)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, mask);
#else
    const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
    return (T)vec_perm(data, data, mask);
#endif
}

/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverseLE() reverses the bytes in a vector on
/// little-endian systems.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, mask);
#else
    return data;
#endif
}

/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverseBE() reverses the bytes in a vector on
/// big-endian systems.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, mask);
#else
    return data;
#endif
}
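
// A sketch of the reverse family. VecReverse() always reverses the
// 16 bytes; VecReverseLE() and VecReverseBE() only reverse on the named
// endian system, so the pair can express "convert to or from big endian"
// without preprocessor blocks:
//
//   uint32x4_p x = VecLoad(ptr);      // native endian load
//   uint32x4_p be = VecReverseLE(x);  // big endian value on LE systems;
//                                     // unchanged on BE systems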

/// \name LOAD OPERATIONS
//@{

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
/// you should provide aligned memory addresses.
/// \par Wraps
/// vec_ld, vec_lvsl, vec_perm
/// \sa VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
{
    // Avoid IsAlignedOn for convenience.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    if (addr % 16 == 0)
    {
        return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
        const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
        const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
        return (uint32x4_p)vec_perm(low, high, perm);
    }
}

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
/// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
/// relatively expensive so you should provide aligned memory addresses.
/// \par Wraps
/// vec_ld, vec_lvsl, vec_perm
/// \sa VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
{
    // Avoid IsAlignedOn for convenience.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    if (addr % 16 == 0)
    {
        return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
        const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
        const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
        return (uint32x4_p)vec_perm(low, high, perm);
    }
}
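
// The aligned path above is a single vec_ld. The unaligned path costs
// two vec_ld's plus the vec_lvsl/vec_perm fixup, so callers should
// prefer 16-byte aligned buffers. A sketch with a hypothetical
// unaligned source:
//
//   byte buf[32];
//   uint32x4_p v = VecLoad_ALTIVEC(buf+1);  // works, but pays for
//                                           // the vec_perm fixup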

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details VecLoad() loads a vector from a byte array.
/// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad(const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoad() loads a vector from a byte array.
/// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad(int off, const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

/// \brief Loads a vector from a word array
/// \param src the word array
/// \details VecLoad() loads a vector from a word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint32x4_p VecLoad(const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

/// \brief Loads a vector from a word array
/// \param src the word array
/// \param off offset into the word array
/// \details VecLoad() loads a vector from a word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint32x4_p VecLoad(int off, const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}
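
// A sketch of the offset overloads. The offset is in bytes, not
// elements, and is applied to the effective address before the load.
// The table name is hypothetical:
//
//   word32 table[8];
//   uint32x4_p lo = VecLoad(table);      // table[0]..table[3]
//   uint32x4_p hi = VecLoad(16, table);  // table[4]..table[7]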

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Loads a vector from a double word array
/// \param src the double word array
/// \details VecLoad() loads a vector from a double word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecLoad() with 64-bit elements is available on POWER8 and above.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint64x2_p VecLoad(const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

/// \brief Loads a vector from a double word array
/// \param src the double word array
/// \param off offset into the double word array
/// \details VecLoad() loads a vector from a double word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecLoad() with 64-bit elements is available on POWER8 and above.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint64x2_p VecLoad(int off, const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}
#endif // VSX or ARCH_PWR8
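
// A sketch of the 64-bit load, assuming a POWER8 target where
// uint64x2_p and the overload above are available. The key name is
// hypothetical:
//
//   #if defined(_ARCH_PWR8)
//   word64 key[2];
//   uint64x2_p k = VecLoad(key);  // two 64-bit elements
//   #endif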

/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \details VecLoadAligned() loads a vector from an aligned byte array.
/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on POWER9, vec_ld on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#else
    return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoadAligned() loads a vector from an aligned byte array.
/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on POWER9, vec_ld on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(int off, const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#else
    return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned word array
/// \param src the word array
/// \details VecLoadAligned() loads a vector from an aligned word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
#else
    return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned word array
/// \param src the word array
/// \param off offset into the src word array
/// \details VecLoadAligned() loads a vector from an aligned word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Loads a vector from an aligned double word array
/// \param src the double word array
/// \details VecLoadAligned() loads a vector from an aligned double word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint64x2_p VecLoadAligned(const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
#else
    return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned double word array
/// \param src the double word array
/// \param off offset into the src double word array
/// \details VecLoadAligned() loads a vector from an aligned double word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint64x2_p VecLoadAligned(int off, const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}
#endif
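
// A sketch of the aligned loads. The effective address must be 16-byte
// aligned; CRYPTOPP_ALIGN_DATA is assumed here to be the library's
// alignment macro from config.h:
//
//   CRYPTOPP_ALIGN_DATA(16) byte buf[16];
//   uint32x4_p v = VecLoadAligned(buf);  // vec_ld/vec_xl, no fixups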

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecLoadBE() uses POWER9's <tt>vec_xl_be</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl_be on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoadBE(const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
#elif defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
#else
    return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(src)));
#endif
}

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecLoadBE() uses POWER9's <tt>vec_xl_be</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl_be on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoadBE(int off, const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
#elif defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#else
    return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(addr)));
#endif
}
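
// A sketch of the big-endian load. Algorithms that operate on
// big-endian message words can use VecLoadBE() and get the byte
// reversal for free on little-endian systems. The block name is
// hypothetical:
//
//   const byte block[16] = { /* big-endian message block */ };
//   uint32x4_p w = VecLoadBE(block);  // same logical value on
//                                     // both BE and LE systems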

//@}

/// \name STORE OPERATIONS
//@{

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStore_ALTIVEC() stores a vector to a byte array.
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
/// memory addresses.
/// \details VecStore_ALTIVEC() is used when unaligned stores on POWER7
/// or above are not available.
/// \par Wraps
/// vec_st, vec_ste, vec_lvsr, vec_perm
/// \sa VecStore, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore_ALTIVEC(const T data, byte dest[16])
{
    // Avoid IsAlignedOn for convenience.
    uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    if (addr % 16 == 0)
    {
        vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
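        // The vec_lvsr permute rotates the data so each element lands
        // at its target address. vec_ste then stores one element at a
        // time: a byte at each end, a halfword just inside each end,
        // and four words in the middle. vec_ste rounds the effective
        // address down to the element's natural alignment, so the
        // eight stores cover all 16 bytes without writing outside
        // the array.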
        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
        vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
    }
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStore_ALTIVEC() stores a vector to a byte array.
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
/// memory addresses.
/// \details VecStore_ALTIVEC() is used when unaligned stores on POWER7
/// or above are not available.
/// \par Wraps
/// vec_st, vec_ste, vec_lvsr, vec_perm
/// \sa VecStore, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
{
    // Avoid IsAlignedOn for convenience.
    uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    if (addr % 16 == 0)
    {
        vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
        vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
    }
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStore() stores a vector to a byte array.
/// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER9 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on POWER9 and above, Altivec store on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template<class T>
inline void VecStore(const T data, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(dest));
#endif
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStore() stores a vector to a byte array.
/// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER9 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on POWER9 and above, Altivec store on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template<class T>
inline void VecStore(const T data, int off, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param dest the word array
/// \details VecStore() stores a vector to a word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest word array
/// \param dest the word array
/// \details VecStore() stores a vector to a word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, int off, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a double word array
/// \tparam T vector type
/// \param data the vector
/// \param dest the double word array
/// \details VecStore() stores a vector to a double word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecStore() with 64-bit elements is available on POWER8 and above.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, word64 dest[2])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // 32-bit cast is not a typo. Compiler workaround.
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a double word array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest double word array
/// \param dest the double word array
/// \details VecStore() stores a vector to a double word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecStore() with 64-bit elements is available on POWER8 and above.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, int off, word64 dest[2])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // 32-bit cast is not a typo. Compiler workaround.
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStoreAligned() stores a vector to an aligned byte array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// <tt>vec_st</tt> is used if POWER9 is not available. The effective
/// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on POWER9 or above, vec_st on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, byte dest[16])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#else
vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStoreAligned() stores a vector to an aligned byte array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// <tt>vec_st</tt> is used if POWER9 is not available. The effective
/// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on POWER9 or above, vec_st on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, int off, byte dest[16])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#else
vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param dest the word array
/// \details VecStoreAligned() stores a vector to an aligned word array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
/// is used if POWER7 is not available. The effective address of <tt>dest</tt>
/// must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, word32 dest[4])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest word array
/// \param dest the word array
/// \details VecStoreAligned() stores a vector to an aligned word array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
/// is used if POWER7 is not available. The effective address of <tt>dest</tt>
/// must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, int off, word32 dest[4])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
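// Illustrative sketch (not part of the library): VecStoreAligned() assumes
// a 16-byte aligned destination on Altivec. The buffer name is hypothetical.
//   CRYPTOPP_ALIGN_DATA(16) byte buf[16];
//   uint8x16_p v = VecLoadAligned(buf);   // aligned load
//   VecStoreAligned(v, buf);              // aligned store, no fixups needed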
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template <class T>
inline void VecStoreBE(const T data, byte dest[16])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(CRYPTOPP_BIG_ENDIAN)
VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
#else
VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
#endif
}
/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template <class T>
inline void VecStoreBE(const T data, int off, byte dest[16])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(CRYPTOPP_BIG_ENDIAN)
VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
#else
VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
#endif
}
/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param dest the word array
/// \details VecStoreBE() stores a vector to a word array. VecStoreBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template <class T>
inline void VecStoreBE(const T data, word32 dest[4])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(CRYPTOPP_BIG_ENDIAN)
VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
#else
VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
#endif
}
/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest word array
/// \param dest the word array
/// \details VecStoreBE() stores a vector to a word array. VecStoreBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template <class T>
inline void VecStoreBE(const T data, int off, word32 dest[4])
{
// Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
// word pointers. The ISA lacks stores for short* and char*.
// Power9/ISA 3.0 provides vec_xst for all datatypes.
const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
CRYPTOPP_UNUSED(addr);
#if defined(_ARCH_PWR9)
vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(CRYPTOPP_BIG_ENDIAN)
VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
#else
VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
#endif
}
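// Illustrative sketch (not part of the library): VecLoadBE() and
// VecStoreBE() byte-reverse on little endian systems, so they are inverses
// of each other. The buffer name is hypothetical.
//   byte buf[16] = {0};
//   uint8x16_p v = VecLoadBE(buf);  // loads, byte-reversing on little endian
//   VecStoreBE(v, buf);             // stores, byte-reversing on little endian
//   // The two reversals cancel, so buf holds its original bytes.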
//@}
/// \name LOGICAL OPERATIONS
//@{
/// \brief AND two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd() performs <tt>vec1 & vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_and
/// \sa VecAnd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
return (T1)vec_and(vec1, (T1)vec2);
}
/// \brief OR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr() performs <tt>vec1 | vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_or
/// \sa VecOr64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
return (T1)vec_or(vec1, (T1)vec2);
}
/// \brief XOR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor() performs <tt>vec1 ^ vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_xor
/// \sa VecXor64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
return (T1)vec_xor(vec1, (T1)vec2);
}
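// Illustrative sketch (not part of the library): the logical operations
// return the type of the first argument, so mixed vector types are fine.
// Names and values are hypothetical.
//   uint32x4_p a = {1, 2, 3, 4};
//   uint8x16_p b = {0xFF,0xFF,0xFF,0xFF, 0,0,0,0, 0,0,0,0, 0,0,0,0};
//   uint32x4_p c = VecXor(a, b);   // b is cast to uint32x4_p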
//@}
/// \name ARITHMETIC OPERATIONS
//@{
/// \brief Add two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd() performs <tt>vec1 + vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_add
/// \sa VecAdd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
return (T1)vec_add(vec1, (T1)vec2);
}
/// \brief Subtract two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub() performs <tt>vec1 - vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_sub
/// \sa VecSub64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
return (T1)vec_sub(vec1, (T1)vec2);
}
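// Illustrative sketch (not part of the library): VecAdd() and VecSub() are
// element-wise and do not propagate carries between elements (see VecAdd64()
// for 64-bit behavior on 32-bit Altivec). Names and values are hypothetical.
//   uint32x4_p a = {0xFFFFFFFF, 1, 2, 3};
//   uint32x4_p b = {1, 1, 1, 1};
//   uint32x4_p s = VecAdd(a, b);   // {0, 2, 3, 4}; the first element wraps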
//@}
/// \name PERMUTE OPERATIONS
//@{
/// \brief Permutes a vector
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec the vector
/// \param mask vector mask
/// \return vector
/// \details VecPermute() creates a new vector from vec according to mask.
/// mask is an uint8x16_p vector. The return vector is the same type as vec.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecPermute(const T1 vec, const T2 mask)
{
return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
}
/// \brief Permutes two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \param mask vector mask
/// \return vector
/// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.
/// mask is an uint8x16_p vector. The return vector is the same type as vec1.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
{
return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
}
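// Illustrative sketch (not part of the library): the mask selects bytes by
// index. The reversal mask below produces the same result under either
// endian's element numbering. Names are hypothetical.
//   uint8x16_p v = VecLoad(ptr);   // ptr is a hypothetical byte pointer
//   const uint8x16_p rev = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
//   uint8x16_p r = VecPermute(v, rev);   // bytes of v in reverse order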
//@}
/// \name SHIFT AND ROTATE OPERATIONS
//@{
/// \brief Shift a vector left
/// \tparam C shift byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecShiftLeftOctet() returns a new vector after shifting the
/// concatenation of the zero vector and the source vector by the specified
/// number of bytes. The return vector is the same type as vec.
/// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
/// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p x = VecLoad(ptr);
/// uint8x16_p y = VecShiftLeftOctet<12>(x);
/// </pre>
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecShiftLeftOctet(const T vec)
{
const T zero = {0};
if (C >= 16)
{
// Out of range
return zero;
}
else if (C == 0)
{
// Noop
return vec;
}
else
{
#if defined(CRYPTOPP_BIG_ENDIAN)
enum { R=C&0xf };
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
#else
enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
#endif
}
}
/// \brief Shift a vector right
/// \tparam C shift byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecShiftRightOctet() returns a new vector after shifting the
/// concatenation of the zero vector and the source vector by the specified
/// number of bytes. The return vector is the same type as vec.
/// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(z, a,
/// 16-c)</tt>. On little endian machines VecShiftRightOctet() is translated
/// to <tt>vec_sld(a, z, c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p x = VecLoad(ptr);
/// uint8x16_p y = VecShiftRightOctet<12>(x);
/// </pre>
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecShiftRightOctet(const T vec)
{
const T zero = {0};
if (C >= 16)
{
// Out of range
return zero;
}
else if (C == 0)
{
// Noop
return vec;
}
else
{
#if defined(CRYPTOPP_BIG_ENDIAN)
enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
#else
enum { R=C&0xf };
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
#endif
}
}
/// \brief Rotate a vector left
/// \tparam C shift byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecRotateLeftOctet() returns a new vector after rotating the
/// concatenation of the source vector with itself by the specified
/// number of bytes. The return vector is the same type as vec.
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecRotateLeftOctet(const T vec)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
enum { R = C&0xf };
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#else
enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#endif
}
/// \brief Rotate a vector right
/// \tparam C shift byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecRotateRightOctet() returns a new vector after rotating the
/// concatenation of the source vector with itself by the specified
/// number of bytes. The return vector is the same type as vec.
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecRotateRightOctet(const T vec)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#else
enum { R = C&0xf };
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#endif
}
/// \brief Rotate a vector left
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft() rotates each element in a vector by
/// bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 7.0
template<unsigned int C>
inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
{
const uint32x4_p m = {C, C, C, C};
return vec_rl(vec, m);
}
/// \brief Rotate a vector right
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight() rotates each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 7.0
template<unsigned int C>
inline uint32x4_p VecRotateRight(const uint32x4_p vec)
{
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
return vec_rl(vec, m);
}
/// \brief Shift a vector left
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftLeft() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_sl
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint32x4_p VecShiftLeft(const uint32x4_p vec)
{
const uint32x4_p m = {C, C, C, C};
return vec_sl(vec, m);
}
/// \brief Shift a vector right
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftRight() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_sr
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint32x4_p VecShiftRight(const uint32x4_p vec)
{
const uint32x4_p m = {C, C, C, C};
return vec_sr(vec, m);
}
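// Illustrative sketch (not part of the library): rotate and shift amounts
// are template parameters, so they are fixed at compile time. Names and
// values are hypothetical.
//   uint32x4_p v = {0x80000001, 0, 0, 0};
//   uint32x4_p r = VecRotateLeft<1>(v);   // {0x00000003, 0, 0, 0}
//   uint32x4_p s = VecShiftRight<1>(v);   // {0x40000000, 0, 0, 0}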
// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Rotate a vector left
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft() rotates each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecRotateLeft() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.0
template<unsigned int C>
inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
{
const uint64x2_p m = {C, C};
return vec_rl(vec, m);
}
/// \brief Shift a vector left
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftLeft() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecShiftLeft() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_sl
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint64x2_p VecShiftLeft(const uint64x2_p vec)
{
const uint64x2_p m = {C, C};
return vec_sl(vec, m);
}
/// \brief Rotate a vector right
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight() rotates each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecRotateRight() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.0
template<unsigned int C>
inline uint64x2_p VecRotateRight(const uint64x2_p vec)
{
const uint64x2_p m = {64-C, 64-C};
return vec_rl(vec, m);
}
/// \brief Shift a vector right
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftRight() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecShiftRight() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_sr
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint64x2_p VecShiftRight(const uint64x2_p vec)
{
const uint64x2_p m = {C, C};
return vec_sr(vec, m);
}
#endif // _ARCH_PWR8
//@}
/// \name OTHER OPERATIONS
//@{
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeLow() interleaves the elements from the low
/// halves of vec1 and vec2.
/// \par Wraps
/// vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
return vec_mergel(vec1, vec2);
}
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeHigh() interleaves the elements from the high
/// halves of vec1 and vec2.
/// \par Wraps
/// vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
return vec_mergeh(vec1, vec2);
}
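// Illustrative sketch (not part of the library): vec_mergeh/vec_mergel
// interleave elements from the high or low halves of the two inputs.
// Big endian element ordering is shown; lane numbering differs on little
// endian. Names and values are hypothetical.
//   uint32x4_p a = {0, 1, 2, 3};
//   uint32x4_p b = {4, 5, 6, 7};
//   uint32x4_p hi = VecMergeHigh(a, b);   // {0, 4, 1, 5} on big endian
//   uint32x4_p lo = VecMergeLow(a, b);    // {2, 6, 3, 7} on big endian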
/// \brief Broadcast 32-bit word to a vector
/// \param val the 32-bit value
/// \return vector
/// \par Wraps
/// vec_splats
/// \since Crypto++ 8.3
inline uint32x4_p VecSplatWord(word32 val)
{
// Fix spurious GCC warning???
CRYPTOPP_UNUSED(val);
// Apple Altivec and XL C++ do not offer vec_splats.
// GCC offers vec_splats back to -mcpu=power4.
#if defined(_ARCH_PWR4) && defined(__GNUC__)
return vec_splats(val);
#else
//const word32 x[4] = {val,val,val,val};
//return VecLoad(x);
const word32 x[4] = {val};
return vec_splat(VecLoad(x),0);
#endif
}
/// \brief Broadcast 32-bit element to a vector
/// \tparam N the element number
/// \param val the vector
/// \return vector
/// \par Wraps
/// vec_splat
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint32x4_p VecSplatElement(const uint32x4_p val)
{
return vec_splat(val, N);
}
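// Illustrative sketch (not part of the library): names are hypothetical.
//   uint32x4_p k = VecSplatWord(0xDEADBEEFu);  // all four elements set
//   uint32x4_p e = VecSplatElement<2>(k);      // broadcast element 2 to all lanes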
#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Broadcast 64-bit double word to a vector
/// \param val the 64-bit value
/// \return vector
/// \par Wraps
/// vec_splats
/// \since Crypto++ 8.3
inline uint64x2_p VecSplatWord(word64 val)
{
// The PPC64 ABI says so.
return vec_splats((unsigned long long)val);
}
/// \brief Broadcast 64-bit element to a vector
/// \tparam N the element number
/// \param val the vector
/// \return vector
/// \par Wraps
/// vec_splat
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint64x2_p VecSplatElement(const uint64x2_p val)
{
#if defined(__VSX__) || defined(_ARCH_PWR8)
return vec_splat(val, N);
#else
enum {E=N&1};
if (E == 0)
{
const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
return vec_perm(val, val, m);
}
else // (E == 1)
{
const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
return vec_perm(val, val, m);
}
#endif
}
#endif
/// \brief Extract a dword from a vector
/// \tparam T vector type
/// \param val the vector
/// \return vector created from low dword
/// \details VecGetLow() extracts the low dword from a vector. The low dword
/// is composed of the least significant bits and occupies bytes 8 through 15
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
/// \par Wraps
/// vec_sld
/// \since Crypto++ 7.0
template <class T>
inline T VecGetLow(const T val)
{
#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
const T zero = {0};
return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
#else
return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
#endif
}
/// \brief Extract a dword from a vector
/// \tparam T vector type
/// \param val the vector
/// \return vector created from high dword
/// \details VecGetHigh() extracts the high dword from a vector. The high dword
/// is composed of the most significant bits and occupies bytes 0 through 7
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
/// \par Wraps
/// vec_sld
/// \since Crypto++ 7.0
template <class T>
inline T VecGetHigh(const T val)
{
#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
const T zero = {0};
return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
#else
return VecShiftRightOctet<8>(val);
#endif
}
/// \brief Exchange high and low double words
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \par Wraps
/// vec_sld
/// \since Crypto++ 7.0
template <class T>
inline T VecSwapWords(const T vec)
{
return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
}
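// Illustrative sketch (not part of the library): names are hypothetical.
//   uint64x2_p v = VecLoad(ptr64);    // ptr64 is a hypothetical word64*
//   uint64x2_p lo = VecGetLow(v);     // low dword, upper half zeroed
//   uint64x2_p hi = VecGetHigh(v);    // high dword, upper half zeroed
//   uint64x2_p sw = VecSwapWords(v);  // the two double words exchanged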
//@}
/// \name COMPARISON
//@{
/// \brief Compare two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return true if vec1 equals vec2, false otherwise
/// \details VecEqual() performs a bitwise compare. The vector element types do
/// not matter.
/// \par Wraps
/// vec_all_eq
/// \since Crypto++ 8.0
template <class T1, class T2>
inline bool VecEqual(const T1 vec1, const T2 vec2)
{
return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}
/// \brief Compare two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return true if vec1 does not equal vec2, false otherwise
/// \details VecNotEqual() performs a bitwise compare. The vector element types do
/// not matter.
/// \par Wraps
/// vec_all_eq
/// \since Crypto++ 8.0
template <class T1, class T2>
inline bool VecNotEqual(const T1 vec1, const T2 vec2)
{
return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}
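// Illustrative sketch (not part of the library): the compare is bitwise over
// the full 16 bytes, so mixed element types are fine. Names are hypothetical.
//   uint8x16_p a = VecLoad(ptr1);   // ptr1, ptr2 are hypothetical byte*
//   uint8x16_p b = VecLoad(ptr2);
//   if (VecNotEqual(a, b))
//       { /* the 16-byte blocks differ */ }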
//@}
////////////////// 32-bit Altivec /////////////////
/// \name 32-BIT ALTIVEC
//@{
/// \brief Add two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
/// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
/// the carries from the elements.
/// \par Wraps
/// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
/// \since Crypto++ 8.3
inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
// 64-bit elements available at POWER7 with VSX, but vaddudm requires POWER8
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
#else
// The carry mask selects carries for elements 1 and 3 and sets the
// remaining elements to 0. The result is then shifted so the
// carried values are added to elements 0 and 2.
#if defined(CRYPTOPP_BIG_ENDIAN)
const uint32x4_p zero = {0, 0, 0, 0};
const uint32x4_p mask = {0, 1, 0, 1};
#else
const uint32x4_p zero = {0, 0, 0, 0};
const uint32x4_p mask = {1, 0, 1, 0};
#endif
uint32x4_p cy = vec_addc(vec1, vec2);
uint32x4_p res = vec_add(vec1, vec2);
cy = vec_and(mask, cy);
cy = vec_sld (cy, zero, 4);
return vec_add(res, cy);
#endif
}
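// Illustrative sketch (not part of the library): on 32-bit Altivec a carry
// out of the low 32-bit half must be propagated into the high half by hand,
// which is what the fallback above does. Big endian element ordering is
// shown; names and values are hypothetical.
//   uint32x4_p a = {0, 0xFFFFFFFF, 0, 0};  // {0x00000000FFFFFFFF, 0} as 64-bit
//   uint32x4_p b = {0, 1, 0, 0};
//   uint32x4_p s = VecAdd64(a, b);         // {0x0000000100000000, 0} as 64-bit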
#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Add two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
/// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
/// the carries from the elements.
/// \par Wraps
/// vec_add for POWER8
/// \since Crypto++ 8.3
inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
{
// 64-bit elements available at POWER7 with VSX, but vaddudm requires POWER8
const uint64x2_p res = vec_add(vec1, vec2);
#if defined(CRYPTOPP_DEBUG)
// Test 32-bit add in debug builds while we are here.
const uint32x4_p x = (uint32x4_p)vec1;
const uint32x4_p y = (uint32x4_p)vec2;
const uint32x4_p r = VecAdd64(x, y);
CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif
return res;
}
#endif
/// \brief Subtract two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
/// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
/// manages the borrows from the elements.
/// \par Wraps
/// vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec
/// \since Crypto++ 8.3
inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
// 64-bit elements available at POWER7 with VSX, but vsubudm requires POWER8
return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
#else
// The borrow mask selects borrows for elements 1 and 3 and sets the
// remaining elements to 0. The result is then shifted so the
// borrowed values are subtracted from elements 0 and 2.
#if defined(CRYPTOPP_BIG_ENDIAN)
const uint32x4_p zero = {0, 0, 0, 0};
const uint32x4_p mask = {0, 1, 0, 1};
#else
const uint32x4_p zero = {0, 0, 0, 0};
const uint32x4_p mask = {1, 0, 1, 0};
#endif
// subc sets the complement of borrow, so we have to
// un-complement it using andc.
uint32x4_p bw = vec_subc(vec1, vec2);
uint32x4_p res = vec_sub(vec1, vec2);
bw = vec_andc(mask, bw);
bw = vec_sld (bw, zero, 4);
return vec_sub(res, bw);
#endif
}
#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Subtract two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
/// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
/// manages the borrows from the elements.
/// \par Wraps
/// vec_sub for POWER8
/// \since Crypto++ 8.3
inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
{
// 64-bit elements available at POWER7 with VSX, but vsubudm requires POWER8
const uint64x2_p res = vec_sub(vec1, vec2);
#if defined(CRYPTOPP_DEBUG)
// Test 32-bit sub in debug builds while we are here.
const uint32x4_p x = (uint32x4_p)vec1;
const uint32x4_p y = (uint32x4_p)vec2;
const uint32x4_p r = VecSub64(x, y);
CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif
return res;
}
#endif
/// \brief Rotate a vector left as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft64() rotates each element in a vector by bit count.
/// vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint32x4_p VecRotateLeft64(const uint32x4_p vec)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
#else
// C=0, 32, or 64 needs special handling. That is S32 and S64 below.
enum {S64=C&63, S32=C&31, BR=(S64>=32)};
// Get the low bits, shift them to high bits
uint32x4_p t1 = VecShiftLeft<S32>(vec);
// Get the high bits, shift them to low bits
uint32x4_p t2 = VecShiftRight<32-S32>(vec);
if (S64 == 0)
{
const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
return VecPermute(vec, m);
}
else if (S64 == 32)
{
const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
return VecPermute(vec, m);
}
else if (BR) // Big rotate amount?
{
const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
t1 = VecPermute(t1, m);
}
else
{
const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
t2 = VecPermute(t2, m);
}
return vec_or(t1, t2);
#endif
}
/// \brief Rotate a vector left as if uint64x2_p
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft64<8>() rotates each element in a vector
/// by 8-bits. vec is rotated as if uint64x2_p. This specialization
/// is used by algorithms like Speck128.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<>
inline uint32x4_p VecRotateLeft64<8>(const uint32x4_p vec)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
return VecPermute(vec, m);
#else
const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
return VecPermute(vec, m);
#endif
}
#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Rotate a vector left as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft64() rotates each element in a vector by
/// bit count. vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint64x2_p VecRotateLeft64(const uint64x2_p vec)
{
// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
const uint64x2_p res = VecRotateLeft<C>(vec);
#if defined(CRYPTOPP_DEBUG)
// Test 32-bit rotate in debug builds while we are here.
const uint32x4_p x = (uint32x4_p)vec;
const uint32x4_p r = VecRotateLeft64<C>(x);
CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif
return res;
}
#endif
/// \brief Rotate a vector right as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight64() rotates each element in a vector by
/// bit count. vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint32x4_p VecRotateRight64(const uint32x4_p vec)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
#else
// C=0, 32, or 64 needs special handling. That is S32 and S64 below.
enum {S64=C&63, S32=C&31, BR=(S64>=32)};
// Get the high bits, shift them to low bits
uint32x4_p t1 = VecShiftRight<S32>(vec);
// Get the low bits, shift them to high bits
uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
if (S64 == 0)
{
const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
return VecPermute(vec, m);
}
else if (S64 == 32)
{
const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
return VecPermute(vec, m);
}
else if (BR) // Big rotate amount?
{
const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
t1 = VecPermute(t1, m);
}
else
{
const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
t2 = VecPermute(t2, m);
}
return vec_or(t1, t2);
#endif
}
/// \brief Rotate a vector right as if uint64x2_p
/// \param vec the vector
/// \return vector
/// \details VecRotateRight64<8>() rotates each element in a vector
/// by 8-bits. vec is rotated as if uint64x2_p. This specialization
/// is used by algorithms like Speck128.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<>
inline uint32x4_p VecRotateRight64<8>(const uint32x4_p vec)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
return VecPermute(vec, m);
#else
const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
return VecPermute(vec, m);
#endif
}
#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Rotate a vector right as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight64() rotates each element in a vector by
/// bit count. vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint64x2_p VecRotateRight64(const uint64x2_p vec)
{
// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
const uint64x2_p res = VecRotateRight<C>(vec);
#if defined(CRYPTOPP_DEBUG)
// Test 32-bit rotate in debug builds while we are here.
const uint32x4_p x = (uint32x4_p)vec;
const uint32x4_p r = VecRotateRight64<C>(x);
CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif
return res;
}
#endif
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() performs <tt>vec1 & vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \details VecAnd64() is a convenience function that simply performs a VecAnd().
/// \par Wraps
/// vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
return (T1)vec_and(vec1, (T1)vec2);
}
/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() performs <tt>vec1 | vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \details VecOr64() is a convenience function that simply performs a VecOr().
/// \par Wraps
/// vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
return (T1)vec_or(vec1, (T1)vec2);
}
/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() performs <tt>vec1 ^ vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \details VecXor64() is a convenience function that simply performs a VecXor().
/// \par Wraps
/// vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
return (T1)vec_xor(vec1, (T1)vec2);
}
/// \brief Broadcast 64-bit double word to a vector
/// \param val the 64-bit value
/// \return vector
/// \par Wraps
/// vec_splats
/// \since Crypto++ 8.3
inline uint32x4_p VecSplatWord64(word64 val)
{
#if defined(_ARCH_PWR8)
// The PPC64 ABI says so.
return (uint32x4_p)vec_splats((unsigned long long)val);
#else
const word64 x[2] = {val,val};
return (uint32x4_p)VecLoad((const word32*)x);
#endif
}
/// \brief Broadcast 64-bit element to a vector as if uint64x2_p
/// \tparam N the element number
/// \param val the vector
/// \return vector
/// \par Wraps
/// vec_splat
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint32x4_p VecSplatElement64(const uint32x4_p val)
{
#if defined(__VSX__) || defined(_ARCH_PWR8)
return (uint32x4_p)vec_splat((uint64x2_p)val, N);
#else
enum {E=N&1};
if (E == 0)
{
const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
return (uint32x4_p)vec_perm(val, val, m);
}
else // (E == 1)
{
const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
return (uint32x4_p)vec_perm(val, val, m);
}
#endif
}
#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Broadcast 64-bit element to a vector
/// \tparam N the element number
/// \param val the vector
/// \return vector
/// \par Wraps
/// vec_splat
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint64x2_p VecSplatElement64(const uint64x2_p val)
{
return vec_splat(val, N);
}
#endif
//@}
//////////////////////// Power8 Crypto ////////////////////////
// __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
// when it is not available, like with Power7. Sigh...
#if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \name POLYNOMIAL MULTIPLICATION
//@{
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
/// polynomial multiplication multiplies the high and low terms, and then
/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
/// al*bl</tt>. This behavior differs from Intel polynomial
/// multiplication. To obtain a single product without the XOR,
/// set one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
/// \par Wraps
/// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
/// \since Crypto++ 8.1
inline uint32x4_p VecPolyMultiply(const uint32x4_p& a, const uint32x4_p& b)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return __vpmsumw (a, b);
#elif defined(__clang__)
return __builtin_altivec_crypto_vpmsumw (a, b);
#else
return __builtin_crypto_vpmsumw (a, b);
#endif
}
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
/// polynomial multiplication multiplies the high and low terms, and then
/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
/// al*bl</tt>. This behavior differs from Intel polynomial
/// multiplication. To obtain a single product without the XOR,
/// set one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.1
inline uint64x2_p VecPolyMultiply(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
return __vpmsumd (a, b);
#elif defined(__clang__)
return __builtin_altivec_crypto_vpmsumd (a, b);
#else
return __builtin_crypto_vpmsumd (a, b);
#endif
}
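// Illustrative sketch (not part of the library): vpmsum computes two
// carry-less products and XORs them together, so zeroing one term isolates
// a single product. Names and values are hypothetical.
//   uint64x2_p a = {0x02, 0x03};
//   uint64x2_p b = {0x05, 0x07};
//   // al*bl only: VecGetLow() zeroes the high dwords, so ah*bh is 0.
//   uint64x2_p p = VecPolyMultiply(VecGetLow(a), VecGetLow(b));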
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecIntelMultiply00() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
/// The <tt>0x00</tt> indicates the low 64 bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128 bits. The leftmost bit
/// is the MSB and numbered 127, while the rightmost bit is the LSB and
/// numbered 0.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply00(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(VecGetHigh(a), VecGetHigh(b)));
#else
    return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecIntelMultiply01() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
/// The <tt>0x01</tt> indicates the low 64 bits of <tt>a</tt> and high
/// 64 bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128 bits. The leftmost bit
/// is the MSB and numbered 127, while the rightmost bit is the LSB and
/// numbered 0.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply01(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
#else
    return VecPolyMultiply(a, VecGetHigh(b));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecIntelMultiply10() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
/// The <tt>0x10</tt> indicates the high 64 bits of <tt>a</tt> and low
/// 64 bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128 bits. The leftmost bit
/// is the MSB and numbered 127, while the rightmost bit is the LSB and
/// numbered 0.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply10(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
#else
    return VecPolyMultiply(VecGetHigh(a), b);
#endif
}

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecIntelMultiply11() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
/// The <tt>0x11</tt> indicates the high 64 bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128 bits. The leftmost bit
/// is the MSB and numbered 127, while the rightmost bit is the LSB and
/// numbered 0.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply11(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
#else
    return VecPolyMultiply(VecGetLow(a), b);
#endif
}
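
// A minimal usage sketch, not part of the library API: the four partial
// products of a 128x128-bit carry-less multiply, in the form a GCM-style
// reduction would consume them. The function and parameter names are
// illustrative only; VecXor() is declared earlier in this header.
inline void ExampleClmulPartialProducts(const uint64x2_p& a, const uint64x2_p& b,
    uint64x2_p& lo, uint64x2_p& mid, uint64x2_p& hi)
{
    lo  = VecIntelMultiply00(a, b);  // al*bl
    hi  = VecIntelMultiply11(a, b);  // ah*bh
    // The two middle terms fold together with XOR before reduction.
    mid = VecXor(VecIntelMultiply01(a, b), VecIntelMultiply10(a, b));
}
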
//@}

/// \name AES ENCRYPTION
//@{

/// \brief One round of AES encryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecEncrypt() performs one round of AES encryption of state
/// using subkey key. The return vector is the same type as state.
/// \details VecEncrypt() is available on POWER8 and above.
/// \par Wraps
/// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecEncrypt(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}

/// \brief Final round of AES encryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecEncryptLast() performs the final round of AES encryption
/// of state using subkey key. The return vector is the same type as state.
/// \details VecEncryptLast() is available on POWER8 and above.
/// \par Wraps
/// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecEncryptLast(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}
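
// A minimal usage sketch, not part of the library API: a full AES-128 block
// encryption built from the round primitives above. The 11-entry subkeys[]
// table and the function name are assumptions for illustration; a real key
// schedule comes from the cipher's key setup.
inline uint8x16_p ExampleAes128Encrypt(uint8x16_p block, const uint8x16_p subkeys[11])
{
    block = VecXor(block, subkeys[0]);          // initial AddRoundKey
    for (unsigned int i = 1; i < 10; ++i)
        block = VecEncrypt(block, subkeys[i]);  // rounds 1..9
    return VecEncryptLast(block, subkeys[10]);  // round 10, no MixColumns
}
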
/// \brief One round of AES decryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecDecrypt() performs one round of AES decryption of state
/// using subkey key. The return vector is the same type as state.
/// \details VecDecrypt() is available on POWER8 and above.
/// \par Wraps
/// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecDecrypt(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}

/// \brief Final round of AES decryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecDecryptLast() performs the final round of AES decryption
/// of state using subkey key. The return vector is the same type as state.
/// \details VecDecryptLast() is available on POWER8 and above.
/// \par Wraps
/// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecDecryptLast(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}
//@}

/// \name SHA DIGESTS
//@{

/// \brief SHA256 Sigma functions
/// \tparam func function
/// \tparam fmask function mask
/// \tparam T vector type
/// \param data the block to transform
/// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
/// func and fmask. The return vector is the same type as data.
/// \details VecSHA256() is available on POWER8 and above.
/// \par Wraps
/// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <int func, int fmask, class T>
inline T VecSHA256(const T data)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
#elif defined(__clang__)
    return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
#elif defined(__GNUC__)
    return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
#else
    CRYPTOPP_ASSERT(0);
#endif
}

/// \brief SHA512 Sigma functions
/// \tparam func function
/// \tparam fmask function mask
/// \tparam T vector type
/// \param data the block to transform
/// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
/// func and fmask. The return vector is the same type as data.
/// \details VecSHA512() is available on POWER8 and above.
/// \par Wraps
/// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <int func, int fmask, class T>
inline T VecSHA512(const T data)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T)__vshasigmad((uint64x2_p)data, func, fmask);
#elif defined(__clang__)
    return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
#elif defined(__GNUC__)
    return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
#else
    CRYPTOPP_ASSERT(0);
#endif
}
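
// A minimal usage sketch, not part of the library API: one plausible mapping
// of func/fmask to the four SHA-256 sigma functions. The pairs below follow
// the POWER8 vshasigmaw convention as commonly used (func selects lowercase
// sigma versus uppercase Sigma; an fmask of 0 or 0xf selects sigma0 versus
// sigma1). Treat the exact values as an assumption rather than a guarantee
// of this header; the function names are illustrative only.
inline uint32x4_p Example_sigma0(const uint32x4_p x) { return VecSHA256<0,0>(x); }
inline uint32x4_p Example_sigma1(const uint32x4_p x) { return VecSHA256<0,0xf>(x); }
inline uint32x4_p Example_Sigma0(const uint32x4_p x) { return VecSHA256<1,0>(x); }
inline uint32x4_p Example_Sigma1(const uint32x4_p x) { return VecSHA256<1,0xf>(x); }
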
//@}

#endif // __CRYPTO__
#endif // _ALTIVEC_

NAMESPACE_END

#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic pop
#endif

#endif // CRYPTOPP_PPC_CRYPTO_H