; (removed: garbled extraction residue — a table cell of concatenated line numbers 1..1565, not assembler source)
- include ksamd64.inc
- EXTERNDEF s_sosemanukMulTables:FAR
- .CODE
- ALIGN 8
- Salsa20_OperateKeystream PROC FRAME
; Salsa20 keystream generator (Windows x64 MASM, ksamd64.inc FRAME/unwind macros).
; Register usage observed in this procedure (Microsoft x64 calling convention):
;   rcx = output byte pointer (alignment probed with "test rcx, 15" before stores)
;   rdx = optional input pointer XORed into the keystream; may be NULL ("test rdx, rdx")
;   r8  = number of 64-byte blocks to generate (4-way SIMD path while r8 >= 4)
;   r9  = round count, consumed two rounds per loop iteration ("sub eax, 2")
;   [rsp + 5*8] = 5th argument: pointer to the 16-dword cipher state -> r10
; Stack frame: 8 pad + 32*16 working/broadcast area + 10*16 spill for the
; nonvolatile xmm6-xmm15 (saved at 0200h..0290h, restored at label4).
- mov r10, [rsp + 5*8]
- alloc_stack(10*16 + 32*16 + 8)
- save_xmm128 xmm6, 0200h
- save_xmm128 xmm7, 0210h
- save_xmm128 xmm8, 0220h
- save_xmm128 xmm9, 0230h
- save_xmm128 xmm10, 0240h
- save_xmm128 xmm11, 0250h
- save_xmm128 xmm12, 0260h
- save_xmm128 xmm13, 0270h
- save_xmm128 xmm14, 0280h
- save_xmm128 xmm15, 0290h
- .endprolog
; Fewer than 4 blocks remaining -> single-block path at label5.
- cmp r8, 4
- jl label5
; Broadcast each state dword across a full 16-byte lane at [rsp + i*16 + 256]
; so four blocks are computed in parallel (one block per dword lane).
; Slots (1*4+1)=5 and (2*4+0)=8 are deliberately skipped here: those are the
; two halves of the 64-bit block counter, refilled per 4-block batch at label1.
- movdqa xmm0, [r10 + 0*16]
- movdqa xmm1, [r10 + 1*16]
- movdqa xmm2, [r10 + 2*16]
- movdqa xmm3, [r10 + 3*16]
- pshufd xmm4, xmm0, 0*64+0*16+0*4+0
- movdqa [rsp + (0*4+0)*16 + 256], xmm4
- pshufd xmm4, xmm0, 1*64+1*16+1*4+1
- movdqa [rsp + (0*4+1)*16 + 256], xmm4
- pshufd xmm4, xmm0, 2*64+2*16+2*4+2
- movdqa [rsp + (0*4+2)*16 + 256], xmm4
- pshufd xmm4, xmm0, 3*64+3*16+3*4+3
- movdqa [rsp + (0*4+3)*16 + 256], xmm4
- pshufd xmm4, xmm1, 0*64+0*16+0*4+0
- movdqa [rsp + (1*4+0)*16 + 256], xmm4
- pshufd xmm4, xmm1, 2*64+2*16+2*4+2
- movdqa [rsp + (1*4+2)*16 + 256], xmm4
- pshufd xmm4, xmm1, 3*64+3*16+3*4+3
- movdqa [rsp + (1*4+3)*16 + 256], xmm4
- pshufd xmm4, xmm2, 1*64+1*16+1*4+1
- movdqa [rsp + (2*4+1)*16 + 256], xmm4
- pshufd xmm4, xmm2, 2*64+2*16+2*4+2
- movdqa [rsp + (2*4+2)*16 + 256], xmm4
- pshufd xmm4, xmm2, 3*64+3*16+3*4+3
- movdqa [rsp + (2*4+3)*16 + 256], xmm4
- pshufd xmm4, xmm3, 0*64+0*16+0*4+0
- movdqa [rsp + (3*4+0)*16 + 256], xmm4
- pshufd xmm4, xmm3, 1*64+1*16+1*4+1
- movdqa [rsp + (3*4+1)*16 + 256], xmm4
- pshufd xmm4, xmm3, 2*64+2*16+2*4+2
- movdqa [rsp + (3*4+2)*16 + 256], xmm4
- pshufd xmm4, xmm3, 3*64+3*16+3*4+3
- movdqa [rsp + (3*4+3)*16 + 256], xmm4
- label1:
; Per 4-block batch: fill the four lanes of state words 8 (counter low,
; eax) and 5 (counter high, r11d) with n, n+1, n+2, n+3 using add/adc,
; then write the counter advanced by 4 back to the state in memory.
- mov eax, dword ptr [r10 + 8*4]
- mov r11d, dword ptr [r10 + 5*4]
- mov dword ptr [rsp + 8*16 + 0*4 + 256], eax
- mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d
- add eax, 1
- adc r11d, 0
- mov dword ptr [rsp + 8*16 + 1*4 + 256], eax
- mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d
- add eax, 1
- adc r11d, 0
- mov dword ptr [rsp + 8*16 + 2*4 + 256], eax
- mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d
- add eax, 1
- adc r11d, 0
- mov dword ptr [rsp + 8*16 + 3*4 + 256], eax
- mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d
- add eax, 1
- adc r11d, 0
- mov dword ptr [r10 + 8*4], eax
- mov dword ptr [r10 + 5*4], r11d
; First (odd) round, reading the broadcast input copy at +1*256 and writing
; the working state at [rsp + 0..15*16].  Four columns run in parallel in
; register groups (xmm0,1,2,3) (4,5,6,7) (8,9,10,11) (12,13,14,15).
; Each quarter-round rotation by k is pslld k / psrld 32-k / two pxor
; (Salsa20's rotation constants 7, 9, 13, 18 -- presumably; verify against
; the reference specification).
- movdqa xmm0, [rsp + 12*16 + 1*256]
- movdqa xmm4, [rsp + 13*16 + 1*256]
- movdqa xmm8, [rsp + 14*16 + 1*256]
- movdqa xmm12, [rsp + 15*16 + 1*256]
- movdqa xmm2, [rsp + 0*16 + 1*256]
- movdqa xmm6, [rsp + 1*16 + 1*256]
- movdqa xmm10, [rsp + 2*16 + 1*256]
- movdqa xmm14, [rsp + 3*16 + 1*256]
- paddd xmm0, xmm2
- paddd xmm4, xmm6
- paddd xmm8, xmm10
- paddd xmm12, xmm14
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- pslld xmm0, 7
- pslld xmm4, 7
- pslld xmm8, 7
- pslld xmm12, 7
- psrld xmm1, 32-7
- psrld xmm5, 32-7
- psrld xmm9, 32-7
- psrld xmm13, 32-7
- pxor xmm0, [rsp + 4*16 + 1*256]
- pxor xmm4, [rsp + 5*16 + 1*256]
- pxor xmm8, [rsp + 6*16 + 1*256]
- pxor xmm12, [rsp + 7*16 + 1*256]
- pxor xmm0, xmm1
- pxor xmm4, xmm5
- pxor xmm8, xmm9
- pxor xmm12, xmm13
- movdqa [rsp + 4*16], xmm0
- movdqa [rsp + 5*16], xmm4
- movdqa [rsp + 6*16], xmm8
- movdqa [rsp + 7*16], xmm12
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- paddd xmm0, xmm2
- paddd xmm4, xmm6
- paddd xmm8, xmm10
- paddd xmm12, xmm14
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- pslld xmm0, 9
- pslld xmm4, 9
- pslld xmm8, 9
- pslld xmm12, 9
- psrld xmm3, 32-9
- psrld xmm7, 32-9
- psrld xmm11, 32-9
- psrld xmm15, 32-9
- pxor xmm0, [rsp + 8*16 + 1*256]
- pxor xmm4, [rsp + 9*16 + 1*256]
- pxor xmm8, [rsp + 10*16 + 1*256]
- pxor xmm12, [rsp + 11*16 + 1*256]
- pxor xmm0, xmm3
- pxor xmm4, xmm7
- pxor xmm8, xmm11
- pxor xmm12, xmm15
- movdqa [rsp + 8*16], xmm0
- movdqa [rsp + 9*16], xmm4
- movdqa [rsp + 10*16], xmm8
- movdqa [rsp + 11*16], xmm12
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- paddd xmm0, xmm1
- paddd xmm4, xmm5
- paddd xmm8, xmm9
- paddd xmm12, xmm13
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- pslld xmm0, 13
- pslld xmm4, 13
- pslld xmm8, 13
- pslld xmm12, 13
- psrld xmm1, 32-13
- psrld xmm5, 32-13
- psrld xmm9, 32-13
- psrld xmm13, 32-13
- pxor xmm0, [rsp + 12*16 + 1*256]
- pxor xmm4, [rsp + 13*16 + 1*256]
- pxor xmm8, [rsp + 14*16 + 1*256]
- pxor xmm12, [rsp + 15*16 + 1*256]
- pxor xmm0, xmm1
- pxor xmm4, xmm5
- pxor xmm8, xmm9
- pxor xmm12, xmm13
- movdqa [rsp + 12*16], xmm0
- movdqa [rsp + 13*16], xmm4
- movdqa [rsp + 14*16], xmm8
- movdqa [rsp + 15*16], xmm12
- paddd xmm0, xmm3
- paddd xmm4, xmm7
- paddd xmm8, xmm11
- paddd xmm12, xmm15
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- pslld xmm0, 18
- pslld xmm4, 18
- pslld xmm8, 18
- pslld xmm12, 18
- psrld xmm3, 32-18
- psrld xmm7, 32-18
- psrld xmm11, 32-18
- psrld xmm15, 32-18
- pxor xmm0, xmm2
- pxor xmm4, xmm6
- pxor xmm8, xmm10
- pxor xmm12, xmm14
- pxor xmm0, xmm3
- pxor xmm4, xmm7
- pxor xmm8, xmm11
- pxor xmm12, xmm15
- movdqa [rsp + 0*16], xmm0
- movdqa [rsp + 1*16], xmm4
- movdqa [rsp + 2*16], xmm8
- movdqa [rsp + 3*16], xmm12
; eax = remaining round count; one odd round is already done above, so jump
; into the even-round half of the loop body.
- mov rax, r9
- jmp label2
- labelSSE2_Salsa_Output:
; Internal subroutine (reached only via "call" below; the "jmp label2" above
; skips over it).  On entry xmm4..xmm7 hold four keystream rows (one dword
; lane per block).  Transpose them with punpck*/punpck*qdq into per-block
; order, XOR in the input when rdx != 0 (aligned pxor-from-memory vs movdqu
; variants chosen by "test rdx, 15"), then store via movdqa or movdqu by the
; same alignment test on rcx.  Loads/stores are 64 bytes apart (0/4/8/12*16)
; because the four output blocks are interleaved; pointers advance by 16.
- movdqa xmm0, xmm4
- punpckldq xmm4, xmm5
- movdqa xmm1, xmm6
- punpckldq xmm6, xmm7
- movdqa xmm2, xmm4
- punpcklqdq xmm4, xmm6
- punpckhqdq xmm2, xmm6
- punpckhdq xmm0, xmm5
- punpckhdq xmm1, xmm7
- movdqa xmm6, xmm0
- punpcklqdq xmm0, xmm1
- punpckhqdq xmm6, xmm1
- test rdx, rdx
- jz labelSSE2_Salsa_Output_A3
- test rdx, 15
- jnz labelSSE2_Salsa_Output_A7
- pxor xmm4, [rdx+0*16]
- pxor xmm2, [rdx+4*16]
- pxor xmm0, [rdx+8*16]
- pxor xmm6, [rdx+12*16]
- add rdx, 1*16
- jmp labelSSE2_Salsa_Output_A3
- labelSSE2_Salsa_Output_A7:
; Unaligned input: stage through xmm1 with movdqu.
- movdqu xmm1, [rdx+0*16]
- pxor xmm4, xmm1
- movdqu xmm1, [rdx+4*16]
- pxor xmm2, xmm1
- movdqu xmm1, [rdx+8*16]
- pxor xmm0, xmm1
- movdqu xmm1, [rdx+12*16]
- pxor xmm6, xmm1
- add rdx, 1*16
- labelSSE2_Salsa_Output_A3:
- test rcx, 15
- jnz labelSSE2_Salsa_Output_A8
- movdqa [rcx+0*16], xmm4
- movdqa [rcx+4*16], xmm2
- movdqa [rcx+8*16], xmm0
- movdqa [rcx+12*16], xmm6
- jmp labelSSE2_Salsa_Output_A9
- labelSSE2_Salsa_Output_A8:
- movdqu [rcx+0*16], xmm4
- movdqu [rcx+4*16], xmm2
- movdqu [rcx+8*16], xmm0
- movdqu [rcx+12*16], xmm6
- labelSSE2_Salsa_Output_A9:
- add rcx, 1*16
- ret
- label6:
; Odd round of the main double-round loop: identical structure to the batch
; setup round above, but both reads and writes use the working state at
; +0*256 (the +1*256 input copy is only read on the first round).
- movdqa xmm0, [rsp + 12*16 + 0*256]
- movdqa xmm4, [rsp + 13*16 + 0*256]
- movdqa xmm8, [rsp + 14*16 + 0*256]
- movdqa xmm12, [rsp + 15*16 + 0*256]
- movdqa xmm2, [rsp + 0*16 + 0*256]
- movdqa xmm6, [rsp + 1*16 + 0*256]
- movdqa xmm10, [rsp + 2*16 + 0*256]
- movdqa xmm14, [rsp + 3*16 + 0*256]
- paddd xmm0, xmm2
- paddd xmm4, xmm6
- paddd xmm8, xmm10
- paddd xmm12, xmm14
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- pslld xmm0, 7
- pslld xmm4, 7
- pslld xmm8, 7
- pslld xmm12, 7
- psrld xmm1, 32-7
- psrld xmm5, 32-7
- psrld xmm9, 32-7
- psrld xmm13, 32-7
- pxor xmm0, [rsp + 4*16 + 0*256]
- pxor xmm4, [rsp + 5*16 + 0*256]
- pxor xmm8, [rsp + 6*16 + 0*256]
- pxor xmm12, [rsp + 7*16 + 0*256]
- pxor xmm0, xmm1
- pxor xmm4, xmm5
- pxor xmm8, xmm9
- pxor xmm12, xmm13
- movdqa [rsp + 4*16], xmm0
- movdqa [rsp + 5*16], xmm4
- movdqa [rsp + 6*16], xmm8
- movdqa [rsp + 7*16], xmm12
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- paddd xmm0, xmm2
- paddd xmm4, xmm6
- paddd xmm8, xmm10
- paddd xmm12, xmm14
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- pslld xmm0, 9
- pslld xmm4, 9
- pslld xmm8, 9
- pslld xmm12, 9
- psrld xmm3, 32-9
- psrld xmm7, 32-9
- psrld xmm11, 32-9
- psrld xmm15, 32-9
- pxor xmm0, [rsp + 8*16 + 0*256]
- pxor xmm4, [rsp + 9*16 + 0*256]
- pxor xmm8, [rsp + 10*16 + 0*256]
- pxor xmm12, [rsp + 11*16 + 0*256]
- pxor xmm0, xmm3
- pxor xmm4, xmm7
- pxor xmm8, xmm11
- pxor xmm12, xmm15
- movdqa [rsp + 8*16], xmm0
- movdqa [rsp + 9*16], xmm4
- movdqa [rsp + 10*16], xmm8
- movdqa [rsp + 11*16], xmm12
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- paddd xmm0, xmm1
- paddd xmm4, xmm5
- paddd xmm8, xmm9
- paddd xmm12, xmm13
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- pslld xmm0, 13
- pslld xmm4, 13
- pslld xmm8, 13
- pslld xmm12, 13
- psrld xmm1, 32-13
- psrld xmm5, 32-13
- psrld xmm9, 32-13
- psrld xmm13, 32-13
- pxor xmm0, [rsp + 12*16 + 0*256]
- pxor xmm4, [rsp + 13*16 + 0*256]
- pxor xmm8, [rsp + 14*16 + 0*256]
- pxor xmm12, [rsp + 15*16 + 0*256]
- pxor xmm0, xmm1
- pxor xmm4, xmm5
- pxor xmm8, xmm9
- pxor xmm12, xmm13
- movdqa [rsp + 12*16], xmm0
- movdqa [rsp + 13*16], xmm4
- movdqa [rsp + 14*16], xmm8
- movdqa [rsp + 15*16], xmm12
- paddd xmm0, xmm3
- paddd xmm4, xmm7
- paddd xmm8, xmm11
- paddd xmm12, xmm15
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- pslld xmm0, 18
- pslld xmm4, 18
- pslld xmm8, 18
- pslld xmm12, 18
- psrld xmm3, 32-18
- psrld xmm7, 32-18
- psrld xmm11, 32-18
- psrld xmm15, 32-18
- pxor xmm0, xmm2
- pxor xmm4, xmm6
- pxor xmm8, xmm10
- pxor xmm12, xmm14
- pxor xmm0, xmm3
- pxor xmm4, xmm7
- pxor xmm8, xmm11
- pxor xmm12, xmm15
- movdqa [rsp + 0*16], xmm0
- movdqa [rsp + 1*16], xmm4
- movdqa [rsp + 2*16], xmm8
- movdqa [rsp + 3*16], xmm12
- label2:
; Even round: same quarter-round pattern but with the row-rotated operand
; order (7,4,5,6 / 13,14,15,12 / 10,11,8,9), i.e. the transposed half of the
; double round.
- movdqa xmm0, [rsp + 7*16 + 0*256]
- movdqa xmm4, [rsp + 4*16 + 0*256]
- movdqa xmm8, [rsp + 5*16 + 0*256]
- movdqa xmm12, [rsp + 6*16 + 0*256]
- movdqa xmm2, [rsp + 0*16 + 0*256]
- movdqa xmm6, [rsp + 1*16 + 0*256]
- movdqa xmm10, [rsp + 2*16 + 0*256]
- movdqa xmm14, [rsp + 3*16 + 0*256]
- paddd xmm0, xmm2
- paddd xmm4, xmm6
- paddd xmm8, xmm10
- paddd xmm12, xmm14
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- pslld xmm0, 7
- pslld xmm4, 7
- pslld xmm8, 7
- pslld xmm12, 7
- psrld xmm1, 32-7
- psrld xmm5, 32-7
- psrld xmm9, 32-7
- psrld xmm13, 32-7
- pxor xmm0, [rsp + 13*16 + 0*256]
- pxor xmm4, [rsp + 14*16 + 0*256]
- pxor xmm8, [rsp + 15*16 + 0*256]
- pxor xmm12, [rsp + 12*16 + 0*256]
- pxor xmm0, xmm1
- pxor xmm4, xmm5
- pxor xmm8, xmm9
- pxor xmm12, xmm13
- movdqa [rsp + 13*16], xmm0
- movdqa [rsp + 14*16], xmm4
- movdqa [rsp + 15*16], xmm8
- movdqa [rsp + 12*16], xmm12
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- paddd xmm0, xmm2
- paddd xmm4, xmm6
- paddd xmm8, xmm10
- paddd xmm12, xmm14
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- pslld xmm0, 9
- pslld xmm4, 9
- pslld xmm8, 9
- pslld xmm12, 9
- psrld xmm3, 32-9
- psrld xmm7, 32-9
- psrld xmm11, 32-9
- psrld xmm15, 32-9
- pxor xmm0, [rsp + 10*16 + 0*256]
- pxor xmm4, [rsp + 11*16 + 0*256]
- pxor xmm8, [rsp + 8*16 + 0*256]
- pxor xmm12, [rsp + 9*16 + 0*256]
- pxor xmm0, xmm3
- pxor xmm4, xmm7
- pxor xmm8, xmm11
- pxor xmm12, xmm15
- movdqa [rsp + 10*16], xmm0
- movdqa [rsp + 11*16], xmm4
- movdqa [rsp + 8*16], xmm8
- movdqa [rsp + 9*16], xmm12
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- paddd xmm0, xmm1
- paddd xmm4, xmm5
- paddd xmm8, xmm9
- paddd xmm12, xmm13
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
- movdqa xmm9, xmm8
- movdqa xmm13, xmm12
- pslld xmm0, 13
- pslld xmm4, 13
- pslld xmm8, 13
- pslld xmm12, 13
- psrld xmm1, 32-13
- psrld xmm5, 32-13
- psrld xmm9, 32-13
- psrld xmm13, 32-13
- pxor xmm0, [rsp + 7*16 + 0*256]
- pxor xmm4, [rsp + 4*16 + 0*256]
- pxor xmm8, [rsp + 5*16 + 0*256]
- pxor xmm12, [rsp + 6*16 + 0*256]
- pxor xmm0, xmm1
- pxor xmm4, xmm5
- pxor xmm8, xmm9
- pxor xmm12, xmm13
- movdqa [rsp + 7*16], xmm0
- movdqa [rsp + 4*16], xmm4
- movdqa [rsp + 5*16], xmm8
- movdqa [rsp + 6*16], xmm12
- paddd xmm0, xmm3
- paddd xmm4, xmm7
- paddd xmm8, xmm11
- paddd xmm12, xmm15
- movdqa xmm3, xmm0
- movdqa xmm7, xmm4
- movdqa xmm11, xmm8
- movdqa xmm15, xmm12
- pslld xmm0, 18
- pslld xmm4, 18
- pslld xmm8, 18
- pslld xmm12, 18
- psrld xmm3, 32-18
- psrld xmm7, 32-18
- psrld xmm11, 32-18
- psrld xmm15, 32-18
- pxor xmm0, xmm2
- pxor xmm4, xmm6
- pxor xmm8, xmm10
- pxor xmm12, xmm14
- pxor xmm0, xmm3
- pxor xmm4, xmm7
- pxor xmm8, xmm11
- pxor xmm12, xmm15
- movdqa [rsp + 0*16], xmm0
- movdqa [rsp + 1*16], xmm4
- movdqa [rsp + 2*16], xmm8
- movdqa [rsp + 3*16], xmm12
; Two rounds per pass; loop while rounds remain.
- sub eax, 2
- jnz label6
; Feed-forward: add the saved input words (+256) to the final working state,
; then emit the 4 interleaved blocks in four row-groups via the output
; subroutine (each call writes 4x16 bytes and advances rcx/rdx by 16).
- movdqa xmm4, [rsp + 0*16 + 256]
- paddd xmm4, [rsp + 0*16]
- movdqa xmm5, [rsp + 13*16 + 256]
- paddd xmm5, [rsp + 13*16]
- movdqa xmm6, [rsp + 10*16 + 256]
- paddd xmm6, [rsp + 10*16]
- movdqa xmm7, [rsp + 7*16 + 256]
- paddd xmm7, [rsp + 7*16]
- call labelSSE2_Salsa_Output
- movdqa xmm4, [rsp + 4*16 + 256]
- paddd xmm4, [rsp + 4*16]
- movdqa xmm5, [rsp + 1*16 + 256]
- paddd xmm5, [rsp + 1*16]
- movdqa xmm6, [rsp + 14*16 + 256]
- paddd xmm6, [rsp + 14*16]
- movdqa xmm7, [rsp + 11*16 + 256]
- paddd xmm7, [rsp + 11*16]
- call labelSSE2_Salsa_Output
- movdqa xmm4, [rsp + 8*16 + 256]
- paddd xmm4, [rsp + 8*16]
- movdqa xmm5, [rsp + 5*16 + 256]
- paddd xmm5, [rsp + 5*16]
- movdqa xmm6, [rsp + 2*16 + 256]
- paddd xmm6, [rsp + 2*16]
- movdqa xmm7, [rsp + 15*16 + 256]
- paddd xmm7, [rsp + 15*16]
- call labelSSE2_Salsa_Output
- movdqa xmm4, [rsp + 12*16 + 256]
- paddd xmm4, [rsp + 12*16]
- movdqa xmm5, [rsp + 9*16 + 256]
- paddd xmm5, [rsp + 9*16]
- movdqa xmm6, [rsp + 6*16 + 256]
- paddd xmm6, [rsp + 6*16]
- movdqa xmm7, [rsp + 3*16 + 256]
- paddd xmm7, [rsp + 3*16]
- call labelSSE2_Salsa_Output
; The four calls advanced rcx/rdx by 4*16 of the 16*16 bytes consumed per
; batch; catch up the remaining 12*16 (rdx only if non-NULL).
- test rdx, rdx
- jz label9
- add rdx, 12*16
- label9:
- add rcx, 12*16
- sub r8, 4
- cmp r8, 4
- jge label1
- label5:
; Single-block path: process one 64-byte block per iteration with the state
; held as four row registers xmm0..xmm3.
- sub r8, 1
- jl label4
- movdqa xmm0, [r10 + 0*16]
- movdqa xmm1, [r10 + 1*16]
- movdqa xmm2, [r10 + 2*16]
- movdqa xmm3, [r10 + 3*16]
- mov rax, r9
- label0:
; Double round on rows: four quarter-rounds (rotates 7, 9, 13, 18), then
; pshufd lane rotations of xmm1..xmm3, then four more quarter-rounds and the
; inverse pshufd rotations to restore lane order.
- movdqa xmm4, xmm3
- paddd xmm4, xmm0
- movdqa xmm5, xmm4
- pslld xmm4, 7
- psrld xmm5, 32-7
- pxor xmm1, xmm4
- pxor xmm1, xmm5
- movdqa xmm4, xmm0
- paddd xmm4, xmm1
- movdqa xmm5, xmm4
- pslld xmm4, 9
- psrld xmm5, 32-9
- pxor xmm2, xmm4
- pxor xmm2, xmm5
- movdqa xmm4, xmm1
- paddd xmm4, xmm2
- movdqa xmm5, xmm4
- pslld xmm4, 13
- psrld xmm5, 32-13
- pxor xmm3, xmm4
- pxor xmm3, xmm5
- movdqa xmm4, xmm2
- paddd xmm4, xmm3
- movdqa xmm5, xmm4
- pslld xmm4, 18
- psrld xmm5, 32-18
- pxor xmm0, xmm4
- pxor xmm0, xmm5
- pshufd xmm1, xmm1, 2*64+1*16+0*4+3
- pshufd xmm2, xmm2, 1*64+0*16+3*4+2
- pshufd xmm3, xmm3, 0*64+3*16+2*4+1
- movdqa xmm4, xmm1
- paddd xmm4, xmm0
- movdqa xmm5, xmm4
- pslld xmm4, 7
- psrld xmm5, 32-7
- pxor xmm3, xmm4
- pxor xmm3, xmm5
- movdqa xmm4, xmm0
- paddd xmm4, xmm3
- movdqa xmm5, xmm4
- pslld xmm4, 9
- psrld xmm5, 32-9
- pxor xmm2, xmm4
- pxor xmm2, xmm5
- movdqa xmm4, xmm3
- paddd xmm4, xmm2
- movdqa xmm5, xmm4
- pslld xmm4, 13
- psrld xmm5, 32-13
- pxor xmm1, xmm4
- pxor xmm1, xmm5
- movdqa xmm4, xmm2
- paddd xmm4, xmm1
- movdqa xmm5, xmm4
- pslld xmm4, 18
- psrld xmm5, 32-18
- pxor xmm0, xmm4
- pxor xmm0, xmm5
- pshufd xmm1, xmm1, 0*64+3*16+2*4+1
- pshufd xmm2, xmm2, 1*64+0*16+3*4+2
- pshufd xmm3, xmm3, 2*64+1*16+0*4+3
- sub eax, 2
- jnz label0
; Feed-forward with the original state, then bump the 64-bit block counter
; (state dwords 8 = low, 5 = high) in memory.
- paddd xmm0, [r10 + 0*16]
- paddd xmm1, [r10 + 1*16]
- paddd xmm2, [r10 + 2*16]
- paddd xmm3, [r10 + 3*16]
- add dword ptr [r10 + 8*4], 1
- adc dword ptr [r10 + 5*4], 0
; Build dword-select masks in xmm6/xmm7 (pcmpeqb all-ones, psrlq 32, then a
; lane swap), and use pand/por + shufpd to permute the diagonal register
; layout into sequential output word order in xmm4, xmm0, xmm1, xmm2.
- pcmpeqb xmm6, xmm6
- psrlq xmm6, 32
- pshufd xmm7, xmm6, 0*64+1*16+2*4+3
- movdqa xmm4, xmm0
- movdqa xmm5, xmm3
- pand xmm0, xmm7
- pand xmm4, xmm6
- pand xmm3, xmm6
- pand xmm5, xmm7
- por xmm4, xmm5
- movdqa xmm5, xmm1
- pand xmm1, xmm7
- pand xmm5, xmm6
- por xmm0, xmm5
- pand xmm6, xmm2
- pand xmm2, xmm7
- por xmm1, xmm6
- por xmm2, xmm3
- movdqa xmm5, xmm4
- movdqa xmm6, xmm0
- shufpd xmm4, xmm1, 2
- shufpd xmm0, xmm2, 2
- shufpd xmm1, xmm5, 2
- shufpd xmm2, xmm6, 2
; XOR input (if any) and store one contiguous 64-byte block, with the same
; aligned/unaligned variant selection as the 4-block path.
- test rdx, rdx
- jz labelSSE2_Salsa_Output_B3
- test rdx, 15
- jnz labelSSE2_Salsa_Output_B7
- pxor xmm4, [rdx+0*16]
- pxor xmm0, [rdx+1*16]
- pxor xmm1, [rdx+2*16]
- pxor xmm2, [rdx+3*16]
- add rdx, 4*16
- jmp labelSSE2_Salsa_Output_B3
- labelSSE2_Salsa_Output_B7:
- movdqu xmm3, [rdx+0*16]
- pxor xmm4, xmm3
- movdqu xmm3, [rdx+1*16]
- pxor xmm0, xmm3
- movdqu xmm3, [rdx+2*16]
- pxor xmm1, xmm3
- movdqu xmm3, [rdx+3*16]
- pxor xmm2, xmm3
- add rdx, 4*16
- labelSSE2_Salsa_Output_B3:
- test rcx, 15
- jnz labelSSE2_Salsa_Output_B8
- movdqa [rcx+0*16], xmm4
- movdqa [rcx+1*16], xmm0
- movdqa [rcx+2*16], xmm1
- movdqa [rcx+3*16], xmm2
- jmp labelSSE2_Salsa_Output_B9
- labelSSE2_Salsa_Output_B8:
- movdqu [rcx+0*16], xmm4
- movdqu [rcx+1*16], xmm0
- movdqu [rcx+2*16], xmm1
- movdqu [rcx+3*16], xmm2
- labelSSE2_Salsa_Output_B9:
- add rcx, 4*16
- jmp label5
- label4:
; Epilogue: restore nonvolatile xmm6-xmm15 from their unwind-registered
; slots and release the frame allocated in the prologue.
- movdqa xmm6, [rsp + 0200h]
- movdqa xmm7, [rsp + 0210h]
- movdqa xmm8, [rsp + 0220h]
- movdqa xmm9, [rsp + 0230h]
- movdqa xmm10, [rsp + 0240h]
- movdqa xmm11, [rsp + 0250h]
- movdqa xmm12, [rsp + 0260h]
- movdqa xmm13, [rsp + 0270h]
- movdqa xmm14, [rsp + 0280h]
- movdqa xmm15, [rsp + 0290h]
- add rsp, 10*16 + 32*16 + 8
- ret
- Salsa20_OperateKeystream ENDP
- ALIGN 8
- Sosemanuk_OperateKeystream PROC FRAME
- rex_push_reg rsi
- push_reg rdi
- alloc_stack(80*4*2+12*4+8*8 + 2*16+8)
- save_xmm128 xmm6, 02f0h
- save_xmm128 xmm7, 0300h
- .endprolog
- mov rdi, r8
- mov rax, r9
- mov QWORD PTR [rsp+1*8], rdi
- mov QWORD PTR [rsp+2*8], rdx
- mov QWORD PTR [rsp+6*8], rax
- lea rcx, [4*rcx+rcx]
- lea rsi, [4*rcx]
- mov QWORD PTR [rsp+3*8], rsi
- movdqa xmm0, [rax+0*16]
- movdqa [rsp + 8*8+0*16], xmm0
- movdqa xmm0, [rax+1*16]
- movdqa [rsp + 8*8+1*16], xmm0
- movq xmm0, QWORD PTR [rax+2*16]
- movq QWORD PTR [rsp + 8*8+2*16], xmm0
- psrlq xmm0, 32
- movd r10d, xmm0
- mov ecx, [rax+10*4]
- mov edx, [rax+11*4]
- pcmpeqb xmm7, xmm7
- label2:
- lea rdi, [rsp + 8*8 + 12*4]
- mov rax, 80
- cmp rsi, 80
- cmovg rsi, rax
- mov QWORD PTR [rsp+7*8], rsi
- lea rsi, [rdi+rsi]
- mov QWORD PTR [rsp+4*8], rsi
- lea rsi, s_sosemanukMulTables
- label0:
- mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4]
- mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4]
- mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4]
- mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4]
- mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4]
- mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4]
- mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4]
- mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4]
- mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4]
- mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4]
- mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4]
- mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4]
- mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4]
- mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4]
- mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4]
- mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4]
- mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4]
- mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4]
- mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4]
- mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + edx]
- xor r11d, ecx
- mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d
- mov r11d, 1
- and r11d, edx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4]
- add ecx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul edx, 54655307h
- rol edx, 7
- mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d
- mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4]
- mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax
- rol eax, 8
- lea r11d, [r10d + ecx]
- xor r11d, edx
- mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d
- mov r11d, 1
- and r11d, ecx
- neg r11d
- and r11d, r10d
- xor r10d, eax
- movzx eax, al
- xor r10d, [rsi+rax*4]
- mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4]
- xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4]
- add edx, r11d
- movzx r11d, al
- shr eax, 8
- xor r10d, [rsi+1024+r11*4]
- xor r10d, eax
- imul ecx, 54655307h
- rol ecx, 7
- mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d
- add rdi, 5*4
- cmp rdi, QWORD PTR [rsp+4*8]
- jne label0
- mov rax, QWORD PTR [rsp+2*8]
- mov r11, QWORD PTR [rsp+1*8]
- lea rdi, [rsp + 8*8 + 12*4]
- mov rsi, QWORD PTR [rsp+7*8]
- label1:
- movdqa xmm0, [rdi+0*20*4]
- movdqa xmm2, [rdi+2*20*4]
- movdqa xmm3, [rdi+3*20*4]
- movdqa xmm1, [rdi+1*20*4]
- movdqa xmm4, xmm0
- pand xmm0, xmm2
- pxor xmm0, xmm3
- pxor xmm2, xmm1
- pxor xmm2, xmm0
- por xmm3, xmm4
- pxor xmm3, xmm1
- pxor xmm4, xmm2
- movdqa xmm1, xmm3
- por xmm3, xmm4
- pxor xmm3, xmm0
- pand xmm0, xmm1
- pxor xmm4, xmm0
- pxor xmm1, xmm3
- pxor xmm1, xmm4
- pxor xmm4, xmm7
- pxor xmm2, [rdi+80*4]
- pxor xmm3, [rdi+80*5]
- pxor xmm1, [rdi+80*6]
- pxor xmm4, [rdi+80*7]
- cmp rsi, 16
- jl label4
- movdqa xmm6, xmm2
- punpckldq xmm2, xmm3
- movdqa xmm5, xmm1
- punpckldq xmm1, xmm4
- movdqa xmm0, xmm2
- punpcklqdq xmm2, xmm1
- punpckhqdq xmm0, xmm1
- punpckhdq xmm6, xmm3
- punpckhdq xmm5, xmm4
- movdqa xmm3, xmm6
- punpcklqdq xmm6, xmm5
- punpckhqdq xmm3, xmm5
- test rax, rax
- jz labelSSE2_Sosemanuk_Output3
- test rax, 15
- jnz labelSSE2_Sosemanuk_Output7
- pxor xmm2, [rax+0*16]
- pxor xmm0, [rax+1*16]
- pxor xmm6, [rax+2*16]
- pxor xmm3, [rax+3*16]
- add rax, 4*16
- jmp labelSSE2_Sosemanuk_Output3
- labelSSE2_Sosemanuk_Output7:
- movdqu xmm1, [rax+0*16]
- pxor xmm2, xmm1
- movdqu xmm1, [rax+1*16]
- pxor xmm0, xmm1
- movdqu xmm1, [rax+2*16]
- pxor xmm6, xmm1
- movdqu xmm1, [rax+3*16]
- pxor xmm3, xmm1
- add rax, 4*16
- labelSSE2_Sosemanuk_Output3:
- test r11, 15
- jnz labelSSE2_Sosemanuk_Output8
- movdqa [r11+0*16], xmm2
- movdqa [r11+1*16], xmm0
- movdqa [r11+2*16], xmm6
- movdqa [r11+3*16], xmm3
- jmp labelSSE2_Sosemanuk_Output9
- labelSSE2_Sosemanuk_Output8:
- movdqu [r11+0*16], xmm2
- movdqu [r11+1*16], xmm0
- movdqu [r11+2*16], xmm6
- movdqu [r11+3*16], xmm3
- labelSSE2_Sosemanuk_Output9:
- add r11, 4*16
- add rdi, 4*4
- sub rsi, 16
- jnz label1
- mov rsi, QWORD PTR [rsp+3*8]
- sub rsi, 80
- jz label6
- mov QWORD PTR [rsp+3*8], rsi
- mov QWORD PTR [rsp+2*8], rax
- mov QWORD PTR [rsp+1*8], r11
- jmp label2
- label4:
- test rax, rax
- jz label5
- movd xmm0, dword ptr [rax+0*4]
- pxor xmm2, xmm0
- movd xmm0, dword ptr [rax+1*4]
- pxor xmm3, xmm0
- movd xmm0, dword ptr [rax+2*4]
- pxor xmm1, xmm0
- movd xmm0, dword ptr [rax+3*4]
- pxor xmm4, xmm0
- add rax, 16
- label5:
- movd dword ptr [r11+0*4], xmm2
- movd dword ptr [r11+1*4], xmm3
- movd dword ptr [r11+2*4], xmm1
- movd dword ptr [r11+3*4], xmm4
- sub rsi, 4
- jz label6
- add r11, 16
- psrldq xmm2, 4
- psrldq xmm3, 4
- psrldq xmm1, 4
- psrldq xmm4, 4
- jmp label4
- label6:
- mov r10, QWORD PTR [rsp+6*8]
- movdqa xmm0, [rsp + 8*8+0*16]
- movdqa [r10+0*16], xmm0
- movdqa xmm0, [rsp + 8*8+1*16]
- movdqa [r10+1*16], xmm0
- movq xmm0, QWORD PTR [rsp + 8*8+2*16]
- movq QWORD PTR [r10+2*16], xmm0
- mov [r10+10*4], ecx
- mov [r10+11*4], edx
- movdqa xmm6, [rsp + 02f0h]
- movdqa xmm7, [rsp + 0300h]
- add rsp, 80*4*2+12*4+8*8 + 2*16+8
- pop rdi
- pop rsi
- ret
- Sosemanuk_OperateKeystream ENDP
- ;-----------------------------------------------------------------------
- ; Panama_SSE2_Pull -- SSE2 implementation of the Panama "pull" (keystream
- ; extraction) operation.
- ; ABI:   Microsoft x64 (Windows). PROC FRAME + .endprolog emit SEH unwind info.
- ; In:    rcx = iteration count (each iteration produces 32 bytes of keystream)
- ;        rdx = state pointer: 17 state words at [rdx+0..4*16], position
- ;              counter at [rdx+4*17], 32-stage x 32-byte circular buffer
- ;              starting at [rdx+20*4]
- ;        r8  = output pointer (may be null: state is advanced, output discarded)
- ;        r9  = input pointer to XOR into the keystream (may be null; may be
- ;              unaligned -- handled via movdqu path)
- ; Clobb: rax, rcx, r9..r11, xmm0-xmm5, flags. xmm6/xmm7 are non-volatile in
- ;        this ABI and are saved/restored on the stack; rdi likewise.
- ;-----------------------------------------------------------------------
- Panama_SSE2_Pull PROC FRAME
- rex_push_reg rdi                       ; save non-volatile rdi (unwind-tracked)
- alloc_stack(2*16)                      ; room for two xmm saves
- save_xmm128 xmm6, 0h                   ; xmm6/xmm7 are callee-saved on Win x64
- save_xmm128 xmm7, 10h
- .endprolog
- shl rcx, 5                             ; rcx = iteration count * 32 bytes
- jz label5                              ; zero iterations: restore and return
- mov r10d, [rdx+4*17]                   ; r10 = current circular-buffer position
- add rcx, r10
- mov rdi, rcx                           ; rdi = end position (loop bound)
- ; Load the 17-word state: 16 words in xmm0..xmm3, the 17th in eax.
- movdqa xmm0, xmmword ptr [rdx+0*16]
- movdqa xmm1, xmmword ptr [rdx+1*16]
- movdqa xmm2, xmmword ptr [rdx+2*16]
- movdqa xmm3, xmmword ptr [rdx+3*16]
- mov eax, dword ptr [rdx+4*16]
- label4:
- ; --- build lane-rotated neighbor vectors for the nonlinear layer ---
- ; movss merges the low dword, pshufd 0x39 (= 0*64+3*16+2*4+1) rotates the
- ; four dword lanes, giving each lane its "next" state word.
- movdqa xmm6, xmm2
- movss xmm6, xmm3
- pshufd xmm5, xmm6, 0*64+3*16+2*4+1
- movd xmm6, eax
- movdqa xmm7, xmm3
- movss xmm7, xmm6
- pshufd xmm6, xmm7, 0*64+3*16+2*4+1
- ; Scalar lane of the nonlinear layer: eax ^= ~(word from xmm2) | (word from xmm3)
- movd ecx, xmm2
- not ecx
- movd r11d, xmm3
- or ecx, r11d
- xor eax, ecx
- ; --- nonlinear layer + rotations, 4 words at a time ---
- ; pcmpeqb xmm7,xmm7 sets all-ones, so the following pxor is a bitwise NOT;
- ; each extracted word is rotated by j*(j+1)/2 mod 32 with j = 5*i mod 17 and
- ; stored at the permuted position ((j*13+16) mod 17) -- presumably Panama's
- ; gamma/pi layers (constants match that schedule; confirm against the spec).
- pcmpeqb xmm7, xmm7
- pxor xmm7, xmm1
- por xmm7, xmm2
- pxor xmm7, xmm3                        ; xmm7 = state ^ (~next | next2), lanes 1,5,9,13
- movd ecx, xmm7
- rol ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2    ; bring lane 1 into the low dword
- movd ecx, xmm7
- rol ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx
- punpckhqdq xmm7, xmm7                  ; bring the high qword down (lanes 2,3)
- movd ecx, xmm7
- rol ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
- movd ecx, xmm7
- rol ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx
- ; Same pattern for lanes 2,6,10,14 (inputs xmm0/xmm1/xmm2).
- pcmpeqb xmm7, xmm7
- pxor xmm7, xmm0
- por xmm7, xmm1
- pxor xmm7, xmm2
- movd ecx, xmm7
- rol ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
- movd ecx, xmm7
- rol ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx
- punpckhqdq xmm7, xmm7
- movd ecx, xmm7
- rol ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
- movd ecx, xmm7
- rol ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx
- ; Lanes 3,7,11,15 (inputs xmm6/xmm0/xmm1; xmm6 is the rotated vector built above).
- pcmpeqb xmm7, xmm7
- pxor xmm7, xmm6
- por xmm7, xmm0
- pxor xmm7, xmm1
- movd ecx, xmm7
- rol ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
- movd ecx, xmm7
- rol ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx
- punpckhqdq xmm7, xmm7
- movd ecx, xmm7
- rol ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
- movd ecx, xmm7
- rol ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx
- ; Lanes 4,8,12,16 (inputs xmm5/xmm6/xmm0).
- pcmpeqb xmm7, xmm7
- pxor xmm7, xmm5
- por xmm7, xmm6
- pxor xmm7, xmm0
- movd ecx, xmm7
- rol ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
- movd ecx, xmm7
- rol ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx
- punpckhqdq xmm7, xmm7
- movd ecx, xmm7
- rol ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx
- pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
- movd ecx, xmm7
- rol ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32))
- mov [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx
- ; --- interleave state words into output order ---
- movdqa xmm4, xmm3
- punpcklqdq xmm3, xmm2
- punpckhdq xmm4, xmm2
- movdqa xmm2, xmm1
- punpcklqdq xmm1, xmm0
- punpckhdq xmm2, xmm0
- test r8, r8                            ; no output buffer: skip keystream emission
- jz label0
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm2
- punpckhqdq xmm6, xmm2
- ; Optionally XOR in 32 bytes of input; separate aligned/unaligned paths.
- test r9, 15
- jnz label2                             ; unaligned (and non-null) input
- test r9, r9
- jz label1                              ; null input: emit raw keystream
- pxor xmm4, [r9]                        ; aligned input path
- pxor xmm6, [r9+16]
- add r9, 32
- jmp label1
- label2:
- movdqu xmm0, [r9]                      ; unaligned input path
- movdqu xmm2, [r9+16]
- pxor xmm4, xmm0
- pxor xmm6, xmm2
- add r9, 32
- label1:
- ; Store 32 bytes of output; aligned/unaligned paths.
- test r8, 15
- jnz label3
- movdqa xmmword ptr [r8], xmm4
- movdqa xmmword ptr [r8+16], xmm6
- add r8, 32
- jmp label0
- label3:
- movdqu xmmword ptr [r8], xmm4
- movdqu xmmword ptr [r8+16], xmm6
- add r8, 32
- label0:
- ; --- update the 32-stage circular buffer (stages are 32 bytes apart) ---
- ; "and reg, 31*32" masks the byte offset to stay within the 32-stage ring.
- lea rcx, [r10 + 32]                    ; stage (pos+1) mod 32
- and rcx, 31*32
- lea r11, [r10 + (32-24)*32]            ; stage (pos-24) mod 32
- and r11, 31*32
- movdqa xmm0, xmmword ptr [rdx+20*4+rcx+0*8]
- pxor xmm3, xmm0
- pshufd xmm0, xmm0, 2*64+3*16+0*4+1     ; imm 0xB1: swap dwords within qwords
- movdqa xmmword ptr [rdx+20*4+rcx+0*8], xmm3
- pxor xmm0, xmmword ptr [rdx+20*4+r11+2*8]
- movdqa xmmword ptr [rdx+20*4+r11+2*8], xmm0
- movdqa xmm4, xmmword ptr [rdx+20*4+rcx+2*8]
- pxor xmm1, xmm4
- movdqa xmmword ptr [rdx+20*4+rcx+2*8], xmm1
- pxor xmm4, xmmword ptr [rdx+20*4+r11+0*8]
- movdqa xmmword ptr [rdx+20*4+r11+0*8], xmm4
- ; --- diffusion layer: reload permuted state written by the rol/mov stores ---
- movdqa xmm3, xmmword ptr [rdx+3*16]
- movdqa xmm2, xmmword ptr [rdx+2*16]
- movdqa xmm1, xmmword ptr [rdx+1*16]
- movdqa xmm0, xmmword ptr [rdx+0*16]
- ; Rebuild the lane-rotated neighbor vectors (same movss/pshufd trick as above).
- movd xmm6, eax
- movdqa xmm7, xmm3
- movss xmm7, xmm6
- movdqa xmm6, xmm2
- movss xmm6, xmm3
- movdqa xmm5, xmm1
- movss xmm5, xmm2
- movdqa xmm4, xmm0
- movss xmm4, xmm1
- pshufd xmm7, xmm7, 0*64+3*16+2*4+1
- pshufd xmm6, xmm6, 0*64+3*16+2*4+1
- pshufd xmm5, xmm5, 0*64+3*16+2*4+1
- pshufd xmm4, xmm4, 0*64+3*16+2*4+1
- xor eax, 1                             ; constant injection into word 0
- movd ecx, xmm0
- xor eax, ecx
- movd ecx, xmm3
- xor eax, ecx
- ; Each word is XORed with its two following neighbors (theta-style diffusion).
- pxor xmm3, xmm2
- pxor xmm2, xmm1
- pxor xmm1, xmm0
- pxor xmm0, xmm7
- pxor xmm3, xmm7
- pxor xmm2, xmm6
- pxor xmm1, xmm5
- pxor xmm0, xmm4
- ; --- fold buffer stages (pos-4) and (pos+16) back into the state ---
- lea rcx, [r10 + (32-4)*32]             ; stage (pos-4) mod 32
- and rcx, 31*32
- lea r11, [r10 + 16*32]                 ; stage (pos+16) mod 32
- and r11, 31*32
- movdqa xmm4, xmmword ptr [rdx+20*4+rcx+0*16]
- movdqa xmm5, xmmword ptr [rdx+20*4+r11+0*16]
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- punpckhqdq xmm6, xmm5
- pxor xmm3, xmm4
- pxor xmm2, xmm6
- movdqa xmm4, xmmword ptr [rdx+20*4+rcx+1*16]
- movdqa xmm5, xmmword ptr [rdx+20*4+r11+1*16]
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- punpckhqdq xmm6, xmm5
- pxor xmm1, xmm4
- pxor xmm0, xmm6
- add r10, 32                            ; advance buffer position one stage
- cmp r10, rdi
- jne label4
- ; Write the updated 17-word state back. NOTE(review): the position counter at
- ; [rdx+4*17] is not stored here -- presumably the caller/wrapper maintains it;
- ; confirm against the C++ side.
- mov [rdx+4*16], eax
- movdqa xmmword ptr [rdx+3*16], xmm3
- movdqa xmmword ptr [rdx+2*16], xmm2
- movdqa xmmword ptr [rdx+1*16], xmm1
- movdqa xmmword ptr [rdx+0*16], xmm0
- label5:
- ; Epilogue: restore callee-saved xmm6/xmm7 and rdi, unwind the stack.
- movdqa xmm6, [rsp + 0h]
- movdqa xmm7, [rsp + 10h]
- add rsp, 2*16
- pop rdi
- ret
- Panama_SSE2_Pull ENDP
- _TEXT ENDS
- END
|