mkl_direct_blas_kernels.h 294 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118
  1. /*******************************************************************************
  2. * Copyright 2017-2022 Intel Corporation.
  3. *
  4. * This software and the related documents are Intel copyrighted materials, and
  5. * your use of them is governed by the express license under which they were
  6. * provided to you (License). Unless the License provides otherwise, you may not
  7. * use, modify, copy, publish, distribute, disclose or transmit this software or
  8. * the related documents without Intel's prior written permission.
  9. *
  10. * This software and the related documents are provided as is, with no express
  11. * or implied warranties, other than those that are expressly stated in the
  12. * License.
  13. *******************************************************************************/
  14. /*
  15. ! Content:
  16. ! Intel(R) oneAPI Math Kernel Library (oneMKL) intrinsics code
  17. !******************************************************************************/
  /*
   * MKL_DC_FNAME_GEMM_KERNEL(fname) pastes fname into an identifier of the form
   *     mkl_dc_<fname>_<suffix>_avx2_pst
   * The macro is only defined when __AVX2__ is defined.
   *
   * <suffix> is selected from the specialization macros defined at this point:
   *     a1 : MKL_DC_ALPHA_ONE is defined      ax : it is not
   *     b1 : MKL_DC_BETA_ONE is defined
   *     b0 : MKL_DC_BETA_ZERO is defined      bx : neither BETA macro is defined
   * The branches are tested in the order written, so if both MKL_DC_BETA_ONE
   * and MKL_DC_BETA_ZERO are defined the b1 name is chosen.
   *
   * NOTE(review): presumably the including file sets the ALPHA/BETA macros
   * before each inclusion to generate one kernel per combination; confirm
   * against the includer. MKL_DC_ALPHA_ZERO is not consulted here.
   */
  18. #ifdef __AVX2__
  19. #if defined(MKL_DC_ALPHA_ONE) && defined(MKL_DC_BETA_ONE)
  20. #define MKL_DC_FNAME_GEMM_KERNEL(fname) mkl_dc_ ## fname ## _a1b1_avx2_pst
  21. #elif defined(MKL_DC_ALPHA_ONE) && defined(MKL_DC_BETA_ZERO)
  22. #define MKL_DC_FNAME_GEMM_KERNEL(fname) mkl_dc_ ## fname ## _a1b0_avx2_pst
  23. #elif defined(MKL_DC_ALPHA_ONE)
  /* ALPHA_ONE with neither BETA_ONE nor BETA_ZERO defined. */
  24. #define MKL_DC_FNAME_GEMM_KERNEL(fname) mkl_dc_ ## fname ## _a1bx_avx2_pst
  25. #elif defined(MKL_DC_BETA_ONE)
  /* From here on MKL_DC_ALPHA_ONE is known to be undefined. */
  26. #define MKL_DC_FNAME_GEMM_KERNEL(fname) mkl_dc_ ## fname ## _axb1_avx2_pst
  27. #elif defined(MKL_DC_BETA_ZERO)
  28. #define MKL_DC_FNAME_GEMM_KERNEL(fname) mkl_dc_ ## fname ## _axb0_avx2_pst
  29. #else
  /* None of ALPHA_ONE, BETA_ONE, BETA_ZERO is defined. */
  30. #define MKL_DC_FNAME_GEMM_KERNEL(fname) mkl_dc_ ## fname ## _axbx_avx2_pst
  31. #endif
  /* Closes the __AVX2__ guard around the name-selection macro. */
  32. #endif
  33. #ifdef __AVX2__
  34. #ifdef MKL_DOUBLE
  35. static __inline void MKL_DC_FNAME_GEMM_KERNEL(dgemm_nn_mnk)
  36. (MKL_INT m, MKL_INT n, MKL_INT kK,
  37. const mkl_dc_type * ALPHA,
  38. const mkl_dc_type * A, MKL_INT lda,
  39. const mkl_dc_type * B, MKL_INT ldb,
  40. const mkl_dc_type * BETA,
  41. mkl_dc_type * C, MKL_INT ldc)
  42. {
  43. #undef MKL_DC_AA
  44. #undef MKL_DC_BB
  45. #undef MKL_DC_CC
  46. #define MKL_DC_AA(i,j) ((A)[(i)+lda*(j)])
  47. #define MKL_DC_BB(i,j) ((B)[(i)+ldb*(j)])
  48. #define MKL_DC_CC(i,j) ((C)[(i)+ldc*(j)])
  49. const MKL_INT m_in_ker = 8;
  50. const MKL_INT n_in_ker = 4;
  51. const MKL_INT k_in_ker = 4;
  52. const MKL_INT MKER1 = 4;
  53. const MKL_INT MKER2 = 2;
  54. const MKL_INT MKER3 = 1;
  55. const MKL_INT MKER4 = 0;
  56. MKL_INT m0 = (m/m_in_ker)*m_in_ker;
  57. MKL_INT n0 = (n/n_in_ker)*n_in_ker;
  58. MKL_INT k0 = (kK/k_in_ker)*k_in_ker;
  59. MKL_INT krem = kK - k0;
  60. MKL_DC_YMMTYPE ymm_temp;
  61. MKL_DC_YMMTYPE ymm_temp0, ymm_temp1;
  62. MKL_DC_YMMTYPE ymm_temp2, ymm_temp3;
  63. MKL_DC_YMMTYPE ymm_temp4, ymm_temp5;
  64. MKL_DC_YMMTYPE ymm_temp6, ymm_temp7;
  65. MKL_DC_YMMTYPE ymm_c0, ymm_c1;
  66. MKL_DC_YMMTYPE ymm_c2, ymm_c3;
  67. MKL_DC_YMMTYPE ymm_c4, ymm_c5;
  68. MKL_DC_YMMTYPE ymm_c6, ymm_c7;
  69. MKL_DC_YMMTYPE ymm_a, ymm_a1, ymm_b;
  70. MKL_DC_YMMTYPE ymm_alpha;
  71. MKL_DC_XMMTYPE xmm_a, xmm_b;
  72. MKL_DC_XMMTYPE xmm_temp0, xmm_temp3, xmm_temp5, xmm_temp7;
  73. MKL_DC_XMMTYPE xmm_temp;
  74. MKL_DC_XMMTYPE xmm_c, xmm_c3, xmm_c5, xmm_c7;
  75. MKL_DC_XMMTYPE xmm_alpha;
  76. #if !defined(MKL_DC_ALPHA_ZERO) && !defined(MKL_DC_ALPHA_ONE)
  77. ymm_alpha = MKL_DC_BCAST_YMM(ALPHA);
  78. xmm_alpha = MKL_DC_CAST_YMM_TO_XMM(ymm_alpha);
  79. #endif
  80. #if !defined(MKL_DC_BETA_ZERO) && !defined(MKL_DC_BETA_ONE)
  81. MKL_DC_YMMTYPE ymm_beta = MKL_DC_BCAST_YMM(BETA);
  82. MKL_DC_XMMTYPE xmm_beta = MKL_DC_CAST_YMM_TO_XMM(ymm_beta);
  83. #endif
  84. MKL_INT j;
  85. for (j=0; j<n0; j+=n_in_ker) {
  86. MKL_INT i;
  87. for (i=0; i<m0; i+=m_in_ker) {
  88. ymm_temp0 = MKL_DC_SETZERO_YMM();
  89. ymm_temp1 = MKL_DC_SETZERO_YMM();
  90. ymm_temp2 = MKL_DC_SETZERO_YMM();
  91. ymm_temp3 = MKL_DC_SETZERO_YMM();
  92. ymm_temp4 = MKL_DC_SETZERO_YMM();
  93. ymm_temp5 = MKL_DC_SETZERO_YMM();
  94. ymm_temp6 = MKL_DC_SETZERO_YMM();
  95. ymm_temp7 = MKL_DC_SETZERO_YMM();
  96. MKL_INT k;
  97. for (k=0; k<k0; k+=k_in_ker) {
  98. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  99. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  100. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  101. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  102. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  103. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  104. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  105. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  106. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  107. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  108. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  109. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  110. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  111. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  112. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  113. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  114. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  115. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  116. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  117. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  118. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  119. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  120. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  121. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  122. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  123. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+3));
  124. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  125. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  126. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  127. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  128. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  129. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  130. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  131. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  132. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  133. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  134. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  135. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  136. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  137. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+3));
  138. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  139. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  140. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  141. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  142. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  143. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  144. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  145. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  146. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  147. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  148. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  149. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  150. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  151. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+3));
  152. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  153. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  154. }
  155. if (krem & 2) {
  156. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  157. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  158. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  159. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  160. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  161. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  162. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  163. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  164. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  165. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  166. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  167. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  168. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  169. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  170. k++;
  171. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  172. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  173. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  174. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  175. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  176. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  177. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  178. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  179. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  180. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  181. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  182. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  183. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  184. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  185. k++;
  186. }
  187. if (krem & 1) {
  188. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  189. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  190. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  191. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  192. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  193. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  194. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  195. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  196. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  197. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  198. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  199. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  200. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  201. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  202. k++;
  203. }
  204. #if !defined(MKL_DC_BETA_ZERO)
  205. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  206. #if !defined(MKL_DC_BETA_ONE)
  207. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  208. #endif
  209. #if defined(MKL_DC_ALPHA_ONE)
  210. ymm_c0 = MKL_DC_ADD_YMM(ymm_c0, ymm_temp0);
  211. #else
  212. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  213. #endif
  214. #else
  215. #if !defined(MKL_DC_ALPHA_ONE)
  216. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  217. #else
  218. ymm_c0 = ymm_temp0;
  219. #endif
  220. #endif
  221. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  222. #if !defined(MKL_DC_BETA_ZERO)
  223. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  224. #if !defined(MKL_DC_BETA_ONE)
  225. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  226. #endif
  227. #if defined(MKL_DC_ALPHA_ONE)
  228. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  229. #else
  230. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  231. #endif
  232. #else
  233. #if !defined(MKL_DC_ALPHA_ONE)
  234. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  235. #else
  236. ymm_c1 = ymm_temp1;
  237. #endif
  238. #endif
  239. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  240. #if !defined(MKL_DC_BETA_ZERO)
  241. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  242. #if !defined(MKL_DC_BETA_ONE)
  243. ymm_c2 = MKL_DC_MUL_YMM(ymm_c2, ymm_beta);
  244. #endif
  245. #if defined(MKL_DC_ALPHA_ONE)
  246. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  247. #else
  248. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  249. #endif
  250. #else
  251. #if !defined(MKL_DC_ALPHA_ONE)
  252. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  253. #else
  254. ymm_c2 = ymm_temp2;
  255. #endif
  256. #endif
  257. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  258. #if !defined(MKL_DC_BETA_ZERO)
  259. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+1));
  260. #if !defined(MKL_DC_BETA_ONE)
  261. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  262. #endif
  263. #if defined(MKL_DC_ALPHA_ONE)
  264. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  265. #else
  266. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  267. #endif
  268. #else
  269. #if !defined(MKL_DC_ALPHA_ONE)
  270. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  271. #else
  272. ymm_c3 = ymm_temp3;
  273. #endif
  274. #endif
  275. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+1), ymm_c3);
  276. #if !defined(MKL_DC_BETA_ZERO)
  277. ymm_c4 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  278. #if !defined(MKL_DC_BETA_ONE)
  279. ymm_c4 = MKL_DC_MUL_YMM(ymm_c4, ymm_beta);
  280. #endif
  281. #if defined(MKL_DC_ALPHA_ONE)
  282. ymm_c4 = MKL_DC_ADD_YMM(ymm_temp4, ymm_c4);
  283. #else
  284. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp4, ymm_c4, ymm_temp);
  285. #endif
  286. #else
  287. #if !defined(MKL_DC_ALPHA_ONE)
  288. ymm_c4 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp4);
  289. #else
  290. ymm_c4 = ymm_temp4;
  291. #endif
  292. #endif
  293. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c4);
  294. #if !defined(MKL_DC_BETA_ZERO)
  295. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+2));
  296. #if !defined(MKL_DC_BETA_ONE)
  297. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  298. #endif
  299. #if defined(MKL_DC_ALPHA_ONE)
  300. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  301. #else
  302. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  303. #endif
  304. #else
  305. #if !defined(MKL_DC_ALPHA_ONE)
  306. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  307. #else
  308. ymm_c5 = ymm_temp5;
  309. #endif
  310. #endif
  311. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+2), ymm_c5);
  312. #if !defined(MKL_DC_BETA_ZERO)
  313. ymm_c6 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+3));
  314. #if !defined(MKL_DC_BETA_ONE)
  315. ymm_c6 = MKL_DC_MUL_YMM(ymm_c6, ymm_beta);
  316. #endif
  317. #if defined(MKL_DC_ALPHA_ONE)
  318. ymm_c6 = MKL_DC_ADD_YMM(ymm_temp6, ymm_c6);
  319. #else
  320. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp6, ymm_c6, ymm_temp);
  321. #endif
  322. #else
  323. #if !defined(MKL_DC_ALPHA_ONE)
  324. ymm_c6 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp6);
  325. #else
  326. ymm_c6 = ymm_temp6;
  327. #endif
  328. #endif
  329. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+3), ymm_c6);
  330. #if !defined(MKL_DC_BETA_ZERO)
  331. ymm_c7 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+3));
  332. #if !defined(MKL_DC_BETA_ONE)
  333. ymm_c7 = MKL_DC_MUL_YMM(ymm_c7, ymm_beta);
  334. #endif
  335. #if defined(MKL_DC_ALPHA_ONE)
  336. ymm_c7 = MKL_DC_ADD_YMM(ymm_temp7, ymm_c7);
  337. #else
  338. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp7, ymm_c7, ymm_temp);
  339. #endif
  340. #else
  341. #if !defined(MKL_DC_ALPHA_ONE)
  342. ymm_c7 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp7);
  343. #else
  344. ymm_c7 = ymm_temp7;
  345. #endif
  346. #endif
  347. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+3), ymm_c7);
  348. }
  349. if ((m-i) & MKER1) {
  350. ymm_temp0 = MKL_DC_SETZERO_YMM();
  351. ymm_temp3 = MKL_DC_SETZERO_YMM();
  352. ymm_temp5 = MKL_DC_SETZERO_YMM();
  353. ymm_temp7 = MKL_DC_SETZERO_YMM();
  354. MKL_INT k;
  355. for (k=0; k<k0; k+=k_in_ker) {
  356. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  357. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  358. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  359. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  360. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  361. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  362. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  363. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  364. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  365. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  366. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  367. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  368. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  369. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  370. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  371. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  372. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+3));
  373. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  374. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  375. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  376. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  377. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  378. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  379. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  380. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  381. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+3));
  382. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  383. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  384. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  385. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  386. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  387. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  388. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  389. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  390. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+3));
  391. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  392. }
  393. if (krem & 2) {
  394. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  395. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  396. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  397. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  398. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  399. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  400. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  401. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  402. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  403. k++;
  404. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  405. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  406. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  407. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  408. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  409. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  410. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  411. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  412. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  413. k++;
  414. }
  415. if (krem & 1) {
  416. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  417. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  418. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  419. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  420. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  421. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  422. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  423. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  424. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  425. k++;
  426. }
  427. #if !defined(MKL_DC_BETA_ZERO)
  428. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  429. #if !defined(MKL_DC_BETA_ONE)
  430. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  431. #endif
  432. #if defined(MKL_DC_ALPHA_ONE)
  433. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  434. #else
  435. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  436. #endif
  437. #else
  438. #if !defined(MKL_DC_ALPHA_ONE)
  439. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  440. #else
  441. ymm_c0 = ymm_temp0;
  442. #endif
  443. #endif
  444. #if !defined(MKL_DC_BETA_ZERO)
  445. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  446. #if !defined(MKL_DC_BETA_ONE)
  447. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  448. #endif
  449. #if defined(MKL_DC_ALPHA_ONE)
  450. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  451. #else
  452. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  453. #endif
  454. #else
  455. #if !defined(MKL_DC_ALPHA_ONE)
  456. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  457. #else
  458. ymm_c3 = ymm_temp3;
  459. #endif
  460. #endif
  461. #if !defined(MKL_DC_BETA_ZERO)
  462. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  463. #if !defined(MKL_DC_BETA_ONE)
  464. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  465. #endif
  466. #if defined(MKL_DC_ALPHA_ONE)
  467. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  468. #else
  469. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  470. #endif
  471. #else
  472. #if !defined(MKL_DC_ALPHA_ONE)
  473. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  474. #else
  475. ymm_c5 = ymm_temp5;
  476. #endif
  477. #endif
  478. #if !defined(MKL_DC_BETA_ZERO)
  479. ymm_c7 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+3));
  480. #if !defined(MKL_DC_BETA_ONE)
  481. ymm_c7 = MKL_DC_MUL_YMM(ymm_c7, ymm_beta);
  482. #endif
  483. #if defined(MKL_DC_ALPHA_ONE)
  484. ymm_c7 = MKL_DC_ADD_YMM(ymm_temp7, ymm_c7);
  485. #else
  486. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp7, ymm_c7, ymm_temp);
  487. #endif
  488. #else
  489. #if !defined(MKL_DC_ALPHA_ONE)
  490. ymm_c7 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp7);
  491. #else
  492. ymm_c7 = ymm_temp7;
  493. #endif
  494. #endif
  495. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+0), ymm_c0);
  496. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c3);
  497. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c5);
  498. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+3), ymm_c7);
  499. i += MKER1;
  500. }
  501. if ((m-i) & MKER2) {
  502. xmm_temp0 = MKL_DC_SETZERO_XMM();
  503. xmm_temp3 = MKL_DC_SETZERO_XMM();
  504. xmm_temp5 = MKL_DC_SETZERO_XMM();
  505. xmm_temp7 = MKL_DC_SETZERO_XMM();
  506. MKL_INT k;
  507. for (k=0; k<k0; k+=k_in_ker) {
  508. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  509. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  510. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  511. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  512. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  513. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  514. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  515. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  516. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  517. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  518. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  519. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  520. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  521. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  522. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  523. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  524. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  525. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+1));
  526. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  527. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  528. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+2));
  529. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  530. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  531. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+3));
  532. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  533. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  534. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  535. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  536. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  537. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  538. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+1));
  539. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  540. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  541. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+2));
  542. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  543. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  544. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+3));
  545. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  546. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  547. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  548. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  549. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  550. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  551. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+1));
  552. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  553. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  554. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+2));
  555. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  556. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  557. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+3));
  558. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  559. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  560. }
  561. if (krem & 2) {
  562. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  563. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  564. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  565. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  566. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  567. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  568. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  569. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  570. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  571. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  572. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  573. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  574. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  575. k++;
  576. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  577. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  578. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  579. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  580. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  581. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  582. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  583. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  584. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  585. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  586. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  587. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  588. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  589. k++;
  590. }
  591. if (krem & 1) {
  592. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  593. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  594. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  595. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  596. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  597. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  598. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  599. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  600. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  601. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  602. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  603. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  604. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  605. k++;
  606. }
  607. #if !defined(MKL_DC_BETA_ZERO)
  608. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  609. #if !defined(MKL_DC_BETA_ONE)
  610. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  611. #endif
  612. #if defined(MKL_DC_ALPHA_ONE)
  613. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  614. #else
  615. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  616. #endif
  617. #else
  618. #if !defined(MKL_DC_ALPHA_ONE)
  619. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  620. #else
  621. xmm_c = xmm_temp0;
  622. #endif
  623. #endif
  624. #if !defined(MKL_DC_BETA_ZERO)
  625. xmm_c3 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  626. #if !defined(MKL_DC_BETA_ONE)
  627. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  628. #endif
  629. #if defined(MKL_DC_ALPHA_ONE)
  630. xmm_c3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c3);
  631. #else
  632. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  633. #endif
  634. #else
  635. #if !defined(MKL_DC_ALPHA_ONE)
  636. xmm_c3 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  637. #else
  638. xmm_c3 = xmm_temp3;
  639. #endif
  640. #endif
  641. #if !defined(MKL_DC_BETA_ZERO)
  642. xmm_c5 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+2));
  643. #if !defined(MKL_DC_BETA_ONE)
  644. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  645. #endif
  646. #if defined(MKL_DC_ALPHA_ONE)
  647. xmm_c5 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c5);
  648. #else
  649. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  650. #endif
  651. #else
  652. #if !defined(MKL_DC_ALPHA_ONE)
  653. xmm_c5 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  654. #else
  655. xmm_c5 = xmm_temp5;
  656. #endif
  657. #endif
  658. #if !defined(MKL_DC_BETA_ZERO)
  659. xmm_c7 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+3));
  660. #if !defined(MKL_DC_BETA_ONE)
  661. xmm_c7 = MKL_DC_MUL_XMM(xmm_c7, xmm_beta);
  662. #endif
  663. #if defined(MKL_DC_ALPHA_ONE)
  664. xmm_c7 = MKL_DC_ADD_XMM(xmm_temp7, xmm_c7);
  665. #else
  666. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp7, xmm_c7, xmm_temp);
  667. #endif
  668. #else
  669. #if !defined(MKL_DC_ALPHA_ONE)
  670. xmm_c7 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp7);
  671. #else
  672. xmm_c7 = xmm_temp7;
  673. #endif
  674. #endif
  675. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  676. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c3);
  677. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+2), xmm_c5);
  678. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+3), xmm_c7);
  679. i += MKER2;
  680. }
  681. if ((m-i) & MKER3) {
  682. xmm_temp0 = MKL_DC_SETZERO_XMM();
  683. xmm_temp3 = MKL_DC_SETZERO_XMM();
  684. xmm_temp5 = MKL_DC_SETZERO_XMM();
  685. xmm_temp7 = MKL_DC_SETZERO_XMM();
  686. MKL_INT k;
  687. for (k=0; k<k0; k+=k_in_ker) {
  688. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  689. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  690. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  691. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  692. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  693. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  694. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  695. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  696. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  697. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  698. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  699. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  700. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+1));
  701. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  702. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+2));
  703. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  704. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+3));
  705. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  706. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  707. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  708. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  709. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+1));
  710. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  711. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+2));
  712. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  713. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+3));
  714. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  715. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  716. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  717. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  718. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+1));
  719. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  720. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+2));
  721. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  722. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+3));
  723. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  724. }
  725. if (krem & 2) {
  726. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  727. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  728. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  729. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  730. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  731. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  732. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  733. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  734. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  735. k++;
  736. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  737. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  738. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  739. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  740. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  741. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  742. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  743. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  744. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  745. k++;
  746. }
  747. if (krem & 1) {
  748. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  749. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  750. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  751. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  752. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  753. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  754. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  755. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  756. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  757. k++;
  758. }
  759. #if !defined(MKL_DC_BETA_ZERO)
  760. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  761. #if !defined(MKL_DC_BETA_ONE)
  762. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  763. #endif
  764. #if defined(MKL_DC_ALPHA_ONE)
  765. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  766. #else
  767. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  768. #endif
  769. #else
  770. #if !defined(MKL_DC_ALPHA_ONE)
  771. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  772. #else
  773. xmm_c = xmm_temp0;
  774. #endif
  775. #endif
  776. #if !defined(MKL_DC_BETA_ZERO)
  777. xmm_c3 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  778. #if !defined(MKL_DC_BETA_ONE)
  779. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  780. #endif
  781. #if defined(MKL_DC_ALPHA_ONE)
  782. xmm_c3 = MKL_DC_ADD_XMM_S(xmm_temp3, xmm_c3);
  783. #else
  784. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  785. #endif
  786. #else
  787. #if !defined(MKL_DC_ALPHA_ONE)
  788. xmm_c3 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp3);
  789. #else
  790. xmm_c3 = xmm_temp3;
  791. #endif
  792. #endif
  793. #if !defined(MKL_DC_BETA_ZERO)
  794. xmm_c5 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+2));
  795. #if !defined(MKL_DC_BETA_ONE)
  796. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  797. #endif
  798. #if defined(MKL_DC_ALPHA_ONE)
  799. xmm_c5 = MKL_DC_ADD_XMM_S(xmm_temp5, xmm_c5);
  800. #else
  801. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  802. #endif
  803. #else
  804. #if !defined(MKL_DC_ALPHA_ONE)
  805. xmm_c5 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp5);
  806. #else
  807. xmm_c5 = xmm_temp5;
  808. #endif
  809. #endif
  810. #if !defined(MKL_DC_BETA_ZERO)
  811. xmm_c7 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+3));
  812. #if !defined(MKL_DC_BETA_ONE)
  813. xmm_c7 = MKL_DC_MUL_XMM(xmm_c7, xmm_beta);
  814. #endif
  815. #if defined(MKL_DC_ALPHA_ONE)
  816. xmm_c7 = MKL_DC_ADD_XMM_S(xmm_temp7, xmm_c7);
  817. #else
  818. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp7, xmm_c7, xmm_temp);
  819. #endif
  820. #else
  821. #if !defined(MKL_DC_ALPHA_ONE)
  822. xmm_c7 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp7);
  823. #else
  824. xmm_c7 = xmm_temp7;
  825. #endif
  826. #endif
  827. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  828. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1), xmm_c3);
  829. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+2), xmm_c5);
  830. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+3), xmm_c7);
  831. i += MKER3;
  832. }
  833. }
  834. if ((n-j) == 3) {
  835. MKL_INT i;
  836. for (i=0; i<m0; i+=m_in_ker) {
  837. ymm_temp0 = MKL_DC_SETZERO_YMM();
  838. ymm_temp1 = MKL_DC_SETZERO_YMM();
  839. ymm_temp2 = MKL_DC_SETZERO_YMM();
  840. ymm_temp3 = MKL_DC_SETZERO_YMM();
  841. ymm_temp4 = MKL_DC_SETZERO_YMM();
  842. ymm_temp5 = MKL_DC_SETZERO_YMM();
  843. MKL_INT k;
  844. for (k=0; k<k0; k+=k_in_ker) {
  845. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  846. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  847. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  848. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  849. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  850. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  851. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  852. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  853. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  854. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  855. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  856. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  857. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  858. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  859. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  860. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  861. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  862. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  863. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  864. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  865. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  866. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  867. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  868. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  869. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  870. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  871. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  872. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  873. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  874. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  875. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  876. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  877. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  878. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  879. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  880. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  881. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  882. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  883. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  884. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  885. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  886. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  887. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  888. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  889. }
  890. if (krem & 2) {
  891. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  892. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  893. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  894. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  895. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  896. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  897. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  898. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  899. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  900. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  901. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  902. k++;
  903. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  904. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  905. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  906. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  907. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  908. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  909. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  910. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  911. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  912. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  913. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  914. k++;
  915. }
  916. if (krem & 1) {
  917. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  918. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  919. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  920. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  921. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  922. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  923. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  924. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  925. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  926. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  927. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  928. k++;
  929. }
  930. #if !defined(MKL_DC_BETA_ZERO)
  931. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  932. #if !defined(MKL_DC_BETA_ONE)
  933. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  934. #endif
  935. #if defined(MKL_DC_ALPHA_ONE)
  936. ymm_c0 = MKL_DC_ADD_YMM(ymm_c0, ymm_temp0);
  937. #else
  938. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  939. #endif
  940. #else
  941. #if !defined(MKL_DC_ALPHA_ONE)
  942. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  943. #else
  944. ymm_c0 = ymm_temp0;
  945. #endif
  946. #endif
  947. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  948. #if !defined(MKL_DC_BETA_ZERO)
  949. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  950. #if !defined(MKL_DC_BETA_ONE)
  951. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  952. #endif
  953. #if defined(MKL_DC_ALPHA_ONE)
  954. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  955. #else
  956. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  957. #endif
  958. #else
  959. #if !defined(MKL_DC_ALPHA_ONE)
  960. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  961. #else
  962. ymm_c1 = ymm_temp1;
  963. #endif
  964. #endif
  965. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  966. #if !defined(MKL_DC_BETA_ZERO)
  967. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  968. #if !defined(MKL_DC_BETA_ONE)
  969. ymm_c2 = MKL_DC_MUL_YMM(ymm_c2, ymm_beta);
  970. #endif
  971. #if defined(MKL_DC_ALPHA_ONE)
  972. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  973. #else
  974. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  975. #endif
  976. #else
  977. #if !defined(MKL_DC_ALPHA_ONE)
  978. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  979. #else
  980. ymm_c2 = ymm_temp2;
  981. #endif
  982. #endif
  983. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  984. #if !defined(MKL_DC_BETA_ZERO)
  985. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+1));
  986. #if !defined(MKL_DC_BETA_ONE)
  987. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  988. #endif
  989. #if defined(MKL_DC_ALPHA_ONE)
  990. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  991. #else
  992. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  993. #endif
  994. #else
  995. #if !defined(MKL_DC_ALPHA_ONE)
  996. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  997. #else
  998. ymm_c3 = ymm_temp3;
  999. #endif
  1000. #endif
  1001. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+1), ymm_c3);
  1002. #if !defined(MKL_DC_BETA_ZERO)
  1003. ymm_c4 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  1004. #if !defined(MKL_DC_BETA_ONE)
  1005. ymm_c4 = MKL_DC_MUL_YMM(ymm_c4, ymm_beta);
  1006. #endif
  1007. #if defined(MKL_DC_ALPHA_ONE)
  1008. ymm_c4 = MKL_DC_ADD_YMM(ymm_temp4, ymm_c4);
  1009. #else
  1010. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp4, ymm_c4, ymm_temp);
  1011. #endif
  1012. #else
  1013. #if !defined(MKL_DC_ALPHA_ONE)
  1014. ymm_c4 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp4);
  1015. #else
  1016. ymm_c4 = ymm_temp4;
  1017. #endif
  1018. #endif
  1019. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c4);
  1020. #if !defined(MKL_DC_BETA_ZERO)
  1021. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+2));
  1022. #if !defined(MKL_DC_BETA_ONE)
  1023. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  1024. #endif
  1025. #if defined(MKL_DC_ALPHA_ONE)
  1026. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  1027. #else
  1028. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  1029. #endif
  1030. #else
  1031. #if !defined(MKL_DC_ALPHA_ONE)
  1032. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  1033. #else
  1034. ymm_c5 = ymm_temp5;
  1035. #endif
  1036. #endif
  1037. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+2), ymm_c5);
  1038. }
  1039. if ((m-i) & MKER1) {
  1040. ymm_temp0 = MKL_DC_SETZERO_YMM();
  1041. ymm_temp3 = MKL_DC_SETZERO_YMM();
  1042. ymm_temp5 = MKL_DC_SETZERO_YMM();
  1043. MKL_INT k;
  1044. for (k=0; k<k0; k+=k_in_ker) {
  1045. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1046. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1047. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1048. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1049. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1050. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  1051. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  1052. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  1053. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  1054. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1055. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  1056. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1057. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  1058. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  1059. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  1060. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  1061. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1062. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  1063. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1064. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  1065. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  1066. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  1067. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  1068. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1069. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  1070. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1071. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  1072. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  1073. }
  1074. if (krem & 2) {
  1075. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1076. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1077. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1078. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1079. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1080. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  1081. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  1082. k++;
  1083. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1084. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1085. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1086. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1087. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1088. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  1089. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  1090. k++;
  1091. }
  1092. if (krem & 1) {
  1093. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1094. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1095. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1096. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1097. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1098. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  1099. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  1100. k++;
  1101. }
  1102. #if !defined(MKL_DC_BETA_ZERO)
  1103. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  1104. #if !defined(MKL_DC_BETA_ONE)
  1105. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  1106. #endif
  1107. #if defined(MKL_DC_ALPHA_ONE)
  1108. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  1109. #else
  1110. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  1111. #endif
  1112. #else
  1113. #if !defined(MKL_DC_ALPHA_ONE)
  1114. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  1115. #else
  1116. ymm_c0 = ymm_temp0;
  1117. #endif
  1118. #endif
  1119. #if !defined(MKL_DC_BETA_ZERO)
  1120. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  1121. #if !defined(MKL_DC_BETA_ONE)
  1122. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  1123. #endif
  1124. #if defined(MKL_DC_ALPHA_ONE)
  1125. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  1126. #else
  1127. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  1128. #endif
  1129. #else
  1130. #if !defined(MKL_DC_ALPHA_ONE)
  1131. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  1132. #else
  1133. ymm_c3 = ymm_temp3;
  1134. #endif
  1135. #endif
  1136. #if !defined(MKL_DC_BETA_ZERO)
  1137. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  1138. #if !defined(MKL_DC_BETA_ONE)
  1139. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  1140. #endif
  1141. #if defined(MKL_DC_ALPHA_ONE)
  1142. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  1143. #else
  1144. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  1145. #endif
  1146. #else
  1147. #if !defined(MKL_DC_ALPHA_ONE)
  1148. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  1149. #else
  1150. ymm_c5 = ymm_temp5;
  1151. #endif
  1152. #endif
  1153. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+0), ymm_c0);
  1154. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c3);
  1155. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c5);
  1156. i += MKER1;
  1157. }
  1158. if ((m-i) & MKER2) {
  1159. xmm_temp0 = MKL_DC_SETZERO_XMM();
  1160. xmm_temp3 = MKL_DC_SETZERO_XMM();
  1161. xmm_temp5 = MKL_DC_SETZERO_XMM();
  1162. MKL_INT k;
  1163. for (k=0; k<k0; k+=k_in_ker) {
  1164. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1165. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1166. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1167. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1168. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1169. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1170. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1171. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  1172. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1173. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1174. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  1175. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  1176. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1177. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1178. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+1));
  1179. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1180. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1181. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+2));
  1182. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1183. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1184. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  1185. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  1186. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1187. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1188. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+1));
  1189. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1190. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1191. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+2));
  1192. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1193. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1194. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  1195. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  1196. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1197. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1198. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+1));
  1199. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1200. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1201. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+2));
  1202. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1203. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1204. }
  1205. if (krem & 2) {
  1206. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1207. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1208. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1209. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1210. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1211. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1212. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1213. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  1214. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1215. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1216. k++;
  1217. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1218. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1219. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1220. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1221. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1222. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1223. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1224. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  1225. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1226. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1227. k++;
  1228. }
  1229. if (krem & 1) {
  1230. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1231. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1232. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1233. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1234. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1235. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1236. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1237. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  1238. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1239. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1240. k++;
  1241. }
  1242. #if !defined(MKL_DC_BETA_ZERO)
  1243. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  1244. #if !defined(MKL_DC_BETA_ONE)
  1245. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  1246. #endif
  1247. #if defined(MKL_DC_ALPHA_ONE)
  1248. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  1249. #else
  1250. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  1251. #endif
  1252. #else
  1253. #if !defined(MKL_DC_ALPHA_ONE)
  1254. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  1255. #else
  1256. xmm_c = xmm_temp0;
  1257. #endif
  1258. #endif
  1259. #if !defined(MKL_DC_BETA_ZERO)
  1260. xmm_c3 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  1261. #if !defined(MKL_DC_BETA_ONE)
  1262. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  1263. #endif
  1264. #if defined(MKL_DC_ALPHA_ONE)
  1265. xmm_c3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c3);
  1266. #else
  1267. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  1268. #endif
  1269. #else
  1270. #if !defined(MKL_DC_ALPHA_ONE)
  1271. xmm_c3 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  1272. #else
  1273. xmm_c3 = xmm_temp3;
  1274. #endif
  1275. #endif
  1276. #if !defined(MKL_DC_BETA_ZERO)
  1277. xmm_c5 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+2));
  1278. #if !defined(MKL_DC_BETA_ONE)
  1279. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  1280. #endif
  1281. #if defined(MKL_DC_ALPHA_ONE)
  1282. xmm_c5 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c5);
  1283. #else
  1284. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  1285. #endif
  1286. #else
  1287. #if !defined(MKL_DC_ALPHA_ONE)
  1288. xmm_c5 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  1289. #else
  1290. xmm_c5 = xmm_temp5;
  1291. #endif
  1292. #endif
  1293. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  1294. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c3);
  1295. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+2), xmm_c5);
  1296. i += MKER2;
  1297. }
  1298. if ((m-i) & MKER3) {
  1299. xmm_temp0 = MKL_DC_SETZERO_XMM();
  1300. xmm_temp3 = MKL_DC_SETZERO_XMM();
  1301. xmm_temp5 = MKL_DC_SETZERO_XMM();
  1302. MKL_INT k;
  1303. for (k=0; k<k0; k+=k_in_ker) {
  1304. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1305. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1306. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1307. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1308. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1309. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  1310. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1311. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  1312. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  1313. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1314. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+1));
  1315. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1316. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+2));
  1317. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1318. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  1319. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  1320. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1321. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+1));
  1322. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1323. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+2));
  1324. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1325. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  1326. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  1327. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1328. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+1));
  1329. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1330. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+2));
  1331. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1332. }
  1333. if (krem & 2) {
  1334. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1335. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1336. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1337. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1338. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1339. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  1340. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1341. k++;
  1342. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1343. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1344. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1345. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1346. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1347. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  1348. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1349. k++;
  1350. }
  1351. if (krem & 1) {
  1352. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1353. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1354. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1355. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1356. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1357. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  1358. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1359. k++;
  1360. }
  1361. #if !defined(MKL_DC_BETA_ZERO)
  1362. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  1363. #if !defined(MKL_DC_BETA_ONE)
  1364. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  1365. #endif
  1366. #if defined(MKL_DC_ALPHA_ONE)
  1367. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  1368. #else
  1369. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  1370. #endif
  1371. #else
  1372. #if !defined(MKL_DC_ALPHA_ONE)
  1373. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  1374. #else
  1375. xmm_c = xmm_temp0;
  1376. #endif
  1377. #endif
  1378. #if !defined(MKL_DC_BETA_ZERO)
  1379. xmm_c3 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  1380. #if !defined(MKL_DC_BETA_ONE)
  1381. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  1382. #endif
  1383. #if defined(MKL_DC_ALPHA_ONE)
  1384. xmm_c3 = MKL_DC_ADD_XMM_S(xmm_temp3, xmm_c3);
  1385. #else
  1386. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  1387. #endif
  1388. #else
  1389. #if !defined(MKL_DC_ALPHA_ONE)
  1390. xmm_c3 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp3);
  1391. #else
  1392. xmm_c3 = xmm_temp3;
  1393. #endif
  1394. #endif
  1395. #if !defined(MKL_DC_BETA_ZERO)
  1396. xmm_c5 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+2));
  1397. #if !defined(MKL_DC_BETA_ONE)
  1398. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  1399. #endif
  1400. #if defined(MKL_DC_ALPHA_ONE)
  1401. xmm_c5 = MKL_DC_ADD_XMM_S(xmm_temp5, xmm_c5);
  1402. #else
  1403. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  1404. #endif
  1405. #else
  1406. #if !defined(MKL_DC_ALPHA_ONE)
  1407. xmm_c5 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp5);
  1408. #else
  1409. xmm_c5 = xmm_temp5;
  1410. #endif
  1411. #endif
  1412. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  1413. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1), xmm_c3);
  1414. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+2), xmm_c5);
  1415. i += MKER3;
  1416. }
  1417. } else if ((n-j) == 2) {
  1418. MKL_INT i;
  1419. for (i=0; i<m0; i+=m_in_ker) {
  1420. ymm_temp0 = MKL_DC_SETZERO_YMM();
  1421. ymm_temp1 = MKL_DC_SETZERO_YMM();
  1422. ymm_temp2 = MKL_DC_SETZERO_YMM();
  1423. ymm_temp3 = MKL_DC_SETZERO_YMM();
  1424. ymm_temp4 = MKL_DC_SETZERO_YMM();
  1425. ymm_temp5 = MKL_DC_SETZERO_YMM();
  1426. ymm_temp6 = MKL_DC_SETZERO_YMM();
  1427. ymm_temp7 = MKL_DC_SETZERO_YMM();
  1428. MKL_INT k;
  1429. for (k=0; k<k0; k+=k_in_ker) {
  1430. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1431. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1432. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1433. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1434. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1435. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1436. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  1437. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  1438. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  1439. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  1440. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  1441. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  1442. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  1443. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  1444. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  1445. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  1446. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  1447. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  1448. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1449. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  1450. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1451. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  1452. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  1453. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  1454. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  1455. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  1456. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  1457. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  1458. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  1459. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  1460. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  1461. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  1462. }
  1463. if (krem & 2) {
  1464. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1465. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1466. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1467. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1468. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1469. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1470. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  1471. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  1472. k++;
  1473. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1474. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1475. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  1476. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1477. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  1478. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1479. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  1480. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  1481. k++;
  1482. }
  1483. if (kK>=2) {
  1484. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp4);
  1485. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_temp5);
  1486. ymm_temp2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_temp6);
  1487. ymm_temp3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_temp7);
  1488. }
  1489. if (krem & 1) {
  1490. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1491. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1492. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1493. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1494. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1495. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1496. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  1497. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  1498. k++;
  1499. }
  1500. #if !defined(MKL_DC_BETA_ZERO)
  1501. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  1502. #if !defined(MKL_DC_BETA_ONE)
  1503. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  1504. #endif
  1505. #if defined(MKL_DC_ALPHA_ONE)
  1506. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  1507. #else
  1508. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  1509. #endif
  1510. #else
  1511. #if !defined(MKL_DC_ALPHA_ONE)
  1512. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  1513. #else
  1514. ymm_c0 = ymm_temp0;
  1515. #endif
  1516. #endif
  1517. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  1518. #if !defined(MKL_DC_BETA_ZERO)
  1519. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  1520. #if !defined(MKL_DC_BETA_ONE)
  1521. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  1522. #endif
  1523. #if defined(MKL_DC_ALPHA_ONE)
  1524. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  1525. #else
  1526. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  1527. #endif
  1528. #else
  1529. #if !defined(MKL_DC_ALPHA_ONE)
  1530. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  1531. #else
  1532. ymm_c1 = ymm_temp1;
  1533. #endif
  1534. #endif
  1535. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  1536. #if !defined(MKL_DC_BETA_ZERO)
  1537. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  1538. #if !defined(MKL_DC_BETA_ONE)
  1539. ymm_c2 = MKL_DC_MUL_YMM(ymm_c2, ymm_beta);
  1540. #endif
  1541. #if defined(MKL_DC_ALPHA_ONE)
  1542. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  1543. #else
  1544. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  1545. #endif
  1546. #else
  1547. #if !defined(MKL_DC_ALPHA_ONE)
  1548. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  1549. #else
  1550. ymm_c2 = ymm_temp2;
  1551. #endif
  1552. #endif
  1553. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  1554. #if !defined(MKL_DC_BETA_ZERO)
  1555. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+1));
  1556. #if !defined(MKL_DC_BETA_ONE)
  1557. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  1558. #endif
  1559. #if defined(MKL_DC_ALPHA_ONE)
  1560. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  1561. #else
  1562. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  1563. #endif
  1564. #else
  1565. #if !defined(MKL_DC_ALPHA_ONE)
  1566. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  1567. #else
  1568. ymm_c3 = ymm_temp3;
  1569. #endif
  1570. #endif
  1571. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+1), ymm_c3);
  1572. }
  1573. if ((m-i) & MKER1) {
  1574. #if !defined(MKL_DC_BETA_ZERO)
  1575. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  1576. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  1577. #endif
  1578. ymm_temp0 = MKL_DC_SETZERO_YMM();
  1579. ymm_temp3 = MKL_DC_SETZERO_YMM();
  1580. ymm_temp4 = MKL_DC_SETZERO_YMM();
  1581. ymm_temp7 = MKL_DC_SETZERO_YMM();
  1582. MKL_INT k;
  1583. for (k=0; k<k0; k+=k_in_ker) {
  1584. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1585. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1586. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1587. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1588. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1589. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  1590. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  1591. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  1592. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  1593. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  1594. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  1595. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  1596. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1597. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  1598. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1599. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  1600. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  1601. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  1602. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  1603. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  1604. }
  1605. if (krem & 2) {
  1606. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1607. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1608. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1609. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1610. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1611. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1612. k++;
  1613. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1614. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1615. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  1616. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1617. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1618. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  1619. k++;
  1620. }
  1621. if (kK>=2) {
  1622. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp4);
  1623. ymm_temp3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_temp7);
  1624. }
  1625. if (krem & 1) {
  1626. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1627. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1628. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1629. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1630. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  1631. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  1632. k++;
  1633. }
  1634. #if !defined(MKL_DC_BETA_ZERO)
  1635. #if !defined(MKL_DC_BETA_ONE)
  1636. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  1637. #endif
  1638. #if defined(MKL_DC_ALPHA_ONE)
  1639. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  1640. #else
  1641. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  1642. #endif
  1643. #else
  1644. #if !defined(MKL_DC_ALPHA_ONE)
  1645. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  1646. #else
  1647. ymm_c0 = ymm_temp0;
  1648. #endif
  1649. #endif
  1650. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  1651. #if !defined(MKL_DC_BETA_ZERO)
  1652. #if !defined(MKL_DC_BETA_ONE)
  1653. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  1654. #endif
  1655. #if defined(MKL_DC_ALPHA_ONE)
  1656. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  1657. #else
  1658. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  1659. #endif
  1660. #else
  1661. #if !defined(MKL_DC_ALPHA_ONE)
  1662. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  1663. #else
  1664. ymm_c3 = ymm_temp3;
  1665. #endif
  1666. #endif
  1667. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c3);
  1668. i += MKER1;
  1669. }
  1670. if ((m-i) & MKER2) {
  1671. xmm_temp0 = MKL_DC_SETZERO_XMM();
  1672. xmm_temp3 = MKL_DC_SETZERO_XMM();
  1673. xmm_temp5 = MKL_DC_SETZERO_XMM();
  1674. xmm_temp7 = MKL_DC_SETZERO_XMM();
  1675. #if !defined(MKL_DC_BETA_ZERO)
  1676. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  1677. xmm_c3 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  1678. #endif
  1679. MKL_INT k;
  1680. for (k=0; k<k0; k+=k_in_ker) {
  1681. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1682. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1683. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1684. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1685. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1686. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1687. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1688. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  1689. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  1690. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1691. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1692. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+1));
  1693. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1694. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  1695. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  1696. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  1697. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1698. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1699. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+1));
  1700. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1701. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1702. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  1703. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  1704. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1705. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1706. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+1));
  1707. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1708. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  1709. }
  1710. if (krem & 2) {
  1711. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1712. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1713. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1714. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1715. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1716. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1717. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1718. k++;
  1719. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1720. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1721. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1722. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  1723. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1724. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1725. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  1726. k++;
  1727. }
  1728. if (kK>=2) {
  1729. xmm_temp0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_temp5);
  1730. xmm_temp3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_temp7);
  1731. }
  1732. if (krem & 1) {
  1733. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  1734. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  1735. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1736. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1737. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  1738. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  1739. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1740. k++;
  1741. }
  1742. #if !defined(MKL_DC_BETA_ZERO)
  1743. #if !defined(MKL_DC_BETA_ONE)
  1744. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  1745. #endif
  1746. #if defined(MKL_DC_ALPHA_ONE)
  1747. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  1748. #else
  1749. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  1750. #endif
  1751. #else
  1752. #if !defined(MKL_DC_ALPHA_ONE)
  1753. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  1754. #else
  1755. xmm_c = xmm_temp0;
  1756. #endif
  1757. #endif
  1758. #if !defined(MKL_DC_BETA_ZERO)
  1759. #if !defined(MKL_DC_BETA_ONE)
  1760. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  1761. #endif
  1762. #if defined(MKL_DC_ALPHA_ONE)
  1763. xmm_c3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c3);
  1764. #else
  1765. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  1766. #endif
  1767. #else
  1768. #if !defined(MKL_DC_ALPHA_ONE)
  1769. xmm_c3 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  1770. #else
  1771. xmm_c3 = xmm_temp3;
  1772. #endif
  1773. #endif
  1774. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  1775. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c3);
  1776. i += MKER2;
  1777. }
  1778. if ((m - i) & MKER3) {
  1779. xmm_temp0 = MKL_DC_SETZERO_XMM();
  1780. xmm_temp3 = MKL_DC_SETZERO_XMM();
  1781. #if !defined(MKL_DC_BETA_ZERO)
  1782. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  1783. xmm_c3 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  1784. #endif
  1785. MKL_INT k;
  1786. for (k=0; k<k0; k+=k_in_ker) {
  1787. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1788. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1789. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1790. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1791. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1792. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  1793. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  1794. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1795. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+1));
  1796. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1797. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  1798. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  1799. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1800. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+1));
  1801. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1802. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  1803. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  1804. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1805. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+1));
  1806. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1807. }
  1808. if (krem & 2) {
  1809. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1810. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1811. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1812. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1813. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1814. k++;
  1815. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1816. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1817. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1818. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1819. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1820. k++;
  1821. }
  1822. if (krem & 1) {
  1823. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  1824. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  1825. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  1826. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  1827. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  1828. k++;
  1829. }
  1830. #if !defined(MKL_DC_BETA_ZERO)
  1831. #if !defined(MKL_DC_BETA_ONE)
  1832. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  1833. #endif
  1834. #if defined(MKL_DC_ALPHA_ONE)
  1835. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  1836. #else
  1837. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  1838. #endif
  1839. #else
  1840. #if !defined(MKL_DC_ALPHA_ONE)
  1841. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  1842. #else
  1843. xmm_c = xmm_temp0;
  1844. #endif
  1845. #endif
  1846. #if !defined(MKL_DC_BETA_ZERO)
  1847. #if !defined(MKL_DC_BETA_ONE)
  1848. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  1849. #endif
  1850. #if defined(MKL_DC_ALPHA_ONE)
  1851. xmm_c3 = MKL_DC_ADD_XMM_S(xmm_temp3, xmm_c3);
  1852. #else
  1853. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  1854. #endif
  1855. #else
  1856. #if !defined(MKL_DC_ALPHA_ONE)
  1857. xmm_c3 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp3);
  1858. #else
  1859. xmm_c3 = xmm_temp3;
  1860. #endif
  1861. #endif
  1862. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  1863. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1), xmm_c3);
  1864. i += MKER3;
  1865. }
  1866. } else if ((n-j) == 1) {
  1867. MKL_INT i;
  1868. for (i=0; i<m0; i+=m_in_ker) {
  1869. ymm_temp0 = MKL_DC_SETZERO_YMM();
  1870. ymm_temp1 = MKL_DC_SETZERO_YMM();
  1871. ymm_temp2 = MKL_DC_SETZERO_YMM();
  1872. ymm_temp3 = MKL_DC_SETZERO_YMM();
  1873. MKL_INT k;
  1874. for (k=0; k<k0; k+=k_in_ker) {
  1875. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1876. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1877. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1878. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1879. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1880. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  1881. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  1882. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  1883. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  1884. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  1885. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  1886. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  1887. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1888. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  1889. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1890. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  1891. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  1892. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  1893. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  1894. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  1895. }
  1896. if (krem & 2) {
  1897. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1898. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1899. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1900. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1901. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1902. k++;
  1903. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1904. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1905. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  1906. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1907. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  1908. k++;
  1909. }
  1910. if (kK>=2) {
  1911. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp2);
  1912. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_temp3);
  1913. }
  1914. if (krem & 1) {
  1915. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1916. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1917. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1918. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  1919. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  1920. k++;
  1921. }
  1922. #if !defined(MKL_DC_BETA_ZERO)
  1923. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  1924. #if !defined(MKL_DC_BETA_ONE)
  1925. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  1926. #endif
  1927. #if defined(MKL_DC_ALPHA_ONE)
  1928. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  1929. #else
  1930. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  1931. #endif
  1932. #else
  1933. #if !defined(MKL_DC_ALPHA_ONE)
  1934. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  1935. #else
  1936. ymm_c0 = ymm_temp0;
  1937. #endif
  1938. #endif
  1939. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  1940. #if !defined(MKL_DC_BETA_ZERO)
  1941. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  1942. #if !defined(MKL_DC_BETA_ONE)
  1943. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  1944. #endif
  1945. #if defined(MKL_DC_ALPHA_ONE)
  1946. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  1947. #else
  1948. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  1949. #endif
  1950. #else
  1951. #if !defined(MKL_DC_ALPHA_ONE)
  1952. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  1953. #else
  1954. ymm_c1 = ymm_temp1;
  1955. #endif
  1956. #endif
  1957. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  1958. }
  1959. if ((m-i) & MKER1) {
  1960. #if !defined(MKL_DC_BETA_ZERO)
  1961. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  1962. #endif
  1963. ymm_temp0 = MKL_DC_SETZERO_YMM();
  1964. ymm_temp1 = MKL_DC_SETZERO_YMM();
  1965. MKL_INT k;
  1966. for (k=0; k<k0; k+=k_in_ker) {
  1967. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1968. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1969. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1970. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  1971. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  1972. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp1, ymm_temp);
  1973. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  1974. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  1975. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1976. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  1977. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  1978. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp1, ymm_temp);
  1979. }
  1980. if (krem & 2) {
  1981. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1982. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1983. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1984. k++;
  1985. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1986. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1987. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp1, ymm_temp);
  1988. k++;
  1989. }
  1990. if (kK>=2) {
  1991. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp1);
  1992. }
  1993. if (krem & 1) {
  1994. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  1995. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  1996. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  1997. k++;
  1998. }
  1999. #if !defined(MKL_DC_BETA_ZERO)
  2000. #if !defined(MKL_DC_BETA_ONE)
  2001. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  2002. #endif
  2003. #if defined(MKL_DC_ALPHA_ONE)
  2004. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  2005. #else
  2006. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  2007. #endif
  2008. #else
  2009. #if !defined(MKL_DC_ALPHA_ONE)
  2010. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  2011. #else
  2012. ymm_c0 = ymm_temp0;
  2013. #endif
  2014. #endif
  2015. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  2016. i += MKER1;
  2017. }
  2018. if ((m-i) & MKER2) {
  2019. xmm_temp0 = MKL_DC_SETZERO_XMM();
  2020. xmm_temp3 = MKL_DC_SETZERO_XMM();
  2021. #if !defined(MKL_DC_BETA_ZERO)
  2022. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  2023. #endif
  2024. MKL_INT k;
  2025. for (k=0; k<k0; k+=k_in_ker) {
  2026. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2027. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2028. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2029. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2030. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  2031. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  2032. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2033. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2034. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  2035. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  2036. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2037. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2038. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  2039. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  2040. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2041. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2042. }
  2043. if (krem & 2) {
  2044. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2045. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2046. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2047. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2048. k++;
  2049. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2050. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2051. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2052. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2053. k++;
  2054. }
  2055. if (kK>=2) {
  2056. xmm_temp0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_temp3);
  2057. }
  2058. if (krem & 1) {
  2059. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2060. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2061. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2062. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2063. k++;
  2064. }
  2065. #if !defined(MKL_DC_BETA_ZERO)
  2066. #if !defined(MKL_DC_BETA_ONE)
  2067. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  2068. #endif
  2069. #if defined(MKL_DC_ALPHA_ONE)
  2070. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  2071. #else
  2072. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  2073. #endif
  2074. #else
  2075. #if !defined(MKL_DC_ALPHA_ONE)
  2076. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  2077. #else
  2078. xmm_c = xmm_temp0;
  2079. #endif
  2080. #endif
  2081. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  2082. i += MKER2;
  2083. }
  2084. if ((m-i) & MKER3) {
  2085. xmm_temp0 = MKL_DC_SETZERO_XMM();
  2086. xmm_temp3 = MKL_DC_SETZERO_XMM();
  2087. #if !defined(MKL_DC_BETA_ZERO)
  2088. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  2089. #endif
  2090. MKL_INT k;
  2091. for (k=0; k<k0; k+=k_in_ker) {
  2092. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2093. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2094. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2095. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  2096. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  2097. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2098. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  2099. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  2100. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2101. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  2102. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  2103. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2104. }
  2105. if (krem & 2) {
  2106. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2107. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2108. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2109. k++;
  2110. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2111. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2112. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2113. k++;
  2114. }
  2115. if (kK>=2) {
  2116. xmm_temp0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_temp3);
  2117. }
  2118. if (krem & 1) {
  2119. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2120. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2121. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2122. k++;
  2123. }
  2124. #if !defined(MKL_DC_BETA_ZERO)
  2125. #if !defined(MKL_DC_BETA_ONE)
  2126. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  2127. #endif
  2128. #if defined(MKL_DC_ALPHA_ONE)
  2129. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  2130. #else
  2131. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  2132. #endif
  2133. #else
  2134. #if !defined(MKL_DC_ALPHA_ONE)
  2135. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  2136. #else
  2137. xmm_c = xmm_temp0;
  2138. #endif
  2139. #endif
  2140. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  2141. i += MKER3;
  2142. }
  2143. }
  2144. }
  2145. static __inline void MKL_DC_FNAME_GEMM_KERNEL(dgemm_nt_mnk)
  2146. (MKL_INT m, MKL_INT n, MKL_INT kK,
  2147. const mkl_dc_type * ALPHA,
  2148. const mkl_dc_type * A, MKL_INT lda,
  2149. const mkl_dc_type * B, MKL_INT ldb,
  2150. const mkl_dc_type * BETA,
  2151. mkl_dc_type * C, MKL_INT ldc)
  2152. {
  2153. #undef MKL_DC_AA
  2154. #undef MKL_DC_BB
  2155. #undef MKL_DC_CC
  2156. #define MKL_DC_AA(i,j) ((A)[(i)+lda*(j)])
  2157. #define MKL_DC_BB(i,j) ((B)[(j)+ldb*(i)])
  2158. #define MKL_DC_CC(i,j) ((C)[(i)+ldc*(j)])
  2159. const MKL_INT m_in_ker = 8;
  2160. const MKL_INT n_in_ker = 4;
  2161. const MKL_INT k_in_ker = 4;
  2162. const MKL_INT MKER1 = 4;
  2163. const MKL_INT MKER2 = 2;
  2164. const MKL_INT MKER3 = 1;
  2165. const MKL_INT MKER4 = 0;
  2166. MKL_INT m0 = (m/m_in_ker)*m_in_ker;
  2167. MKL_INT n0 = (n/n_in_ker)*n_in_ker;
  2168. MKL_INT k0 = (kK/k_in_ker)*k_in_ker;
  2169. MKL_INT krem = kK - k0;
  2170. MKL_DC_YMMTYPE ymm_temp;
  2171. MKL_DC_YMMTYPE ymm_temp0, ymm_temp1;
  2172. MKL_DC_YMMTYPE ymm_temp2, ymm_temp3;
  2173. MKL_DC_YMMTYPE ymm_temp4, ymm_temp5;
  2174. MKL_DC_YMMTYPE ymm_temp6, ymm_temp7;
  2175. MKL_DC_YMMTYPE ymm_c0, ymm_c1;
  2176. MKL_DC_YMMTYPE ymm_c2, ymm_c3;
  2177. MKL_DC_YMMTYPE ymm_c4, ymm_c5;
  2178. MKL_DC_YMMTYPE ymm_c6, ymm_c7;
  2179. MKL_DC_YMMTYPE ymm_a, ymm_a1, ymm_b;
  2180. MKL_DC_YMMTYPE ymm_alpha;
  2181. MKL_DC_XMMTYPE xmm_a, xmm_b;
  2182. MKL_DC_XMMTYPE xmm_temp0, xmm_temp3, xmm_temp5, xmm_temp7;
  2183. MKL_DC_XMMTYPE xmm_temp;
  2184. MKL_DC_XMMTYPE xmm_c, xmm_c3, xmm_c5, xmm_c7;
  2185. MKL_DC_XMMTYPE xmm_alpha;
  2186. #if !defined(MKL_DC_ALPHA_ZERO) && !defined(MKL_DC_ALPHA_ONE)
  2187. ymm_alpha = MKL_DC_BCAST_YMM(ALPHA);
  2188. xmm_alpha = MKL_DC_CAST_YMM_TO_XMM(ymm_alpha);
  2189. #endif
  2190. #if !defined(MKL_DC_BETA_ZERO) && !defined(MKL_DC_BETA_ONE)
  2191. MKL_DC_YMMTYPE ymm_beta = MKL_DC_BCAST_YMM(BETA);
  2192. MKL_DC_XMMTYPE xmm_beta = MKL_DC_CAST_YMM_TO_XMM(ymm_beta);
  2193. #endif
  2194. MKL_INT j;
  2195. for (j=0; j<n0; j+=n_in_ker) {
  2196. MKL_INT i;
  2197. for (i=0; i<m0; i+=m_in_ker) {
  2198. ymm_temp0 = MKL_DC_SETZERO_YMM();
  2199. ymm_temp1 = MKL_DC_SETZERO_YMM();
  2200. ymm_temp2 = MKL_DC_SETZERO_YMM();
  2201. ymm_temp3 = MKL_DC_SETZERO_YMM();
  2202. ymm_temp4 = MKL_DC_SETZERO_YMM();
  2203. ymm_temp5 = MKL_DC_SETZERO_YMM();
  2204. ymm_temp6 = MKL_DC_SETZERO_YMM();
  2205. ymm_temp7 = MKL_DC_SETZERO_YMM();
  2206. MKL_INT k;
  2207. for (k=0; k<k0; k+=k_in_ker) {
  2208. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2209. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2210. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2211. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  2212. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2213. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2214. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2215. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2216. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2217. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2218. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2219. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2220. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  2221. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  2222. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  2223. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  2224. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2225. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  2226. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2227. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  2228. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2229. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2230. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  2231. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2232. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2233. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+3));
  2234. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  2235. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  2236. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  2237. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  2238. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2239. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  2240. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2241. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  2242. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2243. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2244. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  2245. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2246. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2247. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+3));
  2248. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  2249. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  2250. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  2251. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  2252. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2253. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  2254. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2255. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  2256. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2257. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2258. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  2259. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2260. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2261. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+3));
  2262. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  2263. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  2264. }
  2265. if (krem & 2) {
  2266. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2267. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2268. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2269. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  2270. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2271. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2272. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2273. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2274. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2275. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2276. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2277. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2278. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  2279. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  2280. k++;
  2281. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2282. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2283. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2284. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  2285. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2286. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2287. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2288. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2289. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2290. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2291. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2292. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2293. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  2294. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  2295. k++;
  2296. }
  2297. if (krem & 1) {
  2298. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2299. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2300. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2301. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  2302. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2303. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2304. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2305. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2306. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2307. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2308. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2309. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2310. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  2311. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  2312. k++;
  2313. }
  2314. #if !defined(MKL_DC_BETA_ZERO)
  2315. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  2316. #if !defined(MKL_DC_BETA_ONE)
  2317. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  2318. #endif
  2319. #if defined(MKL_DC_ALPHA_ONE)
  2320. ymm_c0 = MKL_DC_ADD_YMM(ymm_c0, ymm_temp0);
  2321. #else
  2322. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  2323. #endif
  2324. #else
  2325. #if !defined(MKL_DC_ALPHA_ONE)
  2326. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  2327. #else
  2328. ymm_c0 = ymm_temp0;
  2329. #endif
  2330. #endif
  2331. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  2332. #if !defined(MKL_DC_BETA_ZERO)
  2333. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  2334. #if !defined(MKL_DC_BETA_ONE)
  2335. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  2336. #endif
  2337. #if defined(MKL_DC_ALPHA_ONE)
  2338. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  2339. #else
  2340. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  2341. #endif
  2342. #else
  2343. #if !defined(MKL_DC_ALPHA_ONE)
  2344. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  2345. #else
  2346. ymm_c1 = ymm_temp1;
  2347. #endif
  2348. #endif
  2349. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  2350. #if !defined(MKL_DC_BETA_ZERO)
  2351. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  2352. #if !defined(MKL_DC_BETA_ONE)
  2353. ymm_c2 = MKL_DC_MUL_YMM(ymm_c2, ymm_beta);
  2354. #endif
  2355. #if defined(MKL_DC_ALPHA_ONE)
  2356. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  2357. #else
  2358. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  2359. #endif
  2360. #else
  2361. #if !defined(MKL_DC_ALPHA_ONE)
  2362. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  2363. #else
  2364. ymm_c2 = ymm_temp2;
  2365. #endif
  2366. #endif
  2367. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  2368. #if !defined(MKL_DC_BETA_ZERO)
  2369. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+1));
  2370. #if !defined(MKL_DC_BETA_ONE)
  2371. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  2372. #endif
  2373. #if defined(MKL_DC_ALPHA_ONE)
  2374. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  2375. #else
  2376. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  2377. #endif
  2378. #else
  2379. #if !defined(MKL_DC_ALPHA_ONE)
  2380. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  2381. #else
  2382. ymm_c3 = ymm_temp3;
  2383. #endif
  2384. #endif
  2385. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+1), ymm_c3);
  2386. #if !defined(MKL_DC_BETA_ZERO)
  2387. ymm_c4 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  2388. #if !defined(MKL_DC_BETA_ONE)
  2389. ymm_c4 = MKL_DC_MUL_YMM(ymm_c4, ymm_beta);
  2390. #endif
  2391. #if defined(MKL_DC_ALPHA_ONE)
  2392. ymm_c4 = MKL_DC_ADD_YMM(ymm_temp4, ymm_c4);
  2393. #else
  2394. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp4, ymm_c4, ymm_temp);
  2395. #endif
  2396. #else
  2397. #if !defined(MKL_DC_ALPHA_ONE)
  2398. ymm_c4 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp4);
  2399. #else
  2400. ymm_c4 = ymm_temp4;
  2401. #endif
  2402. #endif
  2403. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c4);
  2404. #if !defined(MKL_DC_BETA_ZERO)
  2405. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+2));
  2406. #if !defined(MKL_DC_BETA_ONE)
  2407. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  2408. #endif
  2409. #if defined(MKL_DC_ALPHA_ONE)
  2410. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  2411. #else
  2412. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  2413. #endif
  2414. #else
  2415. #if !defined(MKL_DC_ALPHA_ONE)
  2416. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  2417. #else
  2418. ymm_c5 = ymm_temp5;
  2419. #endif
  2420. #endif
  2421. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+2), ymm_c5);
  2422. #if !defined(MKL_DC_BETA_ZERO)
  2423. ymm_c6 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+3));
  2424. #if !defined(MKL_DC_BETA_ONE)
  2425. ymm_c6 = MKL_DC_MUL_YMM(ymm_c6, ymm_beta);
  2426. #endif
  2427. #if defined(MKL_DC_ALPHA_ONE)
  2428. ymm_c6 = MKL_DC_ADD_YMM(ymm_temp6, ymm_c6);
  2429. #else
  2430. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp6, ymm_c6, ymm_temp);
  2431. #endif
  2432. #else
  2433. #if !defined(MKL_DC_ALPHA_ONE)
  2434. ymm_c6 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp6);
  2435. #else
  2436. ymm_c6 = ymm_temp6;
  2437. #endif
  2438. #endif
  2439. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+3), ymm_c6);
  2440. #if !defined(MKL_DC_BETA_ZERO)
  2441. ymm_c7 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+3));
  2442. #if !defined(MKL_DC_BETA_ONE)
  2443. ymm_c7 = MKL_DC_MUL_YMM(ymm_c7, ymm_beta);
  2444. #endif
  2445. #if defined(MKL_DC_ALPHA_ONE)
  2446. ymm_c7 = MKL_DC_ADD_YMM(ymm_temp7, ymm_c7);
  2447. #else
  2448. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp7, ymm_c7, ymm_temp);
  2449. #endif
  2450. #else
  2451. #if !defined(MKL_DC_ALPHA_ONE)
  2452. ymm_c7 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp7);
  2453. #else
  2454. ymm_c7 = ymm_temp7;
  2455. #endif
  2456. #endif
  2457. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+3), ymm_c7);
  2458. }
  2459. if ((m-i) & MKER1) {
  2460. ymm_temp0 = MKL_DC_SETZERO_YMM();
  2461. ymm_temp3 = MKL_DC_SETZERO_YMM();
  2462. ymm_temp5 = MKL_DC_SETZERO_YMM();
  2463. ymm_temp7 = MKL_DC_SETZERO_YMM();
  2464. MKL_INT k;
  2465. for (k=0; k<k0; k+=k_in_ker) {
  2466. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2467. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2468. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2469. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2470. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  2471. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2472. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  2473. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2474. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  2475. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  2476. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  2477. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2478. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  2479. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  2480. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  2481. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  2482. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+3));
  2483. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  2484. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  2485. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  2486. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2487. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  2488. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  2489. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  2490. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  2491. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+3));
  2492. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  2493. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  2494. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  2495. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2496. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  2497. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  2498. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  2499. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  2500. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+3));
  2501. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  2502. }
  2503. if (krem & 2) {
  2504. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2505. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2506. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2507. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2508. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  2509. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2510. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  2511. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2512. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  2513. k++;
  2514. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2515. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2516. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2517. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2518. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  2519. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2520. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  2521. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2522. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  2523. k++;
  2524. }
  2525. if (krem & 1) {
  2526. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2527. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2528. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2529. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2530. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  2531. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2532. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  2533. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+3));
  2534. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  2535. k++;
  2536. }
  2537. #if !defined(MKL_DC_BETA_ZERO)
  2538. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  2539. #if !defined(MKL_DC_BETA_ONE)
  2540. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  2541. #endif
  2542. #if defined(MKL_DC_ALPHA_ONE)
  2543. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  2544. #else
  2545. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  2546. #endif
  2547. #else
  2548. #if !defined(MKL_DC_ALPHA_ONE)
  2549. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  2550. #else
  2551. ymm_c0 = ymm_temp0;
  2552. #endif
  2553. #endif
  2554. #if !defined(MKL_DC_BETA_ZERO)
  2555. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  2556. #if !defined(MKL_DC_BETA_ONE)
  2557. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  2558. #endif
  2559. #if defined(MKL_DC_ALPHA_ONE)
  2560. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  2561. #else
  2562. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  2563. #endif
  2564. #else
  2565. #if !defined(MKL_DC_ALPHA_ONE)
  2566. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  2567. #else
  2568. ymm_c3 = ymm_temp3;
  2569. #endif
  2570. #endif
  2571. #if !defined(MKL_DC_BETA_ZERO)
  2572. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  2573. #if !defined(MKL_DC_BETA_ONE)
  2574. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  2575. #endif
  2576. #if defined(MKL_DC_ALPHA_ONE)
  2577. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  2578. #else
  2579. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  2580. #endif
  2581. #else
  2582. #if !defined(MKL_DC_ALPHA_ONE)
  2583. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  2584. #else
  2585. ymm_c5 = ymm_temp5;
  2586. #endif
  2587. #endif
  2588. #if !defined(MKL_DC_BETA_ZERO)
  2589. ymm_c7 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+3));
  2590. #if !defined(MKL_DC_BETA_ONE)
  2591. ymm_c7 = MKL_DC_MUL_YMM(ymm_c7, ymm_beta);
  2592. #endif
  2593. #if defined(MKL_DC_ALPHA_ONE)
  2594. ymm_c7 = MKL_DC_ADD_YMM(ymm_temp7, ymm_c7);
  2595. #else
  2596. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp7, ymm_c7, ymm_temp);
  2597. #endif
  2598. #else
  2599. #if !defined(MKL_DC_ALPHA_ONE)
  2600. ymm_c7 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp7);
  2601. #else
  2602. ymm_c7 = ymm_temp7;
  2603. #endif
  2604. #endif
  2605. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+0), ymm_c0);
  2606. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c3);
  2607. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c5);
  2608. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+3), ymm_c7);
  2609. i += MKER1;
  2610. }
  2611. if ((m-i) & MKER2) {
  2612. xmm_temp0 = MKL_DC_SETZERO_XMM();
  2613. xmm_temp3 = MKL_DC_SETZERO_XMM();
  2614. xmm_temp5 = MKL_DC_SETZERO_XMM();
  2615. xmm_temp7 = MKL_DC_SETZERO_XMM();
  2616. MKL_INT k;
  2617. for (k=0; k<k0; k+=k_in_ker) {
  2618. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2619. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2620. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2621. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2622. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  2623. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2624. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2625. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  2626. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2627. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2628. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  2629. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2630. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2631. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  2632. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  2633. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2634. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2635. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+1));
  2636. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2637. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2638. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+2));
  2639. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2640. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2641. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+3));
  2642. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2643. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2644. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  2645. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  2646. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2647. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2648. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+1));
  2649. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2650. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2651. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+2));
  2652. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2653. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2654. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+3));
  2655. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2656. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2657. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  2658. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  2659. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2660. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2661. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+1));
  2662. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2663. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2664. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+2));
  2665. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2666. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2667. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+3));
  2668. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2669. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2670. }
  2671. if (krem & 2) {
  2672. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2673. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2674. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2675. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2676. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  2677. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2678. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2679. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  2680. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2681. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2682. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  2683. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2684. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2685. k++;
  2686. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2687. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2688. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2689. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2690. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  2691. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2692. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2693. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  2694. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2695. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2696. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  2697. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2698. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2699. k++;
  2700. }
  2701. if (krem & 1) {
  2702. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  2703. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  2704. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2705. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2706. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  2707. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2708. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2709. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  2710. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2711. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2712. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+3));
  2713. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  2714. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2715. k++;
  2716. }
  2717. #if !defined(MKL_DC_BETA_ZERO)
  2718. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  2719. #if !defined(MKL_DC_BETA_ONE)
  2720. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  2721. #endif
  2722. #if defined(MKL_DC_ALPHA_ONE)
  2723. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  2724. #else
  2725. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  2726. #endif
  2727. #else
  2728. #if !defined(MKL_DC_ALPHA_ONE)
  2729. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  2730. #else
  2731. xmm_c = xmm_temp0;
  2732. #endif
  2733. #endif
  2734. #if !defined(MKL_DC_BETA_ZERO)
  2735. xmm_c3 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  2736. #if !defined(MKL_DC_BETA_ONE)
  2737. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  2738. #endif
  2739. #if defined(MKL_DC_ALPHA_ONE)
  2740. xmm_c3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c3);
  2741. #else
  2742. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  2743. #endif
  2744. #else
  2745. #if !defined(MKL_DC_ALPHA_ONE)
  2746. xmm_c3 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  2747. #else
  2748. xmm_c3 = xmm_temp3;
  2749. #endif
  2750. #endif
  2751. #if !defined(MKL_DC_BETA_ZERO)
  2752. xmm_c5 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+2));
  2753. #if !defined(MKL_DC_BETA_ONE)
  2754. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  2755. #endif
  2756. #if defined(MKL_DC_ALPHA_ONE)
  2757. xmm_c5 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c5);
  2758. #else
  2759. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  2760. #endif
  2761. #else
  2762. #if !defined(MKL_DC_ALPHA_ONE)
  2763. xmm_c5 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  2764. #else
  2765. xmm_c5 = xmm_temp5;
  2766. #endif
  2767. #endif
  2768. #if !defined(MKL_DC_BETA_ZERO)
  2769. xmm_c7 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+3));
  2770. #if !defined(MKL_DC_BETA_ONE)
  2771. xmm_c7 = MKL_DC_MUL_XMM(xmm_c7, xmm_beta);
  2772. #endif
  2773. #if defined(MKL_DC_ALPHA_ONE)
  2774. xmm_c7 = MKL_DC_ADD_XMM(xmm_temp7, xmm_c7);
  2775. #else
  2776. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp7, xmm_c7, xmm_temp);
  2777. #endif
  2778. #else
  2779. #if !defined(MKL_DC_ALPHA_ONE)
  2780. xmm_c7 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp7);
  2781. #else
  2782. xmm_c7 = xmm_temp7;
  2783. #endif
  2784. #endif
  2785. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  2786. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c3);
  2787. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+2), xmm_c5);
  2788. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+3), xmm_c7);
  2789. i += MKER2;
  2790. }
  2791. if ((m-i) & MKER3) {
  2792. xmm_temp0 = MKL_DC_SETZERO_XMM();
  2793. xmm_temp3 = MKL_DC_SETZERO_XMM();
  2794. xmm_temp5 = MKL_DC_SETZERO_XMM();
  2795. xmm_temp7 = MKL_DC_SETZERO_XMM();
  2796. MKL_INT k;
  2797. for (k=0; k<k0; k+=k_in_ker) {
  2798. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2799. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2800. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2801. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  2802. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2803. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  2804. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2805. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  2806. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2807. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  2808. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  2809. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2810. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+1));
  2811. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2812. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+2));
  2813. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2814. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+3));
  2815. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2816. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  2817. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  2818. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2819. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+1));
  2820. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2821. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+2));
  2822. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2823. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+3));
  2824. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2825. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  2826. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  2827. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2828. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+1));
  2829. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2830. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+2));
  2831. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2832. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+3));
  2833. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2834. }
  2835. if (krem & 2) {
  2836. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2837. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2838. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2839. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  2840. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2841. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  2842. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2843. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  2844. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2845. k++;
  2846. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2847. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2848. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2849. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  2850. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2851. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  2852. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2853. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  2854. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2855. k++;
  2856. }
  2857. if (krem & 1) {
  2858. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  2859. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  2860. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  2861. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  2862. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  2863. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  2864. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  2865. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+3));
  2866. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp7, xmm_b);
  2867. k++;
  2868. }
  2869. #if !defined(MKL_DC_BETA_ZERO)
  2870. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  2871. #if !defined(MKL_DC_BETA_ONE)
  2872. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  2873. #endif
  2874. #if defined(MKL_DC_ALPHA_ONE)
  2875. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  2876. #else
  2877. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  2878. #endif
  2879. #else
  2880. #if !defined(MKL_DC_ALPHA_ONE)
  2881. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  2882. #else
  2883. xmm_c = xmm_temp0;
  2884. #endif
  2885. #endif
  2886. #if !defined(MKL_DC_BETA_ZERO)
  2887. xmm_c3 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  2888. #if !defined(MKL_DC_BETA_ONE)
  2889. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  2890. #endif
  2891. #if defined(MKL_DC_ALPHA_ONE)
  2892. xmm_c3 = MKL_DC_ADD_XMM_S(xmm_temp3, xmm_c3);
  2893. #else
  2894. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  2895. #endif
  2896. #else
  2897. #if !defined(MKL_DC_ALPHA_ONE)
  2898. xmm_c3 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp3);
  2899. #else
  2900. xmm_c3 = xmm_temp3;
  2901. #endif
  2902. #endif
  2903. #if !defined(MKL_DC_BETA_ZERO)
  2904. xmm_c5 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+2));
  2905. #if !defined(MKL_DC_BETA_ONE)
  2906. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  2907. #endif
  2908. #if defined(MKL_DC_ALPHA_ONE)
  2909. xmm_c5 = MKL_DC_ADD_XMM_S(xmm_temp5, xmm_c5);
  2910. #else
  2911. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  2912. #endif
  2913. #else
  2914. #if !defined(MKL_DC_ALPHA_ONE)
  2915. xmm_c5 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp5);
  2916. #else
  2917. xmm_c5 = xmm_temp5;
  2918. #endif
  2919. #endif
  2920. #if !defined(MKL_DC_BETA_ZERO)
  2921. xmm_c7 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+3));
  2922. #if !defined(MKL_DC_BETA_ONE)
  2923. xmm_c7 = MKL_DC_MUL_XMM(xmm_c7, xmm_beta);
  2924. #endif
  2925. #if defined(MKL_DC_ALPHA_ONE)
  2926. xmm_c7 = MKL_DC_ADD_XMM_S(xmm_temp7, xmm_c7);
  2927. #else
  2928. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp7, xmm_c7, xmm_temp);
  2929. #endif
  2930. #else
  2931. #if !defined(MKL_DC_ALPHA_ONE)
  2932. xmm_c7 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp7);
  2933. #else
  2934. xmm_c7 = xmm_temp7;
  2935. #endif
  2936. #endif
  2937. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  2938. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1), xmm_c3);
  2939. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+2), xmm_c5);
  2940. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+3), xmm_c7);
  2941. i += MKER3;
  2942. }
  2943. }
  2944. if ((n-j) == 3) {
  2945. MKL_INT i;
  2946. for (i=0; i<m0; i+=m_in_ker) {
  2947. ymm_temp0 = MKL_DC_SETZERO_YMM();
  2948. ymm_temp1 = MKL_DC_SETZERO_YMM();
  2949. ymm_temp2 = MKL_DC_SETZERO_YMM();
  2950. ymm_temp3 = MKL_DC_SETZERO_YMM();
  2951. ymm_temp4 = MKL_DC_SETZERO_YMM();
  2952. ymm_temp5 = MKL_DC_SETZERO_YMM();
  2953. MKL_INT k;
  2954. for (k=0; k<k0; k+=k_in_ker) {
  2955. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  2956. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  2957. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2958. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  2959. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2960. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  2961. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2962. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2963. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  2964. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2965. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2966. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  2967. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  2968. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2969. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  2970. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2971. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  2972. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2973. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2974. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  2975. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2976. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2977. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  2978. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  2979. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2980. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  2981. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2982. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  2983. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2984. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2985. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  2986. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2987. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2988. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  2989. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  2990. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  2991. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  2992. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  2993. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  2994. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  2995. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  2996. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  2997. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  2998. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  2999. }
  3000. if (krem & 2) {
  3001. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3002. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3003. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3004. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3005. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3006. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3007. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3008. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3009. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  3010. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3011. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  3012. k++;
  3013. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3014. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3015. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3016. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3017. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3018. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3019. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3020. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3021. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  3022. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3023. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  3024. k++;
  3025. }
  3026. if (krem & 1) {
  3027. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3028. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3029. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3030. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3031. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3032. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3033. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3034. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3035. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  3036. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3037. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  3038. k++;
  3039. }
  3040. #if !defined(MKL_DC_BETA_ZERO)
  3041. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  3042. #if !defined(MKL_DC_BETA_ONE)
  3043. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  3044. #endif
  3045. #if defined(MKL_DC_ALPHA_ONE)
  3046. ymm_c0 = MKL_DC_ADD_YMM(ymm_c0, ymm_temp0);
  3047. #else
  3048. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  3049. #endif
  3050. #else
  3051. #if !defined(MKL_DC_ALPHA_ONE)
  3052. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  3053. #else
  3054. ymm_c0 = ymm_temp0;
  3055. #endif
  3056. #endif
  3057. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  3058. #if !defined(MKL_DC_BETA_ZERO)
  3059. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  3060. #if !defined(MKL_DC_BETA_ONE)
  3061. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  3062. #endif
  3063. #if defined(MKL_DC_ALPHA_ONE)
  3064. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  3065. #else
  3066. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  3067. #endif
  3068. #else
  3069. #if !defined(MKL_DC_ALPHA_ONE)
  3070. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  3071. #else
  3072. ymm_c1 = ymm_temp1;
  3073. #endif
  3074. #endif
  3075. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  3076. #if !defined(MKL_DC_BETA_ZERO)
  3077. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  3078. #if !defined(MKL_DC_BETA_ONE)
  3079. ymm_c2 = MKL_DC_MUL_YMM(ymm_c2, ymm_beta);
  3080. #endif
  3081. #if defined(MKL_DC_ALPHA_ONE)
  3082. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  3083. #else
  3084. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  3085. #endif
  3086. #else
  3087. #if !defined(MKL_DC_ALPHA_ONE)
  3088. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  3089. #else
  3090. ymm_c2 = ymm_temp2;
  3091. #endif
  3092. #endif
  3093. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  3094. #if !defined(MKL_DC_BETA_ZERO)
  3095. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+1));
  3096. #if !defined(MKL_DC_BETA_ONE)
  3097. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  3098. #endif
  3099. #if defined(MKL_DC_ALPHA_ONE)
  3100. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  3101. #else
  3102. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  3103. #endif
  3104. #else
  3105. #if !defined(MKL_DC_ALPHA_ONE)
  3106. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  3107. #else
  3108. ymm_c3 = ymm_temp3;
  3109. #endif
  3110. #endif
  3111. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+1), ymm_c3);
  3112. #if !defined(MKL_DC_BETA_ZERO)
  3113. ymm_c4 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  3114. #if !defined(MKL_DC_BETA_ONE)
  3115. ymm_c4 = MKL_DC_MUL_YMM(ymm_c4, ymm_beta);
  3116. #endif
  3117. #if defined(MKL_DC_ALPHA_ONE)
  3118. ymm_c4 = MKL_DC_ADD_YMM(ymm_temp4, ymm_c4);
  3119. #else
  3120. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp4, ymm_c4, ymm_temp);
  3121. #endif
  3122. #else
  3123. #if !defined(MKL_DC_ALPHA_ONE)
  3124. ymm_c4 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp4);
  3125. #else
  3126. ymm_c4 = ymm_temp4;
  3127. #endif
  3128. #endif
  3129. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c4);
  3130. #if !defined(MKL_DC_BETA_ZERO)
  3131. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+2));
  3132. #if !defined(MKL_DC_BETA_ONE)
  3133. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  3134. #endif
  3135. #if defined(MKL_DC_ALPHA_ONE)
  3136. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  3137. #else
  3138. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  3139. #endif
  3140. #else
  3141. #if !defined(MKL_DC_ALPHA_ONE)
  3142. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  3143. #else
  3144. ymm_c5 = ymm_temp5;
  3145. #endif
  3146. #endif
  3147. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+2), ymm_c5);
  3148. }
  3149. if ((m-i) & MKER1) {
  3150. ymm_temp0 = MKL_DC_SETZERO_YMM();
  3151. ymm_temp3 = MKL_DC_SETZERO_YMM();
  3152. ymm_temp5 = MKL_DC_SETZERO_YMM();
  3153. MKL_INT k;
  3154. for (k=0; k<k0; k+=k_in_ker) {
  3155. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3156. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3157. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3158. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3159. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3160. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  3161. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  3162. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  3163. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  3164. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3165. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  3166. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3167. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+2));
  3168. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  3169. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  3170. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  3171. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3172. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  3173. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3174. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+2));
  3175. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  3176. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  3177. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  3178. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3179. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  3180. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3181. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+2));
  3182. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  3183. }
  3184. if (krem & 2) {
  3185. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3186. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3187. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3188. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3189. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3190. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  3191. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  3192. k++;
  3193. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3194. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3195. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3196. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3197. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3198. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  3199. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  3200. k++;
  3201. }
  3202. if (krem & 1) {
  3203. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3204. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3205. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3206. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3207. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3208. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+2));
  3209. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp5, ymm_temp);
  3210. k++;
  3211. }
  3212. #if !defined(MKL_DC_BETA_ZERO)
  3213. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  3214. #if !defined(MKL_DC_BETA_ONE)
  3215. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  3216. #endif
  3217. #if defined(MKL_DC_ALPHA_ONE)
  3218. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  3219. #else
  3220. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  3221. #endif
  3222. #else
  3223. #if !defined(MKL_DC_ALPHA_ONE)
  3224. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  3225. #else
  3226. ymm_c0 = ymm_temp0;
  3227. #endif
  3228. #endif
  3229. #if !defined(MKL_DC_BETA_ZERO)
  3230. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  3231. #if !defined(MKL_DC_BETA_ONE)
  3232. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  3233. #endif
  3234. #if defined(MKL_DC_ALPHA_ONE)
  3235. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  3236. #else
  3237. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  3238. #endif
  3239. #else
  3240. #if !defined(MKL_DC_ALPHA_ONE)
  3241. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  3242. #else
  3243. ymm_c3 = ymm_temp3;
  3244. #endif
  3245. #endif
  3246. #if !defined(MKL_DC_BETA_ZERO)
  3247. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  3248. #if !defined(MKL_DC_BETA_ONE)
  3249. ymm_c5 = MKL_DC_MUL_YMM(ymm_c5, ymm_beta);
  3250. #endif
  3251. #if defined(MKL_DC_ALPHA_ONE)
  3252. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  3253. #else
  3254. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  3255. #endif
  3256. #else
  3257. #if !defined(MKL_DC_ALPHA_ONE)
  3258. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  3259. #else
  3260. ymm_c5 = ymm_temp5;
  3261. #endif
  3262. #endif
  3263. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+0), ymm_c0);
  3264. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c3);
  3265. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c5);
  3266. i += MKER1;
  3267. }
  3268. if ((m-i) & MKER2) {
  3269. xmm_temp0 = MKL_DC_SETZERO_XMM();
  3270. xmm_temp3 = MKL_DC_SETZERO_XMM();
  3271. xmm_temp5 = MKL_DC_SETZERO_XMM();
  3272. MKL_INT k;
  3273. for (k=0; k<k0; k+=k_in_ker) {
  3274. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3275. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3276. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3277. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3278. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3279. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3280. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3281. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  3282. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3283. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3284. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  3285. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  3286. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3287. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3288. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+1));
  3289. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3290. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3291. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+2));
  3292. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3293. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3294. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  3295. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  3296. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3297. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3298. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+1));
  3299. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3300. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3301. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+2));
  3302. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3303. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3304. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  3305. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  3306. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3307. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3308. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+1));
  3309. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3310. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3311. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+2));
  3312. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3313. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3314. }
  3315. if (krem & 2) {
  3316. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3317. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3318. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3319. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3320. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3321. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3322. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3323. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  3324. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3325. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3326. k++;
  3327. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3328. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3329. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3330. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3331. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3332. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3333. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3334. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  3335. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3336. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3337. k++;
  3338. }
  3339. if (krem & 1) {
  3340. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3341. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3342. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3343. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3344. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3345. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3346. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3347. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+2));
  3348. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3349. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3350. k++;
  3351. }
  3352. #if !defined(MKL_DC_BETA_ZERO)
  3353. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  3354. #if !defined(MKL_DC_BETA_ONE)
  3355. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  3356. #endif
  3357. #if defined(MKL_DC_ALPHA_ONE)
  3358. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  3359. #else
  3360. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  3361. #endif
  3362. #else
  3363. #if !defined(MKL_DC_ALPHA_ONE)
  3364. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  3365. #else
  3366. xmm_c = xmm_temp0;
  3367. #endif
  3368. #endif
  3369. #if !defined(MKL_DC_BETA_ZERO)
  3370. xmm_c3 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  3371. #if !defined(MKL_DC_BETA_ONE)
  3372. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  3373. #endif
  3374. #if defined(MKL_DC_ALPHA_ONE)
  3375. xmm_c3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c3);
  3376. #else
  3377. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  3378. #endif
  3379. #else
  3380. #if !defined(MKL_DC_ALPHA_ONE)
  3381. xmm_c3 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  3382. #else
  3383. xmm_c3 = xmm_temp3;
  3384. #endif
  3385. #endif
  3386. #if !defined(MKL_DC_BETA_ZERO)
  3387. xmm_c5 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+2));
  3388. #if !defined(MKL_DC_BETA_ONE)
  3389. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  3390. #endif
  3391. #if defined(MKL_DC_ALPHA_ONE)
  3392. xmm_c5 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c5);
  3393. #else
  3394. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  3395. #endif
  3396. #else
  3397. #if !defined(MKL_DC_ALPHA_ONE)
  3398. xmm_c5 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  3399. #else
  3400. xmm_c5 = xmm_temp5;
  3401. #endif
  3402. #endif
  3403. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  3404. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c3);
  3405. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+2), xmm_c5);
  3406. i += MKER2;
  3407. }
  3408. if ((m-i) & MKER3) {
  3409. xmm_temp0 = MKL_DC_SETZERO_XMM();
  3410. xmm_temp3 = MKL_DC_SETZERO_XMM();
  3411. xmm_temp5 = MKL_DC_SETZERO_XMM();
  3412. MKL_INT k;
  3413. for (k=0; k<k0; k+=k_in_ker) {
  3414. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3415. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3416. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3417. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3418. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3419. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  3420. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3421. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  3422. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  3423. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3424. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+1));
  3425. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3426. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+2));
  3427. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3428. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  3429. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  3430. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3431. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+1));
  3432. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3433. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+2));
  3434. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3435. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  3436. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  3437. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3438. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+1));
  3439. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3440. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+2));
  3441. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3442. }
  3443. if (krem & 2) {
  3444. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3445. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3446. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3447. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3448. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3449. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  3450. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3451. k++;
  3452. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3453. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3454. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3455. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3456. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3457. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  3458. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3459. k++;
  3460. }
  3461. if (krem & 1) {
  3462. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3463. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3464. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3465. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3466. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3467. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+2));
  3468. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3469. k++;
  3470. }
  3471. #if !defined(MKL_DC_BETA_ZERO)
  3472. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  3473. #if !defined(MKL_DC_BETA_ONE)
  3474. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  3475. #endif
  3476. #if defined(MKL_DC_ALPHA_ONE)
  3477. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  3478. #else
  3479. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  3480. #endif
  3481. #else
  3482. #if !defined(MKL_DC_ALPHA_ONE)
  3483. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  3484. #else
  3485. xmm_c = xmm_temp0;
  3486. #endif
  3487. #endif
  3488. #if !defined(MKL_DC_BETA_ZERO)
  3489. xmm_c3 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  3490. #if !defined(MKL_DC_BETA_ONE)
  3491. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  3492. #endif
  3493. #if defined(MKL_DC_ALPHA_ONE)
  3494. xmm_c3 = MKL_DC_ADD_XMM_S(xmm_temp3, xmm_c3);
  3495. #else
  3496. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  3497. #endif
  3498. #else
  3499. #if !defined(MKL_DC_ALPHA_ONE)
  3500. xmm_c3 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp3);
  3501. #else
  3502. xmm_c3 = xmm_temp3;
  3503. #endif
  3504. #endif
  3505. #if !defined(MKL_DC_BETA_ZERO)
  3506. xmm_c5 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+2));
  3507. #if !defined(MKL_DC_BETA_ONE)
  3508. xmm_c5 = MKL_DC_MUL_XMM(xmm_c5, xmm_beta);
  3509. #endif
  3510. #if defined(MKL_DC_ALPHA_ONE)
  3511. xmm_c5 = MKL_DC_ADD_XMM_S(xmm_temp5, xmm_c5);
  3512. #else
  3513. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp5, xmm_c5, xmm_temp);
  3514. #endif
  3515. #else
  3516. #if !defined(MKL_DC_ALPHA_ONE)
  3517. xmm_c5 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp5);
  3518. #else
  3519. xmm_c5 = xmm_temp5;
  3520. #endif
  3521. #endif
  3522. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  3523. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1), xmm_c3);
  3524. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+2), xmm_c5);
  3525. i += MKER3;
  3526. }
  3527. } else if ((n-j) == 2) {
  3528. MKL_INT i;
  3529. for (i=0; i<m0; i+=m_in_ker) {
  3530. ymm_temp0 = MKL_DC_SETZERO_YMM();
  3531. ymm_temp1 = MKL_DC_SETZERO_YMM();
  3532. ymm_temp2 = MKL_DC_SETZERO_YMM();
  3533. ymm_temp3 = MKL_DC_SETZERO_YMM();
  3534. ymm_temp4 = MKL_DC_SETZERO_YMM();
  3535. ymm_temp5 = MKL_DC_SETZERO_YMM();
  3536. ymm_temp6 = MKL_DC_SETZERO_YMM();
  3537. ymm_temp7 = MKL_DC_SETZERO_YMM();
  3538. MKL_INT k;
  3539. for (k=0; k<k0; k+=k_in_ker) {
  3540. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3541. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3542. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3543. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3544. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3545. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3546. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3547. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3548. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  3549. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  3550. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3551. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  3552. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  3553. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  3554. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  3555. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  3556. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  3557. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  3558. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3559. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  3560. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3561. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  3562. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3563. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3564. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  3565. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  3566. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3567. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  3568. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  3569. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  3570. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  3571. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  3572. }
  3573. if (krem & 2) {
  3574. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3575. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3576. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3577. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3578. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3579. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3580. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3581. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3582. k++;
  3583. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3584. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3585. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3586. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3587. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp5, ymm_temp);
  3588. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3589. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp6, ymm_temp);
  3590. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp7, ymm_temp);
  3591. k++;
  3592. }
  3593. if (kK>=2) {
  3594. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp4);
  3595. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_temp5);
  3596. ymm_temp2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_temp6);
  3597. ymm_temp3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_temp7);
  3598. }
  3599. if (krem & 1) {
  3600. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3601. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3602. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3603. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3604. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3605. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3606. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3607. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3608. k++;
  3609. }
  3610. #if !defined(MKL_DC_BETA_ZERO)
  3611. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  3612. #if !defined(MKL_DC_BETA_ONE)
  3613. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  3614. #endif
  3615. #if defined(MKL_DC_ALPHA_ONE)
  3616. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  3617. #else
  3618. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  3619. #endif
  3620. #else
  3621. #if !defined(MKL_DC_ALPHA_ONE)
  3622. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  3623. #else
  3624. ymm_c0 = ymm_temp0;
  3625. #endif
  3626. #endif
  3627. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  3628. #if !defined(MKL_DC_BETA_ZERO)
  3629. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  3630. #if !defined(MKL_DC_BETA_ONE)
  3631. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  3632. #endif
  3633. #if defined(MKL_DC_ALPHA_ONE)
  3634. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  3635. #else
  3636. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  3637. #endif
  3638. #else
  3639. #if !defined(MKL_DC_ALPHA_ONE)
  3640. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  3641. #else
  3642. ymm_c1 = ymm_temp1;
  3643. #endif
  3644. #endif
  3645. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  3646. #if !defined(MKL_DC_BETA_ZERO)
  3647. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  3648. #if !defined(MKL_DC_BETA_ONE)
  3649. ymm_c2 = MKL_DC_MUL_YMM(ymm_c2, ymm_beta);
  3650. #endif
  3651. #if defined(MKL_DC_ALPHA_ONE)
  3652. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  3653. #else
  3654. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  3655. #endif
  3656. #else
  3657. #if !defined(MKL_DC_ALPHA_ONE)
  3658. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  3659. #else
  3660. ymm_c2 = ymm_temp2;
  3661. #endif
  3662. #endif
  3663. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  3664. #if !defined(MKL_DC_BETA_ZERO)
  3665. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j+1));
  3666. #if !defined(MKL_DC_BETA_ONE)
  3667. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  3668. #endif
  3669. #if defined(MKL_DC_ALPHA_ONE)
  3670. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  3671. #else
  3672. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  3673. #endif
  3674. #else
  3675. #if !defined(MKL_DC_ALPHA_ONE)
  3676. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  3677. #else
  3678. ymm_c3 = ymm_temp3;
  3679. #endif
  3680. #endif
  3681. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j+1), ymm_c3);
  3682. }
  3683. if ((m-i) & MKER1) {
  3684. #if !defined(MKL_DC_BETA_ZERO)
  3685. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  3686. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  3687. #endif
  3688. ymm_temp0 = MKL_DC_SETZERO_YMM();
  3689. ymm_temp3 = MKL_DC_SETZERO_YMM();
  3690. ymm_temp4 = MKL_DC_SETZERO_YMM();
  3691. ymm_temp7 = MKL_DC_SETZERO_YMM();
  3692. MKL_INT k;
  3693. for (k=0; k<k0; k+=k_in_ker) {
  3694. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3695. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3696. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3697. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3698. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3699. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  3700. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  3701. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3702. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j+1));
  3703. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  3704. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  3705. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  3706. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3707. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j+1));
  3708. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3709. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  3710. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  3711. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3712. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j+1));
  3713. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  3714. }
  3715. if (krem & 2) {
  3716. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3717. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3718. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3719. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3720. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3721. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3722. k++;
  3723. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3724. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3725. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp4, ymm_temp);
  3726. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3727. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3728. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp7, ymm_temp);
  3729. k++;
  3730. }
  3731. if (kK>=2) {
  3732. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp4);
  3733. ymm_temp3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_temp7);
  3734. }
  3735. if (krem & 1) {
  3736. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3737. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3738. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3739. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3740. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j+1));
  3741. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp3, ymm_temp);
  3742. k++;
  3743. }
  3744. #if !defined(MKL_DC_BETA_ZERO)
  3745. #if !defined(MKL_DC_BETA_ONE)
  3746. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  3747. #endif
  3748. #if defined(MKL_DC_ALPHA_ONE)
  3749. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  3750. #else
  3751. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  3752. #endif
  3753. #else
  3754. #if !defined(MKL_DC_ALPHA_ONE)
  3755. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  3756. #else
  3757. ymm_c0 = ymm_temp0;
  3758. #endif
  3759. #endif
  3760. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  3761. #if !defined(MKL_DC_BETA_ZERO)
  3762. #if !defined(MKL_DC_BETA_ONE)
  3763. ymm_c3 = MKL_DC_MUL_YMM(ymm_c3, ymm_beta);
  3764. #endif
  3765. #if defined(MKL_DC_ALPHA_ONE)
  3766. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  3767. #else
  3768. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  3769. #endif
  3770. #else
  3771. #if !defined(MKL_DC_ALPHA_ONE)
  3772. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  3773. #else
  3774. ymm_c3 = ymm_temp3;
  3775. #endif
  3776. #endif
  3777. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c3);
  3778. i += MKER1;
  3779. }
  3780. if ((m-i) & MKER2) {
  3781. xmm_temp0 = MKL_DC_SETZERO_XMM();
  3782. xmm_temp3 = MKL_DC_SETZERO_XMM();
  3783. xmm_temp5 = MKL_DC_SETZERO_XMM();
  3784. xmm_temp7 = MKL_DC_SETZERO_XMM();
  3785. #if !defined(MKL_DC_BETA_ZERO)
  3786. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  3787. xmm_c3 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  3788. #endif
  3789. MKL_INT k;
  3790. for (k=0; k<k0; k+=k_in_ker) {
  3791. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3792. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3793. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3794. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3795. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3796. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3797. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3798. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  3799. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  3800. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3801. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3802. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j+1));
  3803. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3804. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  3805. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  3806. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  3807. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3808. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3809. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j+1));
  3810. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3811. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3812. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  3813. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  3814. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3815. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3816. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j+1));
  3817. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3818. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  3819. }
  3820. if (krem & 2) {
  3821. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3822. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3823. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3824. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3825. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3826. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3827. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3828. k++;
  3829. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3830. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3831. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3832. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp5, xmm_b);
  3833. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3834. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3835. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp7, xmm_b);
  3836. k++;
  3837. }
  3838. if (kK>=2) {
  3839. xmm_temp0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_temp5);
  3840. xmm_temp3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_temp7);
  3841. }
  3842. if (krem & 1) {
  3843. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  3844. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  3845. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3846. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3847. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j+1));
  3848. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  3849. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3850. k++;
  3851. }
  3852. #if !defined(MKL_DC_BETA_ZERO)
  3853. #if !defined(MKL_DC_BETA_ONE)
  3854. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  3855. #endif
  3856. #if defined(MKL_DC_ALPHA_ONE)
  3857. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  3858. #else
  3859. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  3860. #endif
  3861. #else
  3862. #if !defined(MKL_DC_ALPHA_ONE)
  3863. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  3864. #else
  3865. xmm_c = xmm_temp0;
  3866. #endif
  3867. #endif
  3868. #if !defined(MKL_DC_BETA_ZERO)
  3869. #if !defined(MKL_DC_BETA_ONE)
  3870. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  3871. #endif
  3872. #if defined(MKL_DC_ALPHA_ONE)
  3873. xmm_c3 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c3);
  3874. #else
  3875. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  3876. #endif
  3877. #else
  3878. #if !defined(MKL_DC_ALPHA_ONE)
  3879. xmm_c3 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  3880. #else
  3881. xmm_c3 = xmm_temp3;
  3882. #endif
  3883. #endif
  3884. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  3885. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c3);
  3886. i += MKER2;
  3887. }
  3888. if ((m - i) & MKER3) {
  3889. xmm_temp0 = MKL_DC_SETZERO_XMM();
  3890. xmm_temp3 = MKL_DC_SETZERO_XMM();
  3891. #if !defined(MKL_DC_BETA_ZERO)
  3892. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  3893. xmm_c3 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  3894. #endif
  3895. MKL_INT k;
  3896. for (k=0; k<k0; k+=k_in_ker) {
  3897. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3898. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3899. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3900. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3901. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3902. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  3903. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  3904. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3905. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j+1));
  3906. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3907. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  3908. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  3909. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3910. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j+1));
  3911. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3912. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  3913. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  3914. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3915. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j+1));
  3916. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3917. }
  3918. if (krem & 2) {
  3919. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3920. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3921. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3922. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3923. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3924. k++;
  3925. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3926. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3927. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3928. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3929. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3930. k++;
  3931. }
  3932. if (krem & 1) {
  3933. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  3934. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  3935. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  3936. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j+1));
  3937. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  3938. k++;
  3939. }
  3940. #if !defined(MKL_DC_BETA_ZERO)
  3941. #if !defined(MKL_DC_BETA_ONE)
  3942. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  3943. #endif
  3944. #if defined(MKL_DC_ALPHA_ONE)
  3945. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  3946. #else
  3947. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  3948. #endif
  3949. #else
  3950. #if !defined(MKL_DC_ALPHA_ONE)
  3951. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  3952. #else
  3953. xmm_c = xmm_temp0;
  3954. #endif
  3955. #endif
  3956. #if !defined(MKL_DC_BETA_ZERO)
  3957. #if !defined(MKL_DC_BETA_ONE)
  3958. xmm_c3 = MKL_DC_MUL_XMM(xmm_c3, xmm_beta);
  3959. #endif
  3960. #if defined(MKL_DC_ALPHA_ONE)
  3961. xmm_c3 = MKL_DC_ADD_XMM_S(xmm_temp3, xmm_c3);
  3962. #else
  3963. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp3, xmm_c3, xmm_temp);
  3964. #endif
  3965. #else
  3966. #if !defined(MKL_DC_ALPHA_ONE)
  3967. xmm_c3 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp3);
  3968. #else
  3969. xmm_c3 = xmm_temp3;
  3970. #endif
  3971. #endif
  3972. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  3973. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1), xmm_c3);
  3974. i += MKER3;
  3975. }
  3976. } else if ((n-j) == 1) {
  3977. MKL_INT i;
  3978. for (i=0; i<m0; i+=m_in_ker) {
  3979. ymm_temp0 = MKL_DC_SETZERO_YMM();
  3980. ymm_temp1 = MKL_DC_SETZERO_YMM();
  3981. ymm_temp2 = MKL_DC_SETZERO_YMM();
  3982. ymm_temp3 = MKL_DC_SETZERO_YMM();
  3983. MKL_INT k;
  3984. for (k=0; k<k0; k+=k_in_ker) {
  3985. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  3986. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  3987. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3988. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  3989. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  3990. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  3991. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  3992. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  3993. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 1));
  3994. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  3995. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  3996. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  3997. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  3998. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 2));
  3999. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  4000. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  4001. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  4002. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  4003. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 3));
  4004. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  4005. }
  4006. if (krem & 2) {
  4007. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  4008. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  4009. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  4010. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  4011. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  4012. k++;
  4013. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  4014. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  4015. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp2, ymm_temp);
  4016. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  4017. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp3, ymm_temp);
  4018. k++;
  4019. }
  4020. if (kK>=2) {
  4021. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp2);
  4022. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_temp3);
  4023. }
  4024. if (krem & 1) {
  4025. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  4026. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  4027. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  4028. ymm_a1 = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+4,k + 0));
  4029. MKL_DC_MUL_ADD_YMM(ymm_a1, ymm_b, ymm_temp1, ymm_temp);
  4030. k++;
  4031. }
  4032. #if !defined(MKL_DC_BETA_ZERO)
  4033. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  4034. #if !defined(MKL_DC_BETA_ONE)
  4035. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  4036. #endif
  4037. #if defined(MKL_DC_ALPHA_ONE)
  4038. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  4039. #else
  4040. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  4041. #endif
  4042. #else
  4043. #if !defined(MKL_DC_ALPHA_ONE)
  4044. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  4045. #else
  4046. ymm_c0 = ymm_temp0;
  4047. #endif
  4048. #endif
  4049. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  4050. #if !defined(MKL_DC_BETA_ZERO)
  4051. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i+4,j));
  4052. #if !defined(MKL_DC_BETA_ONE)
  4053. ymm_c1 = MKL_DC_MUL_YMM(ymm_c1, ymm_beta);
  4054. #endif
  4055. #if defined(MKL_DC_ALPHA_ONE)
  4056. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  4057. #else
  4058. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  4059. #endif
  4060. #else
  4061. #if !defined(MKL_DC_ALPHA_ONE)
  4062. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  4063. #else
  4064. ymm_c1 = ymm_temp1;
  4065. #endif
  4066. #endif
  4067. MKL_DC_STORE_YMM(&MKL_DC_CC(i+4,j), ymm_c1);
  4068. }
  4069. if ((m-i) & MKER1) {
  4070. #if !defined(MKL_DC_BETA_ZERO)
  4071. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  4072. #endif
  4073. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4074. ymm_temp1 = MKL_DC_SETZERO_YMM();
  4075. MKL_INT k;
  4076. for (k=0; k<k0; k+=k_in_ker) {
  4077. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  4078. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  4079. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  4080. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 1));
  4081. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 1,j));
  4082. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp1, ymm_temp);
  4083. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 2));
  4084. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 2,j));
  4085. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  4086. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 3));
  4087. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 3,j));
  4088. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp1, ymm_temp);
  4089. }
  4090. if (krem & 2) {
  4091. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  4092. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  4093. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  4094. k++;
  4095. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  4096. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  4097. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp1, ymm_temp);
  4098. k++;
  4099. }
  4100. if (kK>=2) {
  4101. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp1);
  4102. }
  4103. if (krem & 1) {
  4104. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i,k + 0));
  4105. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k + 0,j));
  4106. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b, ymm_temp0, ymm_temp);
  4107. k++;
  4108. }
  4109. #if !defined(MKL_DC_BETA_ZERO)
  4110. #if !defined(MKL_DC_BETA_ONE)
  4111. ymm_c0 = MKL_DC_MUL_YMM(ymm_c0, ymm_beta);
  4112. #endif
  4113. #if defined(MKL_DC_ALPHA_ONE)
  4114. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  4115. #else
  4116. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  4117. #endif
  4118. #else
  4119. #if !defined(MKL_DC_ALPHA_ONE)
  4120. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  4121. #else
  4122. ymm_c0 = ymm_temp0;
  4123. #endif
  4124. #endif
  4125. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  4126. i += MKER1;
  4127. }
  4128. if ((m-i) & MKER2) {
  4129. xmm_temp0 = MKL_DC_SETZERO_XMM();
  4130. xmm_temp3 = MKL_DC_SETZERO_XMM();
  4131. #if !defined(MKL_DC_BETA_ZERO)
  4132. xmm_c = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j));
  4133. #endif
  4134. MKL_INT k;
  4135. for (k=0; k<k0; k+=k_in_ker) {
  4136. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  4137. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  4138. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  4139. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4140. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+1));
  4141. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+1,j));
  4142. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  4143. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  4144. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+2));
  4145. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+2,j));
  4146. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  4147. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4148. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k+3));
  4149. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k+3,j));
  4150. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  4151. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  4152. }
  4153. if (krem & 2) {
  4154. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  4155. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  4156. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  4157. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4158. k++;
  4159. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  4160. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  4161. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  4162. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp3, xmm_b);
  4163. k++;
  4164. }
  4165. if (kK>=2) {
  4166. xmm_temp0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_temp3);
  4167. }
  4168. if (krem & 1) {
  4169. xmm_a = MKL_DC_LOAD_XMM(&MKL_DC_AA(i,k));
  4170. ymm_b = MKL_DC_BCAST_YMM(&MKL_DC_BB(k,j));
  4171. xmm_b = MKL_DC_CAST_YMM_TO_XMM(ymm_b);
  4172. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4173. k++;
  4174. }
  4175. #if !defined(MKL_DC_BETA_ZERO)
  4176. #if !defined(MKL_DC_BETA_ONE)
  4177. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  4178. #endif
  4179. #if defined(MKL_DC_ALPHA_ONE)
  4180. xmm_c = MKL_DC_ADD_XMM(xmm_temp0, xmm_c);
  4181. #else
  4182. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  4183. #endif
  4184. #else
  4185. #if !defined(MKL_DC_ALPHA_ONE)
  4186. xmm_c = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  4187. #else
  4188. xmm_c = xmm_temp0;
  4189. #endif
  4190. #endif
  4191. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j), xmm_c);
  4192. i += MKER2;
  4193. }
  4194. if ((m-i) & MKER3) {
  4195. xmm_temp0 = MKL_DC_SETZERO_XMM();
  4196. xmm_temp3 = MKL_DC_SETZERO_XMM();
  4197. #if !defined(MKL_DC_BETA_ZERO)
  4198. xmm_c = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j));
  4199. #endif
  4200. MKL_INT k;
  4201. for (k=0; k<k0; k+=k_in_ker) {
  4202. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  4203. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  4204. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4205. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+1));
  4206. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1,j));
  4207. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  4208. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+2));
  4209. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2,j));
  4210. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4211. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k+3));
  4212. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3,j));
  4213. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  4214. }
  4215. if (krem & 2) {
  4216. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  4217. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  4218. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4219. k++;
  4220. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  4221. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  4222. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp3, xmm_b);
  4223. k++;
  4224. }
  4225. if (kK>=2) {
  4226. xmm_temp0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_temp3);
  4227. }
  4228. if (krem & 1) {
  4229. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i,k));
  4230. xmm_b = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k,j));
  4231. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b, xmm_temp0, xmm_b);
  4232. k++;
  4233. }
  4234. #if !defined(MKL_DC_BETA_ZERO)
  4235. #if !defined(MKL_DC_BETA_ONE)
  4236. xmm_c = MKL_DC_MUL_XMM(xmm_c, xmm_beta);
  4237. #endif
  4238. #if defined(MKL_DC_ALPHA_ONE)
  4239. xmm_c = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c);
  4240. #else
  4241. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c, xmm_temp);
  4242. #endif
  4243. #else
  4244. #if !defined(MKL_DC_ALPHA_ONE)
  4245. xmm_c = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  4246. #else
  4247. xmm_c = xmm_temp0;
  4248. #endif
  4249. #endif
  4250. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j), xmm_c);
  4251. i += MKER3;
  4252. }
  4253. }
  4254. }
  4255. static __inline void MKL_DC_FNAME_GEMM_KERNEL(dgemm_tn_mnk)
  4256. (MKL_INT m, MKL_INT n, MKL_INT kK,
  4257. const mkl_dc_type * ALPHA,
  4258. const mkl_dc_type * A, MKL_INT lda,
  4259. const mkl_dc_type * B, MKL_INT ldb,
  4260. const mkl_dc_type * BETA,
  4261. mkl_dc_type * C, MKL_INT ldc)
  4262. {
  4263. #undef MKL_DC_AA
  4264. #undef MKL_DC_BB
  4265. #undef MKL_DC_CC
  4266. #define MKL_DC_AA(i,j) ((A)[(j)+lda*(i)])
  4267. #define MKL_DC_BB(i,j) ((B)[(i)+ldb*(j)])
  4268. #define MKL_DC_CC(i,j) ((C)[(i)+ldc*(j)])
  4269. const MKL_INT m_in_ker = 4;
  4270. const MKL_INT n_in_ker = 2;
  4271. const MKL_INT k_in_ker = 4;
  4272. const MKL_INT64 MASKBIT = ((MKL_INT64) 1<<63);
  4273. MKL_INT m0 = (m/m_in_ker)*m_in_ker;
  4274. MKL_INT n0 = (n/n_in_ker)*n_in_ker;
  4275. MKL_INT k0 = (kK/k_in_ker)*k_in_ker;
  4276. MKL_INT krem = kK - k0;
  4277. MKL_DC_YMMTYPE ymm_temp;
  4278. MKL_DC_YMMTYPE ymm_temp0, ymm_temp1;
  4279. MKL_DC_YMMTYPE ymm_temp2, ymm_temp3;
  4280. MKL_DC_YMMTYPE ymm_temp4, ymm_temp5;
  4281. MKL_DC_YMMTYPE ymm_temp6, ymm_temp7;
  4282. MKL_DC_YMMTYPE ymm_temp8, ymm_temp9;
  4283. MKL_DC_YMMTYPE ymm_temp10, ymm_temp11;
  4284. MKL_DC_YMMTYPE ymm_temp20, ymm_temp21;
  4285. MKL_DC_YMMTYPE ymm_temp22, ymm_temp23;
  4286. MKL_DC_YMMTYPE ymm_c0, ymm_c1;
  4287. MKL_DC_YMMTYPE ymm_c2, ymm_c3;
  4288. MKL_DC_YMMTYPE ymm_c4, ymm_c5;
  4289. MKL_DC_YMMTYPE ymm_c6, ymm_c7;
  4290. MKL_DC_YMMTYPE ymm_b0, ymm_b1, ymm_b2;
  4291. MKL_DC_YMMTYPE ymm_a;
  4292. MKL_DC_YMMTYPE ymm_alpha;
  4293. MKL_DC_XMMTYPE xmm_a, xmm_b0, xmm_b1;
  4294. MKL_DC_XMMTYPE xmm_temp0, xmm_temp1, xmm_temp2, xmm_temp3;
  4295. MKL_DC_XMMTYPE xmm_temp4, xmm_temp5, xmm_temp6, xmm_temp7;
  4296. MKL_DC_XMMTYPE xmm_temp8, xmm_temp9, xmm_temp10, xmm_temp11;
  4297. MKL_DC_XMMTYPE xmm_temp;
  4298. MKL_DC_XMMTYPE xmm_c0, xmm_c1, xmm_c2, xmm_c3;
  4299. MKL_DC_XMMTYPE xmm_c4, xmm_c5, xmm_c6, xmm_c7;
  4300. MKL_DC_XMMTYPE xmm_alpha;
  4301. #if !defined(MKL_DC_ALPHA_ZERO) && !defined(MKL_DC_ALPHA_ONE)
  4302. ymm_alpha = MKL_DC_BCAST_YMM(ALPHA);
  4303. xmm_alpha = MKL_DC_CAST_YMM_TO_XMM(ymm_alpha);
  4304. #endif
  4305. #if !defined(MKL_DC_BETA_ZERO) && !defined(MKL_DC_BETA_ONE)
  4306. MKL_DC_YMMTYPE ymm_beta = MKL_DC_BCAST_YMM(BETA);
  4307. MKL_DC_XMMTYPE xmm_beta = MKL_DC_CAST_YMM_TO_XMM(ymm_beta);
  4308. #endif
  4309. __m256i k_mask;
  4310. if (krem == 1) {
  4311. k_mask = _mm256_set_epi64x(0, 0, 0, MASKBIT);
  4312. } else if (krem == 2) {
  4313. k_mask = _mm256_set_epi64x(0, 0, MASKBIT, MASKBIT);
  4314. } else if (krem == 3) {
  4315. k_mask = _mm256_set_epi64x(0, MASKBIT, MASKBIT, MASKBIT);
  4316. }
  4317. MKL_INT j;
  4318. for (j=0; j<n0; j+=n_in_ker) {
  4319. MKL_INT i;
  4320. for (i=0; i<m0; i+=m_in_ker) {
  4321. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4322. ymm_temp1 = MKL_DC_SETZERO_YMM();
  4323. ymm_temp2 = MKL_DC_SETZERO_YMM();
  4324. ymm_temp3 = MKL_DC_SETZERO_YMM();
  4325. ymm_temp4 = MKL_DC_SETZERO_YMM();
  4326. ymm_temp5 = MKL_DC_SETZERO_YMM();
  4327. ymm_temp6 = MKL_DC_SETZERO_YMM();
  4328. ymm_temp7 = MKL_DC_SETZERO_YMM();
  4329. MKL_INT k;
  4330. for (k=0; k<k0; k+=k_in_ker) {
  4331. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i + 0, k + 0));
  4332. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4333. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+1));
  4334. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4335. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4336. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+1,k + 0));
  4337. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4338. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  4339. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+2,k + 0));
  4340. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4341. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp5, ymm_temp);
  4342. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+3,k + 0));
  4343. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp6, ymm_temp);
  4344. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp7, ymm_temp);
  4345. }
  4346. if (krem) {
  4347. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4348. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4349. ymm_b1 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+1), k_mask);
  4350. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4351. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4352. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+1,k + 0), k_mask);
  4353. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4354. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  4355. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+2,k + 0), k_mask);
  4356. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4357. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp5, ymm_temp);
  4358. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+3,k + 0), k_mask);
  4359. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp6, ymm_temp);
  4360. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp7, ymm_temp);
  4361. }
  4362. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp2);
  4363. ymm_temp21 = MKL_DC_HADD_YMM(ymm_temp4, ymm_temp6);
  4364. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x21);
  4365. ymm_temp23 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x30);
  4366. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp22, ymm_temp23);
  4367. #if !defined(MKL_DC_BETA_ZERO)
  4368. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  4369. #if !defined(MKL_DC_BETA_ONE)
  4370. ymm_c0 = MKL_DC_MUL_YMM(ymm_beta, ymm_c0);
  4371. #endif
  4372. #if defined(MKL_DC_ALPHA_ONE)
  4373. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  4374. #else
  4375. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  4376. #endif
  4377. #else
  4378. #if !defined(MKL_DC_ALPHA_ONE)
  4379. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  4380. #else
  4381. ymm_c0 = ymm_temp0;
  4382. #endif
  4383. #endif
  4384. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  4385. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp1, ymm_temp3);
  4386. ymm_temp21 = MKL_DC_HADD_YMM(ymm_temp5, ymm_temp7);
  4387. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x21);
  4388. ymm_temp23 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x30);
  4389. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp22, ymm_temp23);
  4390. #if !defined(MKL_DC_BETA_ZERO)
  4391. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  4392. #if !defined(MKL_DC_BETA_ONE)
  4393. ymm_c1 = MKL_DC_MUL_YMM(ymm_beta, ymm_c1);
  4394. #endif
  4395. #if defined(MKL_DC_ALPHA_ONE)
  4396. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  4397. #else
  4398. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  4399. #endif
  4400. #else
  4401. #if !defined(MKL_DC_ALPHA_ONE)
  4402. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  4403. #else
  4404. ymm_c1 = ymm_temp1;
  4405. #endif
  4406. #endif
  4407. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c1);
  4408. }
  4409. if ((m-i) == 3) {
  4410. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4411. ymm_temp1 = MKL_DC_SETZERO_YMM();
  4412. ymm_temp2 = MKL_DC_SETZERO_YMM();
  4413. ymm_temp3 = MKL_DC_SETZERO_YMM();
  4414. ymm_temp4 = MKL_DC_SETZERO_YMM();
  4415. ymm_temp5 = MKL_DC_SETZERO_YMM();
  4416. ymm_temp6 = MKL_DC_SETZERO_YMM();
  4417. ymm_temp7 = MKL_DC_SETZERO_YMM();
  4418. __m256i m_mask = _mm256_set_epi64x(0, MASKBIT, MASKBIT, MASKBIT);
  4419. MKL_INT k;
  4420. for (k=0; k<k0; k+=k_in_ker) {
  4421. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i + 0, k + 0));
  4422. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4423. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+1));
  4424. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4425. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4426. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+1,k + 0));
  4427. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4428. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  4429. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+2,k + 0));
  4430. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4431. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp5, ymm_temp);
  4432. }
  4433. if (krem) {
  4434. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4435. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4436. ymm_b1 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+1), k_mask);
  4437. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4438. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4439. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+1,k + 0), k_mask);
  4440. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4441. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  4442. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+2,k + 0), k_mask);
  4443. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4444. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp5, ymm_temp);
  4445. }
  4446. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp2);
  4447. ymm_temp21 = MKL_DC_HADD_YMM(ymm_temp4, ymm_temp6);
  4448. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x21);
  4449. ymm_temp23 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x30);
  4450. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp22, ymm_temp23);
  4451. #if !defined(MKL_DC_BETA_ZERO)
  4452. ymm_c0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_CC(i,j), m_mask);
  4453. #if !defined(MKL_DC_BETA_ONE)
  4454. ymm_c0 = MKL_DC_MUL_YMM(ymm_beta, ymm_c0);
  4455. #endif
  4456. #if defined(MKL_DC_ALPHA_ONE)
  4457. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  4458. #else
  4459. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  4460. #endif
  4461. #else
  4462. #if !defined(MKL_DC_ALPHA_ONE)
  4463. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  4464. #else
  4465. ymm_c0 = ymm_temp0;
  4466. #endif
  4467. #endif
  4468. MKL_DC_MASKSTORE_YMM(&MKL_DC_CC(i,j), m_mask, ymm_c0);
  4469. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp1, ymm_temp3);
  4470. ymm_temp21 = MKL_DC_HADD_YMM(ymm_temp5, ymm_temp7);
  4471. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x21);
  4472. ymm_temp23 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x30);
  4473. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp22, ymm_temp23);
  4474. #if !defined(MKL_DC_BETA_ZERO)
  4475. ymm_c1 = MKL_DC_MASKLOAD_YMM(&MKL_DC_CC(i,j+1), m_mask);
  4476. #if !defined(MKL_DC_BETA_ONE)
  4477. ymm_c1 = MKL_DC_MUL_YMM(ymm_beta, ymm_c1);
  4478. #endif
  4479. #if defined(MKL_DC_ALPHA_ONE)
  4480. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  4481. #else
  4482. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  4483. #endif
  4484. #else
  4485. #if !defined(MKL_DC_ALPHA_ONE)
  4486. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  4487. #else
  4488. ymm_c1 = ymm_temp1;
  4489. #endif
  4490. #endif
  4491. MKL_DC_MASKSTORE_YMM(&MKL_DC_CC(i,j+1), m_mask, ymm_c1);
  4492. } else if ((m-i) == 2) {
  4493. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4494. ymm_temp1 = MKL_DC_SETZERO_YMM();
  4495. ymm_temp2 = MKL_DC_SETZERO_YMM();
  4496. ymm_temp3 = MKL_DC_SETZERO_YMM();
  4497. ymm_temp4 = MKL_DC_SETZERO_YMM();
  4498. MKL_INT k;
  4499. for (k=0; k<k0; k+=k_in_ker) {
  4500. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+0, k + 0));
  4501. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4502. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+1));
  4503. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4504. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4505. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+1,k + 0));
  4506. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4507. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  4508. }
  4509. if (krem) {
  4510. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4511. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4512. ymm_b1 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+1), k_mask);
  4513. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4514. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4515. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+1,k + 0), k_mask);
  4516. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4517. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  4518. }
  4519. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp2);
  4520. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp20, 0x01);
  4521. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp20, ymm_temp22);
  4522. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  4523. #if !defined(MKL_DC_BETA_ZERO)
  4524. xmm_c1 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+0));
  4525. #if !defined(MKL_DC_BETA_ONE)
  4526. xmm_c1 = MKL_DC_MUL_XMM(xmm_beta, xmm_c1);
  4527. #endif
  4528. #if defined(MKL_DC_ALPHA_ONE)
  4529. xmm_c1 = MKL_DC_ADD_XMM(xmm_temp0, xmm_c1);
  4530. #else
  4531. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c1, xmm_temp);
  4532. #endif
  4533. #else
  4534. #if !defined(MKL_DC_ALPHA_ONE)
  4535. xmm_c1 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  4536. #else
  4537. xmm_c1 = xmm_temp0;
  4538. #endif
  4539. #endif
  4540. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+0), xmm_c1);
  4541. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp1, ymm_temp3);
  4542. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp20, 0x01);
  4543. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp20, ymm_temp22);
  4544. xmm_temp1 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp1);
  4545. #if !defined(MKL_DC_BETA_ZERO)
  4546. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  4547. #if !defined(MKL_DC_BETA_ONE)
  4548. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  4549. #endif
  4550. #if defined(MKL_DC_ALPHA_ONE)
  4551. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp1, xmm_c2);
  4552. #else
  4553. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp1, xmm_c2, xmm_temp);
  4554. #endif
  4555. #else
  4556. #if !defined(MKL_DC_ALPHA_ONE)
  4557. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp1);
  4558. #else
  4559. xmm_c2 = xmm_temp1;
  4560. #endif
  4561. #endif
  4562. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c2);
  4563. } else if ((m-i) == 1) {
  4564. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4565. ymm_temp1 = MKL_DC_SETZERO_YMM();
  4566. MKL_INT k;
  4567. for (k=0; k<k0; k+=k_in_ker) {
  4568. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+0, k + 0));
  4569. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4570. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+1));
  4571. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4572. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4573. }
  4574. if (krem) {
  4575. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4576. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4577. ymm_b1 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+1), k_mask);
  4578. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4579. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  4580. }
  4581. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp0);
  4582. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp20, 0x01);
  4583. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp20, ymm_temp22);
  4584. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  4585. #if !defined(MKL_DC_BETA_ZERO)
  4586. xmm_c1 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  4587. #if !defined(MKL_DC_BETA_ONE)
  4588. xmm_c1 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c1);
  4589. #endif
  4590. #if defined(MKL_DC_ALPHA_ONE)
  4591. xmm_c1 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c1);
  4592. #else
  4593. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c1, xmm_temp);
  4594. #endif
  4595. #else
  4596. #if !defined(MKL_DC_ALPHA_ONE)
  4597. xmm_c1 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  4598. #else
  4599. xmm_c1 = xmm_temp0;
  4600. #endif
  4601. #endif
  4602. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0), xmm_c1);
  4603. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp1, ymm_temp1);
  4604. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp20, 0x01);
  4605. ymm_temp1 = MKL_DC_ADD_YMM(ymm_temp20, ymm_temp22);
  4606. xmm_temp1 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp1);
  4607. #if !defined(MKL_DC_BETA_ZERO)
  4608. xmm_c2 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  4609. #if !defined(MKL_DC_BETA_ONE)
  4610. xmm_c2 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c2);
  4611. #endif
  4612. #if defined(MKL_DC_ALPHA_ONE)
  4613. xmm_c2 = MKL_DC_ADD_XMM_S(xmm_temp1, xmm_c2);
  4614. #else
  4615. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp1, xmm_c2, xmm_temp);
  4616. #endif
  4617. #else
  4618. #if !defined(MKL_DC_ALPHA_ONE)
  4619. xmm_c2 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp1);
  4620. #else
  4621. xmm_c2 = xmm_temp1;
  4622. #endif
  4623. #endif
  4624. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1), xmm_c2);
  4625. }
  4626. }
  4627. if (n-j) {
  4628. MKL_INT i;
  4629. for (i=0; i<m0; i+=m_in_ker) {
  4630. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4631. ymm_temp2 = MKL_DC_SETZERO_YMM();
  4632. ymm_temp4 = MKL_DC_SETZERO_YMM();
  4633. ymm_temp6 = MKL_DC_SETZERO_YMM();
  4634. MKL_INT k;
  4635. for (k=0; k<k0; k+=k_in_ker) {
  4636. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+0, k + 0));
  4637. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4638. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4639. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+1,k + 0));
  4640. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4641. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+2,k + 0));
  4642. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4643. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+3,k + 0));
  4644. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp6, ymm_temp);
  4645. }
  4646. if (krem) {
  4647. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4648. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4649. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4650. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+1,k + 0), k_mask);
  4651. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4652. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+2,k + 0), k_mask);
  4653. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4654. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+3,k + 0), k_mask);
  4655. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp6, ymm_temp);
  4656. }
  4657. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp2);
  4658. ymm_temp21 = MKL_DC_HADD_YMM(ymm_temp4, ymm_temp6);
  4659. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x21);
  4660. ymm_temp23 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x30);
  4661. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp22, ymm_temp23);
  4662. #if !defined(MKL_DC_BETA_ZERO)
  4663. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j));
  4664. #if !defined(MKL_DC_BETA_ONE)
  4665. ymm_c0 = MKL_DC_MUL_YMM(ymm_beta, ymm_c0);
  4666. #endif
  4667. #if defined(MKL_DC_ALPHA_ONE)
  4668. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  4669. #else
  4670. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  4671. #endif
  4672. #else
  4673. #if !defined(MKL_DC_ALPHA_ONE)
  4674. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  4675. #else
  4676. ymm_c0 = ymm_temp0;
  4677. #endif
  4678. #endif
  4679. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j), ymm_c0);
  4680. }
  4681. if ((m-i) == 3) {
  4682. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4683. ymm_temp2 = MKL_DC_SETZERO_YMM();
  4684. ymm_temp4 = MKL_DC_SETZERO_YMM();
  4685. ymm_temp6 = MKL_DC_SETZERO_YMM();
  4686. __m256i m_mask = _mm256_set_epi64x(0, MASKBIT, MASKBIT, MASKBIT);
  4687. MKL_INT k;
  4688. for (k=0; k<k0; k+=k_in_ker) {
  4689. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+0, k + 0));
  4690. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4691. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4692. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+1,k + 0));
  4693. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4694. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+2,k + 0));
  4695. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4696. }
  4697. if (krem) {
  4698. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4699. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4700. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4701. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+1,k + 0), k_mask);
  4702. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4703. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+2,k + 0), k_mask);
  4704. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp4, ymm_temp);
  4705. }
  4706. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp2);
  4707. ymm_temp21 = MKL_DC_HADD_YMM(ymm_temp4, ymm_temp6);
  4708. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x21);
  4709. ymm_temp23 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp21, 0x30);
  4710. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp22, ymm_temp23);
  4711. #if !defined(MKL_DC_BETA_ZERO)
  4712. ymm_c0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_CC(i,j), m_mask);
  4713. #if !defined(MKL_DC_BETA_ONE)
  4714. ymm_c0 = MKL_DC_MUL_YMM(ymm_beta, ymm_c0);
  4715. #endif
  4716. #if defined(MKL_DC_ALPHA_ONE)
  4717. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  4718. #else
  4719. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  4720. #endif
  4721. #else
  4722. #if !defined(MKL_DC_ALPHA_ONE)
  4723. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  4724. #else
  4725. ymm_c0 = ymm_temp0;
  4726. #endif
  4727. #endif
  4728. MKL_DC_MASKSTORE_YMM(&MKL_DC_CC(i,j), m_mask, ymm_c0);
  4729. } else if ((m-i) == 2) {
  4730. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4731. ymm_temp2 = MKL_DC_SETZERO_YMM();
  4732. MKL_INT k;
  4733. for (k=0; k<k0; k+=k_in_ker) {
  4734. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+0, k + 0));
  4735. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4736. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4737. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+1,k + 0));
  4738. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4739. }
  4740. if (krem) {
  4741. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4742. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4743. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4744. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+1,k + 0), k_mask);
  4745. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp2, ymm_temp);
  4746. }
  4747. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp2);
  4748. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp20, 0x01);
  4749. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp20, ymm_temp22);
  4750. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  4751. #if !defined(MKL_DC_BETA_ZERO)
  4752. xmm_c1 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+0));
  4753. #if !defined(MKL_DC_BETA_ONE)
  4754. xmm_c1 = MKL_DC_MUL_XMM(xmm_beta, xmm_c1);
  4755. #endif
  4756. #if defined(MKL_DC_ALPHA_ONE)
  4757. xmm_c1 = MKL_DC_ADD_XMM(xmm_temp0, xmm_c1);
  4758. #else
  4759. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c1, xmm_temp);
  4760. #endif
  4761. #else
  4762. #if !defined(MKL_DC_ALPHA_ONE)
  4763. xmm_c1 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  4764. #else
  4765. xmm_c1 = xmm_temp0;
  4766. #endif
  4767. #endif
  4768. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+0), xmm_c1);
  4769. } else if ((m-i) == 1) {
  4770. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4771. MKL_INT k;
  4772. for (k=0; k<k0; k+=k_in_ker) {
  4773. ymm_a = MKL_DC_LOAD_YMM(&MKL_DC_AA(i+0, k + 0));
  4774. ymm_b0 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k + 0, j+0));
  4775. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4776. }
  4777. if (krem) {
  4778. ymm_a = MKL_DC_MASKLOAD_YMM(&MKL_DC_AA(i+0, k + 0), k_mask);
  4779. ymm_b0 = MKL_DC_MASKLOAD_YMM(&MKL_DC_BB(k + 0, j+0), k_mask);
  4780. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b0, ymm_temp0, ymm_temp);
  4781. }
  4782. ymm_temp20 = MKL_DC_HADD_YMM(ymm_temp0, ymm_temp0);
  4783. ymm_temp22 = MKL_DC_PERM2F128_YMM(ymm_temp20, ymm_temp20, 0x01);
  4784. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp20, ymm_temp22);
  4785. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  4786. #if !defined(MKL_DC_BETA_ZERO)
  4787. xmm_c1 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  4788. #if !defined(MKL_DC_BETA_ONE)
  4789. xmm_c1 = MKL_DC_MUL_XMM(xmm_beta, xmm_c1);
  4790. #endif
  4791. #if defined(MKL_DC_ALPHA_ONE)
  4792. xmm_c1 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c1);
  4793. #else
  4794. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c1, xmm_temp);
  4795. #endif
  4796. #else
  4797. #if !defined(MKL_DC_ALPHA_ONE)
  4798. xmm_c1 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  4799. #else
  4800. xmm_c1 = xmm_temp0;
  4801. #endif
  4802. #endif
  4803. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0), xmm_c1);
  4804. }
  4805. }
  4806. }
  4807. static __inline void MKL_DC_FNAME_GEMM_KERNEL(dgemm_tt_mnk)
  4808. (MKL_INT m, MKL_INT n, MKL_INT kK,
  4809. const mkl_dc_type * ALPHA,
  4810. const mkl_dc_type * A, MKL_INT lda,
  4811. const mkl_dc_type * B, MKL_INT ldb,
  4812. const mkl_dc_type * BETA,
  4813. mkl_dc_type * C, MKL_INT ldc)
  4814. {
  4815. #undef MKL_DC_AA
  4816. #undef MKL_DC_BB
  4817. #undef MKL_DC_CC
  4818. #define MKL_DC_AA(i,j) ((A)[(j)+lda*(i)])
  4819. #define MKL_DC_BB(i,j) ((B)[(j)+ldb*(i)])
  4820. #define MKL_DC_CC(i,j) ((C)[(i)+ldc*(j)])
  4821. const MKL_INT m_in_ker = 4;
  4822. const MKL_INT n_in_ker = 8;
  4823. const MKL_INT k_in_ker = 4;
  4824. MKL_INT m0 = (m/m_in_ker)*m_in_ker;
  4825. MKL_INT n0 = (n/n_in_ker)*n_in_ker;
  4826. MKL_INT k0 = (kK/k_in_ker)*k_in_ker;
  4827. MKL_INT krem = kK - k0;
  4828. MKL_DC_YMMTYPE ymm_temp;
  4829. MKL_DC_YMMTYPE ymm_temp0, ymm_temp1;
  4830. MKL_DC_YMMTYPE ymm_temp2, ymm_temp3;
  4831. MKL_DC_YMMTYPE ymm_temp4, ymm_temp5;
  4832. MKL_DC_YMMTYPE ymm_temp6, ymm_temp7;
  4833. MKL_DC_YMMTYPE ymm_c0, ymm_c1;
  4834. MKL_DC_YMMTYPE ymm_c2, ymm_c3;
  4835. MKL_DC_YMMTYPE ymm_c4, ymm_c5;
  4836. MKL_DC_YMMTYPE ymm_c6, ymm_c7;
  4837. MKL_DC_YMMTYPE ymm_a, ymm_b1, ymm_b2;
  4838. MKL_DC_YMMTYPE ymm_alpha;
  4839. MKL_DC_XMMTYPE xmm_a, xmm_b1;
  4840. MKL_DC_XMMTYPE xmm_temp0, xmm_temp1, xmm_temp2, xmm_temp3;
  4841. MKL_DC_XMMTYPE xmm_temp4, xmm_temp5, xmm_temp6, xmm_temp7;
  4842. MKL_DC_XMMTYPE xmm_temp;
  4843. MKL_DC_XMMTYPE xmm_c0, xmm_c1, xmm_c2, xmm_c3;
  4844. MKL_DC_XMMTYPE xmm_c4, xmm_c5, xmm_c6, xmm_c7;
  4845. MKL_DC_XMMTYPE xmm_alpha;
  4846. #if !defined(MKL_DC_ALPHA_ZERO) && !defined(MKL_DC_ALPHA_ONE)
  4847. ymm_alpha = MKL_DC_BCAST_YMM(ALPHA);
  4848. xmm_alpha = MKL_DC_CAST_YMM_TO_XMM(ymm_alpha);
  4849. #endif
  4850. #if !defined(MKL_DC_BETA_ZERO) && !defined(MKL_DC_BETA_ONE)
  4851. MKL_DC_YMMTYPE ymm_beta = MKL_DC_BCAST_YMM(BETA);
  4852. MKL_DC_XMMTYPE xmm_beta = MKL_DC_CAST_YMM_TO_XMM(ymm_beta);
  4853. #endif
  4854. MKL_INT j;
  4855. for (j=0; j<n0; j+=n_in_ker) {
  4856. MKL_INT i;
  4857. for (i=0; i<m0; i+=m_in_ker) {
  4858. ymm_temp0 = MKL_DC_SETZERO_YMM();
  4859. ymm_temp1 = MKL_DC_SETZERO_YMM();
  4860. ymm_temp2 = MKL_DC_SETZERO_YMM();
  4861. ymm_temp3 = MKL_DC_SETZERO_YMM();
  4862. ymm_temp4 = MKL_DC_SETZERO_YMM();
  4863. ymm_temp5 = MKL_DC_SETZERO_YMM();
  4864. ymm_temp6 = MKL_DC_SETZERO_YMM();
  4865. ymm_temp7 = MKL_DC_SETZERO_YMM();
  4866. MKL_INT k;
  4867. for (k=0; k<k0; k+=k_in_ker) {
  4868. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  4869. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  4870. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  4871. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  4872. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  4873. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  4874. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  4875. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  4876. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+0));
  4877. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  4878. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp5, ymm_temp);
  4879. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+0));
  4880. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  4881. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp7, ymm_temp);
  4882. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  4883. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  4884. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  4885. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j+4));
  4886. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  4887. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  4888. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  4889. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  4890. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+1));
  4891. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  4892. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp5, ymm_temp);
  4893. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+1));
  4894. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  4895. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp7, ymm_temp);
  4896. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j));
  4897. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+2));
  4898. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  4899. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j+4));
  4900. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  4901. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+2));
  4902. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  4903. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  4904. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+2));
  4905. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  4906. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp5, ymm_temp);
  4907. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+2));
  4908. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  4909. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp7, ymm_temp);
  4910. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j));
  4911. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+3));
  4912. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  4913. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j+4));
  4914. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  4915. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+3));
  4916. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  4917. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  4918. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+3));
  4919. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  4920. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp5, ymm_temp);
  4921. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+3));
  4922. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  4923. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp7, ymm_temp);
  4924. }
  4925. if ((kK-k) & 2) {
  4926. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  4927. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  4928. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  4929. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  4930. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  4931. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  4932. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  4933. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  4934. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+0));
  4935. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  4936. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp5, ymm_temp);
  4937. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+0));
  4938. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  4939. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp7, ymm_temp);
  4940. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  4941. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  4942. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  4943. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j+4));
  4944. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  4945. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  4946. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  4947. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  4948. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+1));
  4949. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  4950. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp5, ymm_temp);
  4951. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+1));
  4952. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  4953. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp7, ymm_temp);
  4954. k+=2;
  4955. }
  4956. if (kK-k) {
  4957. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  4958. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  4959. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  4960. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  4961. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  4962. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  4963. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  4964. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  4965. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+0));
  4966. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  4967. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp5, ymm_temp);
  4968. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+0));
  4969. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  4970. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp7, ymm_temp);
  4971. }
  4972. MKL_DC_VEC_TRANSPOSE_YMM(ymm_temp0, ymm_temp2, ymm_temp4, ymm_temp6, ymm_c0, ymm_c2, ymm_c4, ymm_c6);
  4973. #if !defined(MKL_DC_BETA_ZERO)
  4974. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+0));
  4975. #if !defined(MKL_DC_BETA_ONE)
  4976. ymm_c0 = MKL_DC_MUL_YMM(ymm_beta, ymm_c0);
  4977. #endif
  4978. #if defined(MKL_DC_ALPHA_ONE)
  4979. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  4980. #else
  4981. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  4982. #endif
  4983. #else
  4984. #if !defined(MKL_DC_ALPHA_ONE)
  4985. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  4986. #else
  4987. ymm_c0 = ymm_temp0;
  4988. #endif
  4989. #endif
  4990. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+0), ymm_c0);
  4991. #if !defined(MKL_DC_BETA_ZERO)
  4992. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  4993. #if !defined(MKL_DC_BETA_ONE)
  4994. ymm_c2 = MKL_DC_MUL_YMM(ymm_beta, ymm_c2);
  4995. #endif
  4996. #if defined(MKL_DC_ALPHA_ONE)
  4997. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  4998. #else
  4999. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  5000. #endif
  5001. #else
  5002. #if !defined(MKL_DC_ALPHA_ONE)
  5003. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  5004. #else
  5005. ymm_c2 = ymm_temp2;
  5006. #endif
  5007. #endif
  5008. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  5009. #if !defined(MKL_DC_BETA_ZERO)
  5010. ymm_c4 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  5011. #if !defined(MKL_DC_BETA_ONE)
  5012. ymm_c4 = MKL_DC_MUL_YMM(ymm_beta, ymm_c4);
  5013. #endif
  5014. #if defined(MKL_DC_ALPHA_ONE)
  5015. ymm_c4 = MKL_DC_ADD_YMM(ymm_temp4, ymm_c4);
  5016. #else
  5017. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp4, ymm_c4, ymm_temp);
  5018. #endif
  5019. #else
  5020. #if !defined(MKL_DC_ALPHA_ONE)
  5021. ymm_c4 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp4);
  5022. #else
  5023. ymm_c4 = ymm_temp4;
  5024. #endif
  5025. #endif
  5026. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c4);
  5027. #if !defined(MKL_DC_BETA_ZERO)
  5028. ymm_c6 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+3));
  5029. #if !defined(MKL_DC_BETA_ONE)
  5030. ymm_c6 = MKL_DC_MUL_YMM(ymm_beta, ymm_c6);
  5031. #endif
  5032. #if defined(MKL_DC_ALPHA_ONE)
  5033. ymm_c6 = MKL_DC_ADD_YMM(ymm_temp6, ymm_c6);
  5034. #else
  5035. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp6, ymm_c6, ymm_temp);
  5036. #endif
  5037. #else
  5038. #if !defined(MKL_DC_ALPHA_ONE)
  5039. ymm_c6 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp6);
  5040. #else
  5041. ymm_c6 = ymm_temp6;
  5042. #endif
  5043. #endif
  5044. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+3), ymm_c6);
  5045. MKL_DC_VEC_TRANSPOSE_YMM(ymm_temp1, ymm_temp3, ymm_temp5, ymm_temp7, ymm_c1, ymm_c3, ymm_c5, ymm_c7);
  5046. #if !defined(MKL_DC_BETA_ZERO)
  5047. ymm_c1 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+4));
  5048. #if !defined(MKL_DC_BETA_ONE)
  5049. ymm_c1 = MKL_DC_MUL_YMM(ymm_beta, ymm_c1);
  5050. #endif
  5051. #if defined(MKL_DC_ALPHA_ONE)
  5052. ymm_c1 = MKL_DC_ADD_YMM(ymm_temp1, ymm_c1);
  5053. #else
  5054. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp1, ymm_c1, ymm_temp);
  5055. #endif
  5056. #else
  5057. #if !defined(MKL_DC_ALPHA_ONE)
  5058. ymm_c1 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp1);
  5059. #else
  5060. ymm_c1 = ymm_temp1;
  5061. #endif
  5062. #endif
  5063. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+4), ymm_c1);
  5064. #if !defined(MKL_DC_BETA_ZERO)
  5065. ymm_c3 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+5));
  5066. #if !defined(MKL_DC_BETA_ONE)
  5067. ymm_c3 = MKL_DC_MUL_YMM(ymm_beta, ymm_c3);
  5068. #endif
  5069. #if defined(MKL_DC_ALPHA_ONE)
  5070. ymm_c3 = MKL_DC_ADD_YMM(ymm_temp3, ymm_c3);
  5071. #else
  5072. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp3, ymm_c3, ymm_temp);
  5073. #endif
  5074. #else
  5075. #if !defined(MKL_DC_ALPHA_ONE)
  5076. ymm_c3 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp3);
  5077. #else
  5078. ymm_c3 = ymm_temp3;
  5079. #endif
  5080. #endif
  5081. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+5), ymm_c3);
  5082. #if !defined(MKL_DC_BETA_ZERO)
  5083. ymm_c5 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+6));
  5084. #if !defined(MKL_DC_BETA_ONE)
  5085. ymm_c5 = MKL_DC_MUL_YMM(ymm_beta, ymm_c5);
  5086. #endif
  5087. #if defined(MKL_DC_ALPHA_ONE)
  5088. ymm_c5 = MKL_DC_ADD_YMM(ymm_temp5, ymm_c5);
  5089. #else
  5090. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp5, ymm_c5, ymm_temp);
  5091. #endif
  5092. #else
  5093. #if !defined(MKL_DC_ALPHA_ONE)
  5094. ymm_c5 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp5);
  5095. #else
  5096. ymm_c5 = ymm_temp5;
  5097. #endif
  5098. #endif
  5099. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+6), ymm_c5);
  5100. #if !defined(MKL_DC_BETA_ZERO)
  5101. ymm_c7 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+7));
  5102. #if !defined(MKL_DC_BETA_ONE)
  5103. ymm_c7 = MKL_DC_MUL_YMM(ymm_beta, ymm_c7);
  5104. #endif
  5105. #if defined(MKL_DC_ALPHA_ONE)
  5106. ymm_c7 = MKL_DC_ADD_YMM(ymm_temp7, ymm_c7);
  5107. #else
  5108. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp7, ymm_c7, ymm_temp);
  5109. #endif
  5110. #else
  5111. #if !defined(MKL_DC_ALPHA_ONE)
  5112. ymm_c7 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp7);
  5113. #else
  5114. ymm_c7 = ymm_temp7;
  5115. #endif
  5116. #endif
  5117. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+7), ymm_c7);
  5118. }
  5119. if ((m-i) & 2) {
  5120. ymm_temp0 = MKL_DC_SETZERO_YMM();
  5121. ymm_temp1 = MKL_DC_SETZERO_YMM();
  5122. ymm_temp2 = MKL_DC_SETZERO_YMM();
  5123. ymm_temp3 = MKL_DC_SETZERO_YMM();
  5124. MKL_INT k;
  5125. for (k=0; k<k0; k+=k_in_ker) {
  5126. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5127. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5128. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5129. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  5130. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5131. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5132. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5133. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  5134. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5135. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5136. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5137. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j+4));
  5138. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5139. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  5140. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5141. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  5142. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j));
  5143. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+2));
  5144. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5145. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j+4));
  5146. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5147. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+2));
  5148. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5149. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  5150. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j));
  5151. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+3));
  5152. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5153. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j+4));
  5154. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5155. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+3));
  5156. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5157. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  5158. }
  5159. if ((kK-k) & 2) {
  5160. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5161. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5162. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5163. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  5164. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5165. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5166. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5167. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  5168. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5169. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5170. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5171. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j+4));
  5172. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5173. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  5174. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5175. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  5176. k+=2;
  5177. }
  5178. if (kK-k) {
  5179. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5180. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5181. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5182. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  5183. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5184. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5185. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5186. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp3, ymm_temp);
  5187. }
  5188. ymm_temp4 = MKL_DC_UNPACKLO_YMM(ymm_temp0, ymm_temp2);
  5189. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp4);
  5190. #if !defined(MKL_DC_BETA_ZERO)
  5191. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+0));
  5192. #if !defined(MKL_DC_BETA_ONE)
  5193. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5194. #endif
  5195. #if defined(MKL_DC_ALPHA_ONE)
  5196. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp4, xmm_c0);
  5197. #else
  5198. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5199. #endif
  5200. #else
  5201. #if !defined(MKL_DC_ALPHA_ONE)
  5202. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp4);
  5203. #else
  5204. xmm_c0 = xmm_temp4;
  5205. #endif
  5206. #endif
  5207. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+0),xmm_c0);
  5208. ymm_temp5 = MKL_DC_UNPACKHI_YMM(ymm_temp0, ymm_temp2);
  5209. xmm_temp5 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp5);
  5210. #if !defined(MKL_DC_BETA_ZERO)
  5211. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  5212. #if !defined(MKL_DC_BETA_ONE)
  5213. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  5214. #endif
  5215. #if defined(MKL_DC_ALPHA_ONE)
  5216. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c2);
  5217. #else
  5218. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c2, xmm_temp);
  5219. #endif
  5220. #else
  5221. #if !defined(MKL_DC_ALPHA_ONE)
  5222. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  5223. #else
  5224. xmm_c2 = xmm_temp5;
  5225. #endif
  5226. #endif
  5227. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1),xmm_c2);
  5228. ymm_temp0 = MKL_DC_PERM2F128_YMM(ymm_temp4, ymm_temp4, 0x11);
  5229. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  5230. #if !defined(MKL_DC_BETA_ZERO)
  5231. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+2));
  5232. #if !defined(MKL_DC_BETA_ONE)
  5233. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5234. #endif
  5235. #if defined(MKL_DC_ALPHA_ONE)
  5236. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_c0);
  5237. #else
  5238. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c0, xmm_temp);
  5239. #endif
  5240. #else
  5241. #if !defined(MKL_DC_ALPHA_ONE)
  5242. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  5243. #else
  5244. xmm_c0 = xmm_temp0;
  5245. #endif
  5246. #endif
  5247. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+2),xmm_c0);
  5248. ymm_temp0 = MKL_DC_PERM2F128_YMM(ymm_temp5, ymm_temp5, 0x11);
  5249. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  5250. #if !defined(MKL_DC_BETA_ZERO)
  5251. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+3));
  5252. #if !defined(MKL_DC_BETA_ONE)
  5253. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  5254. #endif
  5255. #if defined(MKL_DC_ALPHA_ONE)
  5256. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp0, xmm_c2);
  5257. #else
  5258. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c2, xmm_temp);
  5259. #endif
  5260. #else
  5261. #if !defined(MKL_DC_ALPHA_ONE)
  5262. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  5263. #else
  5264. xmm_c2 = xmm_temp0;
  5265. #endif
  5266. #endif
  5267. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+3),xmm_c2);
  5268. ymm_temp4 = MKL_DC_UNPACKLO_YMM(ymm_temp1, ymm_temp3);
  5269. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp4);
  5270. #if !defined(MKL_DC_BETA_ZERO)
  5271. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+4));
  5272. #if !defined(MKL_DC_BETA_ONE)
  5273. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5274. #endif
  5275. #if defined(MKL_DC_ALPHA_ONE)
  5276. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp4, xmm_c0);
  5277. #else
  5278. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5279. #endif
  5280. #else
  5281. #if !defined(MKL_DC_ALPHA_ONE)
  5282. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp4);
  5283. #else
  5284. xmm_c0 = xmm_temp4;
  5285. #endif
  5286. #endif
  5287. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+4),xmm_c0);
  5288. ymm_temp5 = MKL_DC_UNPACKHI_YMM(ymm_temp1, ymm_temp3);
  5289. xmm_temp5 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp5);
  5290. #if !defined(MKL_DC_BETA_ZERO)
  5291. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+5));
  5292. #if !defined(MKL_DC_BETA_ONE)
  5293. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  5294. #endif
  5295. #if defined(MKL_DC_ALPHA_ONE)
  5296. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c2);
  5297. #else
  5298. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c2, xmm_temp);
  5299. #endif
  5300. #else
  5301. #if !defined(MKL_DC_ALPHA_ONE)
  5302. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  5303. #else
  5304. xmm_c2 = xmm_temp5;
  5305. #endif
  5306. #endif
  5307. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+5),xmm_c2);
  5308. ymm_temp0 = MKL_DC_PERM2F128_YMM(ymm_temp4, ymm_temp4, 0x11);
  5309. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  5310. #if !defined(MKL_DC_BETA_ZERO)
  5311. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+6));
  5312. #if !defined(MKL_DC_BETA_ONE)
  5313. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5314. #endif
  5315. #if defined(MKL_DC_ALPHA_ONE)
  5316. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_c0);
  5317. #else
  5318. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c0, xmm_temp);
  5319. #endif
  5320. #else
  5321. #if !defined(MKL_DC_ALPHA_ONE)
  5322. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  5323. #else
  5324. xmm_c0 = xmm_temp0;
  5325. #endif
  5326. #endif
  5327. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+6),xmm_c0);
  5328. ymm_temp1 = MKL_DC_PERM2F128_YMM(ymm_temp5, ymm_temp5, 0x11);
  5329. xmm_temp1 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp1);
  5330. #if !defined(MKL_DC_BETA_ZERO)
  5331. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+7));
  5332. #if !defined(MKL_DC_BETA_ONE)
  5333. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  5334. #endif
  5335. #if defined(MKL_DC_ALPHA_ONE)
  5336. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp1, xmm_c2);
  5337. #else
  5338. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp1, xmm_c2, xmm_temp);
  5339. #endif
  5340. #else
  5341. #if !defined(MKL_DC_ALPHA_ONE)
  5342. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp1);
  5343. #else
  5344. xmm_c2 = xmm_temp1;
  5345. #endif
  5346. #endif
  5347. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+7),xmm_c2);
  5348. i+=2;
  5349. }
  5350. if ((m-i) & 1) {
  5351. ymm_temp0 = MKL_DC_SETZERO_YMM();
  5352. ymm_temp1 = MKL_DC_SETZERO_YMM();
  5353. MKL_INT k;
  5354. for (k=0; k<k0; k+=k_in_ker) {
  5355. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5356. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5357. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5358. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  5359. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5360. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5361. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5362. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5363. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j+4));
  5364. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5365. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j));
  5366. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+2));
  5367. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5368. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j+4));
  5369. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5370. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j));
  5371. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+3));
  5372. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5373. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j+4));
  5374. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5375. }
  5376. if ((kK-k) & 2) {
  5377. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5378. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5379. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5380. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  5381. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5382. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5383. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5384. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5385. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j+4));
  5386. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5387. k+=2;
  5388. }
  5389. if (kK-k) {
  5390. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5391. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5392. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5393. ymm_b2 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j+4));
  5394. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b2, ymm_temp1, ymm_temp);
  5395. }
  5396. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  5397. #if !defined(MKL_DC_BETA_ZERO)
  5398. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  5399. #if !defined(MKL_DC_BETA_ONE)
  5400. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5401. #endif
  5402. #if defined(MKL_DC_ALPHA_ONE)
  5403. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5404. #else
  5405. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5406. #endif
  5407. #else
  5408. #if !defined(MKL_DC_ALPHA_ONE)
  5409. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5410. #else
  5411. xmm_c0 = xmm_temp4;
  5412. #endif
  5413. #endif
  5414. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0),xmm_c0);
  5415. xmm_temp4 = MKL_DC_UNPACKHI_XMM(xmm_temp4, xmm_temp4);
  5416. #if !defined(MKL_DC_BETA_ZERO)
  5417. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  5418. #if !defined(MKL_DC_BETA_ONE)
  5419. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5420. #endif
  5421. #if defined(MKL_DC_ALPHA_ONE)
  5422. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5423. #else
  5424. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5425. #endif
  5426. #else
  5427. #if !defined(MKL_DC_ALPHA_ONE)
  5428. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5429. #else
  5430. xmm_c0 = xmm_temp4;
  5431. #endif
  5432. #endif
  5433. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1),xmm_c0);
  5434. ymm_temp4 = MKL_DC_PERM2F128_YMM(ymm_temp0, ymm_temp0, 0x11);
  5435. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp4);
  5436. #if !defined(MKL_DC_BETA_ZERO)
  5437. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+2));
  5438. #if !defined(MKL_DC_BETA_ONE)
  5439. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5440. #endif
  5441. #if defined(MKL_DC_ALPHA_ONE)
  5442. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5443. #else
  5444. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5445. #endif
  5446. #else
  5447. #if !defined(MKL_DC_ALPHA_ONE)
  5448. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5449. #else
  5450. xmm_c0 = xmm_temp4;
  5451. #endif
  5452. #endif
  5453. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+2),xmm_c0);
  5454. xmm_temp4 = MKL_DC_UNPACKHI_XMM(xmm_temp4, xmm_temp4);
  5455. #if !defined(MKL_DC_BETA_ZERO)
  5456. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+3));
  5457. #if !defined(MKL_DC_BETA_ONE)
  5458. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5459. #endif
  5460. #if defined(MKL_DC_ALPHA_ONE)
  5461. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5462. #else
  5463. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5464. #endif
  5465. #else
  5466. #if !defined(MKL_DC_ALPHA_ONE)
  5467. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5468. #else
  5469. xmm_c0 = xmm_temp4;
  5470. #endif
  5471. #endif
  5472. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+3),xmm_c0);
  5473. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp1);
  5474. #if !defined(MKL_DC_BETA_ZERO)
  5475. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+4));
  5476. #if !defined(MKL_DC_BETA_ONE)
  5477. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5478. #endif
  5479. #if defined(MKL_DC_ALPHA_ONE)
  5480. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5481. #else
  5482. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5483. #endif
  5484. #else
  5485. #if !defined(MKL_DC_ALPHA_ONE)
  5486. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5487. #else
  5488. xmm_c0 = xmm_temp4;
  5489. #endif
  5490. #endif
  5491. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+4),xmm_c0);
  5492. xmm_temp4 = MKL_DC_UNPACKHI_XMM(xmm_temp4, xmm_temp4);
  5493. #if !defined(MKL_DC_BETA_ZERO)
  5494. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+5));
  5495. #if !defined(MKL_DC_BETA_ONE)
  5496. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5497. #endif
  5498. #if defined(MKL_DC_ALPHA_ONE)
  5499. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5500. #else
  5501. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5502. #endif
  5503. #else
  5504. #if !defined(MKL_DC_ALPHA_ONE)
  5505. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5506. #else
  5507. xmm_c0 = xmm_temp4;
  5508. #endif
  5509. #endif
  5510. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+5),xmm_c0);
  5511. ymm_temp4 = MKL_DC_PERM2F128_YMM(ymm_temp1, ymm_temp1, 0x11);
  5512. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp4);
  5513. #if !defined(MKL_DC_BETA_ZERO)
  5514. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+6));
  5515. #if !defined(MKL_DC_BETA_ONE)
  5516. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5517. #endif
  5518. #if defined(MKL_DC_ALPHA_ONE)
  5519. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5520. #else
  5521. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5522. #endif
  5523. #else
  5524. #if !defined(MKL_DC_ALPHA_ONE)
  5525. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5526. #else
  5527. xmm_c0 = xmm_temp4;
  5528. #endif
  5529. #endif
  5530. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+6),xmm_c0);
  5531. xmm_temp4 = MKL_DC_UNPACKHI_XMM(xmm_temp4, xmm_temp4);
  5532. #if !defined(MKL_DC_BETA_ZERO)
  5533. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+7));
  5534. #if !defined(MKL_DC_BETA_ONE)
  5535. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5536. #endif
  5537. #if defined(MKL_DC_ALPHA_ONE)
  5538. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5539. #else
  5540. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5541. #endif
  5542. #else
  5543. #if !defined(MKL_DC_ALPHA_ONE)
  5544. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5545. #else
  5546. xmm_c0 = xmm_temp4;
  5547. #endif
  5548. #endif
  5549. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+7),xmm_c0);
  5550. }
  5551. }
  5552. if ((n-j) & 4) {
  5553. MKL_INT i;
  5554. for (i=0; i<m0; i+=m_in_ker) {
  5555. ymm_temp0 = MKL_DC_SETZERO_YMM();
  5556. ymm_temp1 = MKL_DC_SETZERO_YMM();
  5557. ymm_temp2 = MKL_DC_SETZERO_YMM();
  5558. ymm_temp3 = MKL_DC_SETZERO_YMM();
  5559. ymm_temp4 = MKL_DC_SETZERO_YMM();
  5560. ymm_temp5 = MKL_DC_SETZERO_YMM();
  5561. ymm_temp6 = MKL_DC_SETZERO_YMM();
  5562. ymm_temp7 = MKL_DC_SETZERO_YMM();
  5563. MKL_INT k;
  5564. for (k=0; k<k0; k+=k_in_ker) {
  5565. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5566. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5567. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5568. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5569. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5570. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+0));
  5571. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  5572. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+0));
  5573. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  5574. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5575. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5576. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5577. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  5578. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  5579. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+1));
  5580. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp5, ymm_temp);
  5581. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+1));
  5582. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp7, ymm_temp);
  5583. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j));
  5584. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+2));
  5585. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5586. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+2));
  5587. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5588. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+2));
  5589. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  5590. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+2));
  5591. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  5592. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j));
  5593. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+3));
  5594. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5595. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+3));
  5596. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  5597. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+3));
  5598. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp5, ymm_temp);
  5599. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+3));
  5600. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp7, ymm_temp);
  5601. }
  5602. if ((kK-k) & 2) {
  5603. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5604. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5605. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5606. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5607. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5608. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+0));
  5609. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  5610. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+0));
  5611. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  5612. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5613. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5614. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5615. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  5616. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  5617. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+1));
  5618. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp5, ymm_temp);
  5619. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+1));
  5620. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp7, ymm_temp);
  5621. k+=2;
  5622. }
  5623. if (kK>=2) {
  5624. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp1);
  5625. ymm_temp2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_temp3);
  5626. ymm_temp4 = MKL_DC_ADD_YMM(ymm_temp4, ymm_temp5);
  5627. ymm_temp6 = MKL_DC_ADD_YMM(ymm_temp6, ymm_temp7);
  5628. }
  5629. if (kK-k) {
  5630. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5631. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5632. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5633. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5634. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5635. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+2, k+0));
  5636. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp4, ymm_temp);
  5637. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+3, k+0));
  5638. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp6, ymm_temp);
  5639. }
  5640. MKL_DC_VEC_TRANSPOSE_YMM(ymm_temp0, ymm_temp2, ymm_temp4, ymm_temp6, ymm_c0, ymm_c2, ymm_c4, ymm_c6);
  5641. #if !defined(MKL_DC_BETA_ZERO)
  5642. ymm_c0 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+0));
  5643. #if !defined(MKL_DC_BETA_ONE)
  5644. ymm_c0 = MKL_DC_MUL_YMM(ymm_beta, ymm_c0);
  5645. #endif
  5646. #if defined(MKL_DC_ALPHA_ONE)
  5647. ymm_c0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_c0);
  5648. #else
  5649. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp0, ymm_c0, ymm_temp);
  5650. #endif
  5651. #else
  5652. #if !defined(MKL_DC_ALPHA_ONE)
  5653. ymm_c0 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp0);
  5654. #else
  5655. ymm_c0 = ymm_temp0;
  5656. #endif
  5657. #endif
  5658. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+0), ymm_c0);
  5659. #if !defined(MKL_DC_BETA_ZERO)
  5660. ymm_c2 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+1));
  5661. #if !defined(MKL_DC_BETA_ONE)
  5662. ymm_c2 = MKL_DC_MUL_YMM(ymm_beta, ymm_c2);
  5663. #endif
  5664. #if defined(MKL_DC_ALPHA_ONE)
  5665. ymm_c2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_c2);
  5666. #else
  5667. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp2, ymm_c2, ymm_temp);
  5668. #endif
  5669. #else
  5670. #if !defined(MKL_DC_ALPHA_ONE)
  5671. ymm_c2 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp2);
  5672. #else
  5673. ymm_c2 = ymm_temp2;
  5674. #endif
  5675. #endif
  5676. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+1), ymm_c2);
  5677. #if !defined(MKL_DC_BETA_ZERO)
  5678. ymm_c4 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+2));
  5679. #if !defined(MKL_DC_BETA_ONE)
  5680. ymm_c4 = MKL_DC_MUL_YMM(ymm_beta, ymm_c4);
  5681. #endif
  5682. #if defined(MKL_DC_ALPHA_ONE)
  5683. ymm_c4 = MKL_DC_ADD_YMM(ymm_temp4, ymm_c4);
  5684. #else
  5685. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp4, ymm_c4, ymm_temp);
  5686. #endif
  5687. #else
  5688. #if !defined(MKL_DC_ALPHA_ONE)
  5689. ymm_c4 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp4);
  5690. #else
  5691. ymm_c4 = ymm_temp4;
  5692. #endif
  5693. #endif
  5694. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+2), ymm_c4);
  5695. #if !defined(MKL_DC_BETA_ZERO)
  5696. ymm_c6 = MKL_DC_LOAD_YMM(&MKL_DC_CC(i,j+3));
  5697. #if !defined(MKL_DC_BETA_ONE)
  5698. ymm_c6 = MKL_DC_MUL_YMM(ymm_beta, ymm_c6);
  5699. #endif
  5700. #if defined(MKL_DC_ALPHA_ONE)
  5701. ymm_c6 = MKL_DC_ADD_YMM(ymm_temp6, ymm_c6);
  5702. #else
  5703. MKL_DC_MUL_ADD_YMM(ymm_alpha, ymm_temp6, ymm_c6, ymm_temp);
  5704. #endif
  5705. #else
  5706. #if !defined(MKL_DC_ALPHA_ONE)
  5707. ymm_c6 = MKL_DC_MUL_YMM(ymm_alpha, ymm_temp6);
  5708. #else
  5709. ymm_c6 = ymm_temp6;
  5710. #endif
  5711. #endif
  5712. MKL_DC_STORE_YMM(&MKL_DC_CC(i,j+3), ymm_c6);
  5713. }
  5714. if ((m-i) & 2) {
  5715. ymm_temp0 = MKL_DC_SETZERO_YMM();
  5716. ymm_temp1 = MKL_DC_SETZERO_YMM();
  5717. ymm_temp2 = MKL_DC_SETZERO_YMM();
  5718. ymm_temp3 = MKL_DC_SETZERO_YMM();
  5719. MKL_INT k;
  5720. for (k=0; k<k0; k+=k_in_ker) {
  5721. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5722. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5723. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5724. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5725. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5726. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5727. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5728. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5729. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  5730. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  5731. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j));
  5732. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+2));
  5733. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5734. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+2));
  5735. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5736. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j));
  5737. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+3));
  5738. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5739. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+3));
  5740. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  5741. }
  5742. if ((kK-k) & 2) {
  5743. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5744. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5745. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5746. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5747. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5748. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5749. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5750. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5751. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+1));
  5752. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp3, ymm_temp);
  5753. k+=2;
  5754. }
  5755. if (kK >= 2) {
  5756. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp1);
  5757. ymm_temp2 = MKL_DC_ADD_YMM(ymm_temp2, ymm_temp3);
  5758. }
  5759. if (kK-k) {
  5760. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5761. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5762. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5763. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i+1, k+0));
  5764. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp2, ymm_temp);
  5765. }
  5766. ymm_temp4 = MKL_DC_UNPACKLO_YMM(ymm_temp0, ymm_temp2);
  5767. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp4);
  5768. #if !defined(MKL_DC_BETA_ZERO)
  5769. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+0));
  5770. #if !defined(MKL_DC_BETA_ONE)
  5771. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5772. #endif
  5773. #if defined(MKL_DC_ALPHA_ONE)
  5774. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp4, xmm_c0);
  5775. #else
  5776. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5777. #endif
  5778. #else
  5779. #if !defined(MKL_DC_ALPHA_ONE)
  5780. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp4);
  5781. #else
  5782. xmm_c0 = xmm_temp4;
  5783. #endif
  5784. #endif
  5785. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+0),xmm_c0);
  5786. ymm_temp5 = MKL_DC_UNPACKHI_YMM(ymm_temp0, ymm_temp2);
  5787. xmm_temp5 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp5);
  5788. #if !defined(MKL_DC_BETA_ZERO)
  5789. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  5790. #if !defined(MKL_DC_BETA_ONE)
  5791. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  5792. #endif
  5793. #if defined(MKL_DC_ALPHA_ONE)
  5794. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c2);
  5795. #else
  5796. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c2, xmm_temp);
  5797. #endif
  5798. #else
  5799. #if !defined(MKL_DC_ALPHA_ONE)
  5800. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  5801. #else
  5802. xmm_c2 = xmm_temp5;
  5803. #endif
  5804. #endif
  5805. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1),xmm_c2);
  5806. ymm_temp0 = MKL_DC_PERM2F128_YMM(ymm_temp4, ymm_temp4, 0x11);
  5807. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  5808. #if !defined(MKL_DC_BETA_ZERO)
  5809. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+2));
  5810. #if !defined(MKL_DC_BETA_ONE)
  5811. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5812. #endif
  5813. #if defined(MKL_DC_ALPHA_ONE)
  5814. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_c0);
  5815. #else
  5816. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c0, xmm_temp);
  5817. #endif
  5818. #else
  5819. #if !defined(MKL_DC_ALPHA_ONE)
  5820. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  5821. #else
  5822. xmm_c0 = xmm_temp0;
  5823. #endif
  5824. #endif
  5825. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+2),xmm_c0);
  5826. ymm_temp0 = MKL_DC_PERM2F128_YMM(ymm_temp5, ymm_temp5, 0x11);
  5827. xmm_temp0 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  5828. #if !defined(MKL_DC_BETA_ZERO)
  5829. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+3));
  5830. #if !defined(MKL_DC_BETA_ONE)
  5831. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  5832. #endif
  5833. #if defined(MKL_DC_ALPHA_ONE)
  5834. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp0, xmm_c2);
  5835. #else
  5836. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp0, xmm_c2, xmm_temp);
  5837. #endif
  5838. #else
  5839. #if !defined(MKL_DC_ALPHA_ONE)
  5840. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp0);
  5841. #else
  5842. xmm_c2 = xmm_temp0;
  5843. #endif
  5844. #endif
  5845. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+3),xmm_c2);
  5846. i+=2;
  5847. }
  5848. if ((m-i) & 1) {
  5849. ymm_temp0 = MKL_DC_SETZERO_YMM();
  5850. ymm_temp1 = MKL_DC_SETZERO_YMM();
  5851. MKL_INT k;
  5852. for (k=0; k<k0; k+=k_in_ker) {
  5853. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5854. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5855. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5856. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5857. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5858. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5859. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+2, j));
  5860. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+2));
  5861. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5862. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+3, j));
  5863. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+3));
  5864. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5865. }
  5866. if ((kK-k) & 2) {
  5867. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5868. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5869. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5870. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+1, j));
  5871. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+1));
  5872. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp1, ymm_temp);
  5873. k+=2;
  5874. }
  5875. if (kK >= 2) {
  5876. ymm_temp0 = MKL_DC_ADD_YMM(ymm_temp0, ymm_temp1);
  5877. }
  5878. if (kK-k) {
  5879. ymm_b1 = MKL_DC_LOAD_YMM(&MKL_DC_BB(k+0, j));
  5880. ymm_a = MKL_DC_BCAST_YMM(&MKL_DC_AA(i, k+0));
  5881. MKL_DC_MUL_ADD_YMM(ymm_a, ymm_b1, ymm_temp0, ymm_temp);
  5882. }
  5883. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp0);
  5884. #if !defined(MKL_DC_BETA_ZERO)
  5885. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  5886. #if !defined(MKL_DC_BETA_ONE)
  5887. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  5888. #endif
  5889. #if defined(MKL_DC_ALPHA_ONE)
  5890. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5891. #else
  5892. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5893. #endif
  5894. #else
  5895. #if !defined(MKL_DC_ALPHA_ONE)
  5896. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5897. #else
  5898. xmm_c0 = xmm_temp4;
  5899. #endif
  5900. #endif
  5901. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0),xmm_c0);
  5902. xmm_temp4 = MKL_DC_UNPACKHI_XMM(xmm_temp4, xmm_temp4);
  5903. #if !defined(MKL_DC_BETA_ZERO)
  5904. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  5905. #if !defined(MKL_DC_BETA_ONE)
  5906. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5907. #endif
  5908. #if defined(MKL_DC_ALPHA_ONE)
  5909. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5910. #else
  5911. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5912. #endif
  5913. #else
  5914. #if !defined(MKL_DC_ALPHA_ONE)
  5915. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5916. #else
  5917. xmm_c0 = xmm_temp4;
  5918. #endif
  5919. #endif
  5920. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1),xmm_c0);
  5921. ymm_temp4 = MKL_DC_PERM2F128_YMM(ymm_temp0, ymm_temp0, 0x11);
  5922. xmm_temp4 = MKL_DC_CAST_YMM_TO_XMM(ymm_temp4);
  5923. #if !defined(MKL_DC_BETA_ZERO)
  5924. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+2));
  5925. #if !defined(MKL_DC_BETA_ONE)
  5926. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5927. #endif
  5928. #if defined(MKL_DC_ALPHA_ONE)
  5929. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5930. #else
  5931. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5932. #endif
  5933. #else
  5934. #if !defined(MKL_DC_ALPHA_ONE)
  5935. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5936. #else
  5937. xmm_c0 = xmm_temp4;
  5938. #endif
  5939. #endif
  5940. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+2),xmm_c0);
  5941. xmm_temp4 = MKL_DC_UNPACKHI_XMM(xmm_temp4, xmm_temp4);
  5942. #if !defined(MKL_DC_BETA_ZERO)
  5943. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+3));
  5944. #if !defined(MKL_DC_BETA_ONE)
  5945. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  5946. #endif
  5947. #if defined(MKL_DC_ALPHA_ONE)
  5948. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  5949. #else
  5950. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  5951. #endif
  5952. #else
  5953. #if !defined(MKL_DC_ALPHA_ONE)
  5954. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  5955. #else
  5956. xmm_c0 = xmm_temp4;
  5957. #endif
  5958. #endif
  5959. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+3),xmm_c0);
  5960. }
  5961. j+=4;
  5962. }
  5963. if ((n-j) & 2) {
  5964. MKL_INT i;
  5965. for (i=0; i<m0; i+=m_in_ker) {
  5966. xmm_temp0 = MKL_DC_SETZERO_XMM();
  5967. xmm_temp1 = MKL_DC_SETZERO_XMM();
  5968. xmm_temp2 = MKL_DC_SETZERO_XMM();
  5969. xmm_temp3 = MKL_DC_SETZERO_XMM();
  5970. xmm_temp4 = MKL_DC_SETZERO_XMM();
  5971. xmm_temp5 = MKL_DC_SETZERO_XMM();
  5972. xmm_temp6 = MKL_DC_SETZERO_XMM();
  5973. xmm_temp7 = MKL_DC_SETZERO_XMM();
  5974. MKL_INT k;
  5975. for (k=0; k<k0; k+=k_in_ker) {
  5976. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  5977. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  5978. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  5979. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+0));
  5980. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  5981. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+2, k+0));
  5982. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  5983. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+3, k+0));
  5984. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  5985. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+1, j));
  5986. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+1));
  5987. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  5988. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+1));
  5989. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  5990. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+2, k+1));
  5991. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp5, xmm_temp);
  5992. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+3, k+1));
  5993. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp7, xmm_temp);
  5994. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+2, j));
  5995. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+2));
  5996. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  5997. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+2));
  5998. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  5999. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+2, k+2));
  6000. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  6001. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+3, k+2));
  6002. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  6003. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+3, j));
  6004. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+3));
  6005. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6006. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+3));
  6007. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6008. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+2, k+3));
  6009. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp5, xmm_temp);
  6010. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+3, k+3));
  6011. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp7, xmm_temp);
  6012. }
  6013. if ((kK-k) & 2) {
  6014. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6015. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6016. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6017. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+0));
  6018. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6019. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+2, k+0));
  6020. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  6021. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+3, k+0));
  6022. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  6023. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+1, j));
  6024. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+1));
  6025. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6026. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+1));
  6027. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6028. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+2, k+1));
  6029. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp5, xmm_temp);
  6030. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+3, k+1));
  6031. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp7, xmm_temp);
  6032. k+=2;
  6033. }
  6034. if (kK >= 2) {
  6035. xmm_temp0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_temp1);
  6036. xmm_temp2 = MKL_DC_ADD_XMM(xmm_temp2, xmm_temp3);
  6037. xmm_temp4 = MKL_DC_ADD_XMM(xmm_temp4, xmm_temp5);
  6038. xmm_temp6 = MKL_DC_ADD_XMM(xmm_temp6, xmm_temp7);
  6039. }
  6040. if (kK-k) {
  6041. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6042. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6043. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6044. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+0));
  6045. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6046. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+2, k+0));
  6047. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  6048. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+3, k+0));
  6049. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  6050. }
  6051. MKL_DC_VEC_TRANSPOSE_XMM(xmm_temp1, xmm_temp3, xmm_temp0, xmm_temp2);
  6052. #if !defined(MKL_DC_BETA_ZERO)
  6053. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+0));
  6054. #if !defined(MKL_DC_BETA_ONE)
  6055. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  6056. #endif
  6057. #if defined(MKL_DC_ALPHA_ONE)
  6058. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp1, xmm_c0);
  6059. #else
  6060. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp1, xmm_c0, xmm_temp);
  6061. #endif
  6062. #else
  6063. #if !defined(MKL_DC_ALPHA_ONE)
  6064. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp1);
  6065. #else
  6066. xmm_c0 = xmm_temp1;
  6067. #endif
  6068. #endif
  6069. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+0), xmm_c0);
  6070. #if !defined(MKL_DC_BETA_ZERO)
  6071. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  6072. #if !defined(MKL_DC_BETA_ONE)
  6073. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  6074. #endif
  6075. #if defined(MKL_DC_ALPHA_ONE)
  6076. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c2);
  6077. #else
  6078. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c2, xmm_temp);
  6079. #endif
  6080. #else
  6081. #if !defined(MKL_DC_ALPHA_ONE)
  6082. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  6083. #else
  6084. xmm_c2 = xmm_temp3;
  6085. #endif
  6086. #endif
  6087. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1), xmm_c2);
  6088. MKL_DC_VEC_TRANSPOSE_XMM(xmm_temp1, xmm_temp3, xmm_temp4, xmm_temp6);
  6089. #if !defined(MKL_DC_BETA_ZERO)
  6090. xmm_c4 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i+2,j+0));
  6091. #if !defined(MKL_DC_BETA_ONE)
  6092. xmm_c4 = MKL_DC_MUL_XMM(xmm_beta, xmm_c4);
  6093. #endif
  6094. #if defined(MKL_DC_ALPHA_ONE)
  6095. xmm_c4 = MKL_DC_ADD_XMM(xmm_temp1, xmm_c4);
  6096. #else
  6097. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp1, xmm_c4, xmm_temp);
  6098. #endif
  6099. #else
  6100. #if !defined(MKL_DC_ALPHA_ONE)
  6101. xmm_c4 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp1);
  6102. #else
  6103. xmm_c4 = xmm_temp1;
  6104. #endif
  6105. #endif
  6106. MKL_DC_STORE_XMM(&MKL_DC_CC(i+2,j+0), xmm_c4);
  6107. #if !defined(MKL_DC_BETA_ZERO)
  6108. xmm_c6 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i+2,j+1));
  6109. #if !defined(MKL_DC_BETA_ONE)
  6110. xmm_c6 = MKL_DC_MUL_XMM(xmm_beta, xmm_c6);
  6111. #endif
  6112. #if defined(MKL_DC_ALPHA_ONE)
  6113. xmm_c6 = MKL_DC_ADD_XMM(xmm_temp3, xmm_c6);
  6114. #else
  6115. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp3, xmm_c6, xmm_temp);
  6116. #endif
  6117. #else
  6118. #if !defined(MKL_DC_ALPHA_ONE)
  6119. xmm_c6 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp3);
  6120. #else
  6121. xmm_c6 = xmm_temp3;
  6122. #endif
  6123. #endif
  6124. MKL_DC_STORE_XMM(&MKL_DC_CC(i+2,j+1), xmm_c6);
  6125. }
  6126. if ((m-i) & 2) {
  6127. xmm_temp0 = MKL_DC_SETZERO_XMM();
  6128. xmm_temp1 = MKL_DC_SETZERO_XMM();
  6129. xmm_temp2 = MKL_DC_SETZERO_XMM();
  6130. xmm_temp3 = MKL_DC_SETZERO_XMM();
  6131. MKL_INT k;
  6132. for (k=0; k<k0; k+=k_in_ker) {
  6133. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6134. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6135. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6136. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+0));
  6137. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6138. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+1, j));
  6139. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+1));
  6140. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6141. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+1));
  6142. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6143. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+2, j));
  6144. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+2));
  6145. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6146. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+2));
  6147. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6148. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+3, j));
  6149. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+3));
  6150. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6151. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+3));
  6152. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6153. }
  6154. if ((kK-k) & 2) {
  6155. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6156. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6157. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6158. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+0));
  6159. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6160. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+1, j));
  6161. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+1));
  6162. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6163. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+1));
  6164. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6165. k+=2;
  6166. }
  6167. if (kK>=2) {
  6168. xmm_temp0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_temp1);
  6169. xmm_temp2 = MKL_DC_ADD_XMM(xmm_temp2, xmm_temp3);
  6170. }
  6171. if ((kK-k)) {
  6172. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6173. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6174. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6175. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i+1, k+0));
  6176. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6177. }
  6178. xmm_temp4 = MKL_DC_UNPACKLO_XMM(xmm_temp0, xmm_temp2);
  6179. #if !defined(MKL_DC_BETA_ZERO)
  6180. xmm_c0 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+0));
  6181. #if !defined(MKL_DC_BETA_ONE)
  6182. xmm_c0 = MKL_DC_MUL_XMM(xmm_beta, xmm_c0);
  6183. #endif
  6184. #if defined(MKL_DC_ALPHA_ONE)
  6185. xmm_c0 = MKL_DC_ADD_XMM(xmm_temp4, xmm_c0);
  6186. #else
  6187. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  6188. #endif
  6189. #else
  6190. #if !defined(MKL_DC_ALPHA_ONE)
  6191. xmm_c0 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp4);
  6192. #else
  6193. xmm_c0 = xmm_temp4;
  6194. #endif
  6195. #endif
  6196. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+0),xmm_c0);
  6197. xmm_temp5 = MKL_DC_UNPACKHI_XMM(xmm_temp0, xmm_temp2);
  6198. #if !defined(MKL_DC_BETA_ZERO)
  6199. xmm_c2 = MKL_DC_LOAD_XMM(&MKL_DC_CC(i,j+1));
  6200. #if !defined(MKL_DC_BETA_ONE)
  6201. xmm_c2 = MKL_DC_MUL_XMM(xmm_beta, xmm_c2);
  6202. #endif
  6203. #if defined(MKL_DC_ALPHA_ONE)
  6204. xmm_c2 = MKL_DC_ADD_XMM(xmm_temp5, xmm_c2);
  6205. #else
  6206. MKL_DC_MUL_ADD_XMM(xmm_alpha, xmm_temp5, xmm_c2, xmm_temp);
  6207. #endif
  6208. #else
  6209. #if !defined(MKL_DC_ALPHA_ONE)
  6210. xmm_c2 = MKL_DC_MUL_XMM(xmm_alpha, xmm_temp5);
  6211. #else
  6212. xmm_c2 = xmm_temp5;
  6213. #endif
  6214. #endif
  6215. MKL_DC_STORE_XMM(&MKL_DC_CC(i,j+1),xmm_c2);
  6216. i+=2;
  6217. }
  6218. if ((m-i) & 1) {
  6219. xmm_temp0 = MKL_DC_SETZERO_XMM();
  6220. xmm_temp1 = MKL_DC_SETZERO_XMM();
  6221. MKL_INT k;
  6222. for (k=0; k<k0; k+=k_in_ker) {
  6223. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6224. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6225. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6226. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+1, j));
  6227. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+1));
  6228. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6229. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+2, j));
  6230. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+2));
  6231. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6232. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+3, j));
  6233. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+3));
  6234. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6235. }
  6236. if ((kK-k) & 2) {
  6237. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6238. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6239. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6240. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+1, j));
  6241. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+1));
  6242. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6243. k+=2;
  6244. }
  6245. if (kK>=2) {
  6246. xmm_temp0 = MKL_DC_ADD_XMM(xmm_temp0, xmm_temp1);
  6247. }
  6248. if ((kK-k)) {
  6249. xmm_b1 = MKL_DC_LOAD_XMM(&MKL_DC_BB(k+0, j));
  6250. xmm_a = MKL_DC_LOADDUP_XMM(&MKL_DC_AA(i, k+0));
  6251. MKL_DC_MUL_ADD_XMM(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6252. }
  6253. #if !defined(MKL_DC_BETA_ZERO)
  6254. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  6255. #if !defined(MKL_DC_BETA_ONE)
  6256. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  6257. #endif
  6258. #if defined(MKL_DC_ALPHA_ONE)
  6259. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c0);
  6260. #else
  6261. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c0, xmm_temp);
  6262. #endif
  6263. #else
  6264. #if !defined(MKL_DC_ALPHA_ONE)
  6265. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  6266. #else
  6267. xmm_c0 = xmm_temp0;
  6268. #endif
  6269. #endif
  6270. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0),xmm_c0);
  6271. xmm_temp4 = MKL_DC_UNPACKHI_XMM(xmm_temp0, xmm_temp0);
  6272. #if !defined(MKL_DC_BETA_ZERO)
  6273. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+1));
  6274. #if !defined(MKL_DC_BETA_ONE)
  6275. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  6276. #endif
  6277. #if defined(MKL_DC_ALPHA_ONE)
  6278. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c0);
  6279. #else
  6280. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c0, xmm_temp);
  6281. #endif
  6282. #else
  6283. #if !defined(MKL_DC_ALPHA_ONE)
  6284. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  6285. #else
  6286. xmm_c0 = xmm_temp4;
  6287. #endif
  6288. #endif
  6289. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+1),xmm_c0);
  6290. }
  6291. j+=2;
  6292. }
  6293. if ((n-j)) {
  6294. MKL_INT i;
  6295. for (i=0; i<m0; i+=m_in_ker) {
  6296. xmm_temp0 = MKL_DC_SETZERO_XMM();
  6297. xmm_temp1 = MKL_DC_SETZERO_XMM();
  6298. xmm_temp2 = MKL_DC_SETZERO_XMM();
  6299. xmm_temp3 = MKL_DC_SETZERO_XMM();
  6300. xmm_temp4 = MKL_DC_SETZERO_XMM();
  6301. xmm_temp5 = MKL_DC_SETZERO_XMM();
  6302. xmm_temp6 = MKL_DC_SETZERO_XMM();
  6303. xmm_temp7 = MKL_DC_SETZERO_XMM();
  6304. MKL_INT k;
  6305. for (k=0; k<k0; k+=k_in_ker) {
  6306. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6307. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6308. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6309. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+0));
  6310. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6311. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+2, k+0));
  6312. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  6313. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+3, k+0));
  6314. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  6315. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1, j));
  6316. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+1));
  6317. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6318. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+1));
  6319. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6320. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+2, k+1));
  6321. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp5, xmm_temp);
  6322. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+3, k+1));
  6323. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp7, xmm_temp);
  6324. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2, j));
  6325. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+2));
  6326. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6327. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+2));
  6328. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6329. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+2, k+2));
  6330. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  6331. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+3, k+2));
  6332. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  6333. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3, j));
  6334. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+3));
  6335. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6336. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+3));
  6337. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6338. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+2, k+3));
  6339. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp5, xmm_temp);
  6340. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+3, k+3));
  6341. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp7, xmm_temp);
  6342. }
  6343. if ((kK-k) & 2) {
  6344. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6345. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6346. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6347. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+0));
  6348. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6349. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+2, k+0));
  6350. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  6351. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+3, k+0));
  6352. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  6353. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1, j));
  6354. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+1));
  6355. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6356. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+1));
  6357. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6358. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+2, k+1));
  6359. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp5, xmm_temp);
  6360. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+3, k+1));
  6361. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp7, xmm_temp);
  6362. k+=2;
  6363. }
  6364. if (kK>=2) {
  6365. xmm_temp0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_temp1);
  6366. xmm_temp2 = MKL_DC_ADD_XMM_S(xmm_temp2, xmm_temp3);
  6367. xmm_temp4 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_temp5);
  6368. xmm_temp6 = MKL_DC_ADD_XMM_S(xmm_temp6, xmm_temp7);
  6369. }
  6370. if ((kK-k)) {
  6371. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6372. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6373. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6374. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+0));
  6375. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6376. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+2, k+0));
  6377. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp4, xmm_temp);
  6378. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+3, k+0));
  6379. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp6, xmm_temp);
  6380. }
  6381. #if !defined(MKL_DC_BETA_ZERO)
  6382. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  6383. #if !defined(MKL_DC_BETA_ONE)
  6384. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  6385. #endif
  6386. #if defined(MKL_DC_ALPHA_ONE)
  6387. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c0);
  6388. #else
  6389. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c0, xmm_temp);
  6390. #endif
  6391. #else
  6392. #if !defined(MKL_DC_ALPHA_ONE)
  6393. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  6394. #else
  6395. xmm_c0 = xmm_temp0;
  6396. #endif
  6397. #endif
  6398. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0), xmm_c0);
  6399. #if !defined(MKL_DC_BETA_ZERO)
  6400. xmm_c2 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i+1,j+0));
  6401. #if !defined(MKL_DC_BETA_ONE)
  6402. xmm_c2 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c2);
  6403. #endif
  6404. #if defined(MKL_DC_ALPHA_ONE)
  6405. xmm_c2 = MKL_DC_ADD_XMM_S(xmm_temp2, xmm_c2);
  6406. #else
  6407. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp2, xmm_c2, xmm_temp);
  6408. #endif
  6409. #else
  6410. #if !defined(MKL_DC_ALPHA_ONE)
  6411. xmm_c2 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp2);
  6412. #else
  6413. xmm_c2 = xmm_temp2;
  6414. #endif
  6415. #endif
  6416. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i+1,j+0), xmm_c2);
  6417. #if !defined(MKL_DC_BETA_ZERO) /* update of MKL_DC_CC(i+2,j+0): read C only when beta is not compiled out as zero */
  6418. xmm_c4 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i+2,j+0));
  6419. #if !defined(MKL_DC_BETA_ONE)
  6420. xmm_c4 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c4); /* _S variant, matching the xmm_c0/xmm_c2 updates in this loop body (was the packed MKL_DC_MUL_XMM) */
  6421. #endif
  6422. #if defined(MKL_DC_ALPHA_ONE)
  6423. xmm_c4 = MKL_DC_ADD_XMM_S(xmm_temp4, xmm_c4);
  6424. #else
  6425. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp4, xmm_c4, xmm_temp); /* NOTE(review): presumably xmm_c4 += xmm_alpha*xmm_temp4 with xmm_temp as scratch - confirm against the macro definition */
  6426. #endif
  6427. #else /* MKL_DC_BETA_ZERO: C is not read, result is built from the accumulator only */
  6428. #if !defined(MKL_DC_ALPHA_ONE)
  6429. xmm_c4 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp4);
  6430. #else
  6431. xmm_c4 = xmm_temp4;
  6432. #endif
  6433. #endif
  6434. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i+2,j+0), xmm_c4);
  6435. #if !defined(MKL_DC_BETA_ZERO) /* update of MKL_DC_CC(i+3,j+0): read C only when beta is not compiled out as zero */
  6436. xmm_c6 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i+3,j+0));
  6437. #if !defined(MKL_DC_BETA_ONE)
  6438. xmm_c6 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c6); /* _S variant, matching the xmm_c0/xmm_c2 updates in this loop body (was the packed MKL_DC_MUL_XMM) */
  6439. #endif
  6440. #if defined(MKL_DC_ALPHA_ONE)
  6441. xmm_c6 = MKL_DC_ADD_XMM_S(xmm_temp6, xmm_c6);
  6442. #else
  6443. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp6, xmm_c6, xmm_temp); /* NOTE(review): presumably xmm_c6 += xmm_alpha*xmm_temp6 with xmm_temp as scratch - confirm against the macro definition */
  6444. #endif
  6445. #else /* MKL_DC_BETA_ZERO: C is not read, result is built from the accumulator only */
  6446. #if !defined(MKL_DC_ALPHA_ONE)
  6447. xmm_c6 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp6);
  6448. #else
  6449. xmm_c6 = xmm_temp6;
  6450. #endif
  6451. #endif
  6452. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i+3,j+0), xmm_c6);
  6453. }
  6454. if ((m-i) & 2) {
  6455. xmm_temp0 = MKL_DC_SETZERO_XMM();
  6456. xmm_temp1 = MKL_DC_SETZERO_XMM();
  6457. xmm_temp2 = MKL_DC_SETZERO_XMM();
  6458. xmm_temp3 = MKL_DC_SETZERO_XMM();
  6459. MKL_INT k;
  6460. for (k=0; k<k0; k+=k_in_ker) {
  6461. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6462. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6463. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6464. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+0));
  6465. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6466. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1, j));
  6467. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+1));
  6468. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6469. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+1));
  6470. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6471. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2, j));
  6472. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+2));
  6473. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6474. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+2));
  6475. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6476. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3, j));
  6477. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+3));
  6478. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6479. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+3));
  6480. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6481. }
  6482. if ((kK-k) & 2) {
  6483. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6484. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6485. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6486. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+0));
  6487. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6488. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1, j));
  6489. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+1));
  6490. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6491. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+1));
  6492. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp3, xmm_temp);
  6493. k+=2;
  6494. }
  6495. if (kK>=2) {
  6496. xmm_temp0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_temp1);
  6497. xmm_temp2 = MKL_DC_ADD_XMM_S(xmm_temp2, xmm_temp3);
  6498. }
  6499. if ((kK-k)) {
  6500. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6501. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6502. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6503. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i+1, k+0));
  6504. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp2, xmm_temp);
  6505. }
  6506. #if !defined(MKL_DC_BETA_ZERO)
  6507. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  6508. #if !defined(MKL_DC_BETA_ONE)
  6509. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  6510. #endif
  6511. #if defined(MKL_DC_ALPHA_ONE)
  6512. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c0);
  6513. #else
  6514. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c0, xmm_temp);
  6515. #endif
  6516. #else
  6517. #if !defined(MKL_DC_ALPHA_ONE)
  6518. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  6519. #else
  6520. xmm_c0 = xmm_temp0;
  6521. #endif
  6522. #endif
  6523. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0),xmm_c0);
  6524. #if !defined(MKL_DC_BETA_ZERO)
  6525. xmm_c2 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i+1,j+0));
  6526. #if !defined(MKL_DC_BETA_ONE)
  6527. xmm_c2 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c2);
  6528. #endif
  6529. #if defined(MKL_DC_ALPHA_ONE)
  6530. xmm_c2 = MKL_DC_ADD_XMM_S(xmm_temp2, xmm_c2);
  6531. #else
  6532. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp2, xmm_c2, xmm_temp);
  6533. #endif
  6534. #else
  6535. #if !defined(MKL_DC_ALPHA_ONE)
  6536. xmm_c2 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp2);
  6537. #else
  6538. xmm_c2 = xmm_temp2;
  6539. #endif
  6540. #endif
  6541. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i+1,j+0),xmm_c2);
  6542. i+=2;
  6543. }
  6544. if ((m-i) & 1) {
  6545. xmm_temp0 = MKL_DC_SETZERO_XMM();
  6546. xmm_temp1 = MKL_DC_SETZERO_XMM();
  6547. MKL_INT k;
  6548. for (k=0; k<k0; k+=k_in_ker) {
  6549. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6550. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6551. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6552. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1, j));
  6553. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+1));
  6554. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6555. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+2, j));
  6556. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+2));
  6557. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6558. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+3, j));
  6559. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+3));
  6560. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6561. }
  6562. if ((kK-k) & 2) {
  6563. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6564. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6565. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6566. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+1, j));
  6567. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+1));
  6568. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp1, xmm_temp);
  6569. k+=2;
  6570. }
  6571. if (kK>=2) {
  6572. xmm_temp0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_temp1);
  6573. }
  6574. if (kK-k) {
  6575. xmm_b1 = MKL_DC_LOAD_XMM_S(&MKL_DC_BB(k+0, j));
  6576. xmm_a = MKL_DC_LOAD_XMM_S(&MKL_DC_AA(i, k+0));
  6577. MKL_DC_MUL_ADD_XMM_S(xmm_a, xmm_b1, xmm_temp0, xmm_temp);
  6578. }
  6579. #if !defined(MKL_DC_BETA_ZERO)
  6580. xmm_c0 = MKL_DC_LOAD_XMM_S(&MKL_DC_CC(i,j+0));
  6581. #if !defined(MKL_DC_BETA_ONE)
  6582. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_beta, xmm_c0);
  6583. #endif
  6584. #if defined(MKL_DC_ALPHA_ONE)
  6585. xmm_c0 = MKL_DC_ADD_XMM_S(xmm_temp0, xmm_c0);
  6586. #else
  6587. MKL_DC_MUL_ADD_XMM_S(xmm_alpha, xmm_temp0, xmm_c0, xmm_temp);
  6588. #endif
  6589. #else
  6590. #if !defined(MKL_DC_ALPHA_ONE)
  6591. xmm_c0 = MKL_DC_MUL_XMM_S(xmm_alpha, xmm_temp0);
  6592. #else
  6593. xmm_c0 = xmm_temp0;
  6594. #endif
  6595. #endif
  6596. MKL_DC_STORE_XMM_S(&MKL_DC_CC(i,j+0),xmm_c0);
  6597. }
  6598. }
  6599. }
  6600. #endif
  6601. #endif
  6602. #undef MKL_DC_AA
  6603. #undef MKL_DC_BB
  6604. #undef MKL_DC_CC
  6605. #undef MKL_DC_FNAME_GEMM_KERNEL