;
; Measure P6 optimized DAXPY execution times
; From: Micheal Cranford <mcranfor@ichips.intel.com>

;
; This program requires TASM (under DOS)
;
; ASM.BAT file is as follows
;
;  TASM /c /r /la /ml /m5 /w2 /zi %1 %2 %3 %4
;
; LNK.BAT file is as follows
;
;  TLINK /3 /c /C /d /l /m /s /v %1 %2 %3 %4 %5 %6


;
; Results (from mcranfor@ichips.intel.com)
;
; Processor     Points / Loop   Total Cycles   Cycles / DP FLOP   MFLOPS @ freq
;
; Pentium            5            826182430          3.2            72 @ 233 MHz
; Pentium Pro        5            395092362          1.5           194 @ 300 MHz
;
; (Cycles / DP FLOP = Total Cycles / (Passes * 128 points * 2 FLOPs per point))


           .MODEL  NEARSTACK SMALL

; ---------------------------------------------------------------------
; Build-time constants
;
; One pass of the kernel processes FullSize unrolled steps of
; PointsPerStep points each, followed by a fixed tail of TailPoints
; points.  PointsPerStep and TailPoints describe the hand-unrolled code
; at InnerLoop and the tail that follows it; they must be changed
; together with that code.  The array length is derived from them so
; that editing FullSize can no longer run past the end of the arrays.
; ---------------------------------------------------------------------

TextLineSize  EQU     80        ; number of ASCII characters to print
FullSize      EQU     25        ; number of full size steps
Passes        EQU     1000000   ; number of passes to make

PointsPerStep EQU     5         ; points handled by one InnerLoop step
TailPoints    EQU     3         ; points handled after the inner loop
ArrayPoints   EQU     FullSize*PointsPerStep+TailPoints ; 128 by default

; NOTE(review): 100 bytes is a small stack for a DOS program that runs
; with interrupts enabled; left unchanged here - confirm it is adequate.

           .STACK  100

           .DATA

Xarray        DQ      ArrayPoints DUP (1.0) ; X vector (read only)
Yarray        DQ      ArrayPoints DUP (2.0) ; Y vector (updated in place)
Constant      DQ      3.0       ; the multiplier K in Y := Y + K*X

; Corrections MUST stay immediately in front of CycleCounts: InitTSC
; invokes "ReadTSC -1", which addresses CycleCounts-4 and relies on
; that location being Corrections.

Corrections   DD      0         ; RDTSC cycle time overhead
CycleCounts   DD      4 DUP (0) ; enough for four cycle times
PackedCount   DB      10 DUP (0) ; temporary packed BCD value
OutgoingText  DB      80 DUP (0) ; output character buffer (4 fields of 20)

           .CODE

           .586

           LOCALS

RDTSC      MACRO
   DB      00FH                 ;; first opcode byte of RDTSC  (0F 31)
   DB      031H                 ;; second opcode byte; result is used from EAX
ENDM


; ReadTSC TimeSlot
;
; Call twice with the same TimeSlot: once at the start and once at the
; end of the region being timed.  CycleCounts holds four dwords, so the
; usable slots are 0..3 (InitTSC additionally uses -1, see there).
;
; Every invocation performs
;     slot := (low 32 bits of TSC) - Corrections - slot
; The slot must be zero before the first call (InitTSC clears all four).
; After the second call the slot therefore holds
;     (finish TSC - Corrections) - (start TSC - Corrections)
; i.e. finish TSC - start TSC.  Note that Corrections is subtracted in
; both calls and so cancels out of the final value.
;
; Only EAX is used from the RDTSC result, so counts are modulo 2^32.
; Modifies EAX and flags; the RDTSC instruction also overwrites EDX.

ReadTSC    MACRO TimeSlot
   RDTSC                        ; EAX := current TSC cycle count (low dword)
   SUB     EAX,DS:[Corrections] ; subtract calibrated overhead cycles
   SUB     EAX,DS:[CycleCounts+TimeSlot*4] ; subtract previous slot value
   MOV     DS:[CycleCounts+TimeSlot*4],EAX ; slot := new value
ENDM


; InitTSC
;
; Call only once, at the very beginning, before using ReadTSC.
;
; 1. Clears Corrections and all four CycleCounts slots.
; 2. Invokes "ReadTSC -1".  Slot -1 addresses CycleCounts-4, which is
;    Corrections in this file's data layout; because Corrections is
;    still zero, this stores the raw TSC value into Corrections.
; 3. Reads the TSC again and subtracts the value from step 2, leaving
;    in Corrections the cycles that elapsed between the two RDTSCs,
;    i.e. the cost of one ReadTSC expansion.
;
; Modifies EAX and flags; the RDTSC instruction also overwrites EDX.

InitTSC    MACRO
   XOR     EAX,EAX              ; EAX := 0, used to clear the counters
   MOV     DS:[Corrections],EAX
   MOV     DS:[CycleCounts+00],EAX
   MOV     DS:[CycleCounts+04],EAX
   MOV     DS:[CycleCounts+08],EAX
   MOV     DS:[CycleCounts+12],EAX
   ReadTSC -1                   ; Corrections := current TSC (slot -1 aliases it)
   RDTSC                        ; read TSC again to time the ReadTSC above
   SUB     EAX,DS:[Corrections] ; elapsed = second TSC - first TSC
   MOV     DS:[Corrections],EAX ; Corrections := ReadTSC overhead in cycles
ENDM

   EVEN

; DAXPYP6 - program entry point and timed DAXPY kernel
;
; Computes Yarray[i] := Yarray[i] + Constant * Xarray[i] over the whole
; array, Passes times, and leaves the elapsed TSC cycles in
; CycleCounts[0] (bracketed by the two "ReadTSC 0" invocations).
; Execution then falls through into the code that follows this block.
;
; Register roles inside the timed region:
;   ECX    remaining passes (outer loop)
;   AX     remaining 5 point steps in this pass (inner loop)
;   SI/DI  current position in Xarray / Yarray
;   x87    K (the multiplier) stays at the bottom of the stack for the
;          whole run; products and sums are stacked above it
;
; Each pass is FullSize unrolled steps of 5 points followed by a 3 point
; tail.  In the column comments the right-hand part shows the x87 stack
; with K at the bottom (leftmost).
;
; The multiplier index in each FMUL must track the stack depth: after
; n products have been stacked above K and the next X has been loaded,
; K is ST(n+1).  The tail previously used ST(1)/ST(2) for its second and
; third products, which multiplied by t0 instead of K; that went
; unnoticed only because every X element is 1.0.
;
; NOTE(review): both loop branches are SHORT and the outer loop body
; looks close to the 8 bit displacement limit - check the listing
; before adding instructions inside OuterLoop.

DAXPYP6:
   PUSH    BP
   MOV     BP,SP
   MOV     AX,@data             ; setup cycle count segments
   MOV     DS,AX
   MOV     ES,AX

   InitTSC                      ; clear slots, calibrate ReadTSC overhead
   ReadTSC 0                    ; get starting cycle count

   FNINIT                       ; start from known FPU state
   FLD     QWORD PTR DS:[Constant] ; get multiplier constant
   MOV     ECX,Passes           ; initialize loop counter

OuterLoop:

   LEA     SI,DS:[Xarray]       ; get pointer to X array
   LEA     DI,DS:[Yarray]       ; get pointer to Y array
   MOV     AX,FullSize          ; get full steps counter

InnerLoop:                      ;           C PPPPP F7 F6 F5 F4 F3 F2 F1 F0

   FLD     QWORD PTR DS:[SI+00] ; get X0    0        K  .  .  .  .  .  .  .
   FMUL    ST(0),ST(1)          ; T0:=K*X0  1 a      K X0  .  .  .  .  .  .

   FLD     QWORD PTR DS:[SI+08] ; get X1    0        K t0  .  .  .  .  .  .
   FMUL    ST(0),ST(2)          ; T1:=K*X1  1 ab     K t0 X1  .  .  .  .  .

   FLD     QWORD PTR DS:[SI+16] ; get X2    0        K t0 t1  .  .  .  .  .
   FMUL    ST(0),ST(3)          ; T2:=K*X2  1 abc    K t0 t1 X2  .  .  .  .

   FLD     QWORD PTR DS:[SI+24] ; get X3    0        K t0 t1 t2  .  .  .  .
   FMUL    ST(0),ST(4)          ; T3:=K*X3  1 abcd   K t0 t1 t2 X3  .  .  .
   FXCH    ST(2)                ; T3 <=> T1 0        K t0 t1 t2 t3  .  .  .

   FLD     QWORD PTR DS:[SI+32] ; get X4    0        K t0 t3 t2 t1  .  .  .
   FMUL    ST(0),ST(5)          ; T4:=K*X4  1 abcde  K t0 t3 t2 t1 X4  .  .
   FXCH    ST(4)                ; T4 <=> T0 0        K T0 t3 t2 t1 t4  .  .

   FLD     QWORD PTR DS:[DI+00] ; get Y0    0        K t4 t3 t2 t1 T0  .  .
   FADDP   ST(1),ST(0)          ; Y0:=T0+Y0 1 fbcde  K t4 t3 t2 t1 T0 Y0  .

   FLD     QWORD PTR DS:[DI+08] ; get Y1    0        K t4 t3 t2 T1 y0  .  .
   FADDP   ST(2),ST(0)          ; Y1:=T1+Y1 1 fgcde  K t4 t3 t2 T1 y0 Y1  .

   FLD     QWORD PTR DS:[DI+16] ; get Y2    0        K t4 t3 T2 y1 y0  .  .
   FADDP   ST(3),ST(0)          ; Y2:=T2+Y2 1 fghde  K t4 t3 T2 y1 y0 Y2  .

   FLD     QWORD PTR DS:[DI+24] ; get Y3    0        K t4 T3 y2 y1 y0  .  .
   FADDP   ST(4),ST(0)          ; Y3:=T3+Y3 1 fghie  K t4 T3 y2 y1 y0 Y3  .

   FLD     QWORD PTR DS:[DI+32] ; get Y4    0        K T4 y3 y2 y1 y0  .  .
   FADDP   ST(5),ST(0)          ; Y4:=T4+Y4 1 fghij  K T4 y3 y2 y1 y0 Y4  .

   FSTP    QWORD PTR DS:[DI+00] ; store Y0  1  ghij  K y4 y3 y2 y1 Y0  .  .
   FSTP    QWORD PTR DS:[DI+08] ; store Y1  1   hij  K y4 y3 y2 Y1  .  .  .
   FSTP    QWORD PTR DS:[DI+16] ; store Y2  1    ij  K y4 y3 Y2  .  .  .  .
   FSTP    QWORD PTR DS:[DI+24] ; store Y3  1     j  K y4 Y3  .  .  .  .  .
   FSTP    QWORD PTR DS:[DI+32] ; store Y4  1        K Y4  .  .  .  .  .  .

   ADD     SI,40                ; update array X index (5 points * 8 bytes)
   ADD     DI,40                ; update array Y index (5 points * 8 bytes)

   DEC     AX                   ; update loop counter
   JNZ     SHORT InnerLoop      ; if more points

; ---- 3 point tail: SI/DI now address the last three elements ----

   FLD     QWORD PTR DS:[SI+00] ; get X0    0        K  .  .  .  .  .  .  .
   FMUL    ST(0),ST(1)          ; T0:=K*X0  1 a      K X0  .  .  .  .  .  .

   FLD     QWORD PTR DS:[SI+08] ; get X1    0        K t0  .  .  .  .  .  .
   FMUL    ST(0),ST(2)          ; T1:=K*X1  1 ab     K t0 X1  .  .  .  .  .

   FLD     QWORD PTR DS:[SI+16] ; get X2    0        K t0 t1  .  .  .  .  .
   FMUL    ST(0),ST(3)          ; T2:=K*X2  1 abc    K t0 t1 X2  .  .  .  .
   FXCH    ST(2)                ; T2 <=> T0 0        K t0 t1 t2  .  .  .  .

   FLD     QWORD PTR DS:[DI+00] ; get Y0    0        K t2 t1 t0  .  .  .  .
   FADDP   ST(1),ST(0)          ; Y0:=T0+Y0 1  bcd   K t2 t1 t0 Y0  .  .  .

   FLD     QWORD PTR DS:[DI+08] ; get Y1    0        K t2 t1 y0  .  .  .  .
   FADDP   ST(2),ST(0)          ; Y1:=T1+Y1 1   cde  K t2 t1 y0 Y1  .  .  .

   FLD     QWORD PTR DS:[DI+16] ; get Y2    0        K t2 y1 y0  .  .  .  .
   FADDP   ST(3),ST(0)          ; Y2:=T2+Y2 1 f  de  K t2 y1 y0 Y2  .  .  .

   FSTP    QWORD PTR DS:[DI+00] ; store Y0    f   e  K y2 y1 y0  .  .  .  .
   FSTP    QWORD PTR DS:[DI+08] ; store Y1    f      K y2 y1  .  .  .  .  .
   FSTP    QWORD PTR DS:[DI+16] ; store Y2           K y2  .  .  .  .  .  .

   DEC     ECX                  ; update pass counter
   JNZ     SHORT OuterLoop      ; if not finished

   FSTP    ST(0)                ; discard constant   K  .  .  .  .  .  .  .

   ReadTSC 0                    ; compute total cycle count

; FinishedTest - format the four cycle counts as decimal text, print
; them on the DOS standard output handle and terminate the program.
;
; Every CycleCounts dword becomes one FieldWidth character field in
; OutgoingText: 18 ASCII decimal digits (leading zeros kept) followed
; by two spaces.  The binary to decimal conversion is done by the FPU:
; the value is loaded as an integer and stored as 18 digit packed BCD
; (FBSTP), whose nine digit bytes are then split into nibble pairs.
;
; The counts are unsigned 32 bit differences, so each one is widened to
; a 64 bit integer with a zero upper half before it is loaded.  Loading
; the dword directly would treat values of 2^31 and above as negative
; and print the wrong magnitude.  PackedCount (10 bytes) doubles as the
; 8 byte scratch area for the widened value; FILD has consumed it by
; the time FBSTP overwrites it.

SlotCount     EQU     4         ; number of dwords in CycleCounts
FieldWidth    EQU     20        ; characters per printed count
BCDBytes      EQU     9         ; digit bytes in a packed BCD value

FinishedTest:

   LEA     ESI,DS:[CycleCounts] ; get cycle counts pointer
   LEA     EDI,DS:[OutgoingText] ; get ASCII buffer pointer
   MOV     BX,SlotCount         ; get number of 32 bit words to convert

NextFullWord:

   XOR     EDX,EDX              ; packed BCD byte offset := 0 (also a zero source)
   MOV     EAX,DWORD PTR DS:[SI] ; get next 32 bit unsigned cycle count
   MOV     DWORD PTR DS:[PackedCount],EAX   ; low half of 64 bit scratch
   MOV     DWORD PTR DS:[PackedCount+4],EDX ; high half := 0 (zero extend)
   FILD    QWORD PTR DS:[PackedCount]  ; load as non-negative 64 bit integer
   FBSTP   TBYTE PTR DS:[PackedCount] ; save next packed 18 digit BCD
   MOV     ECX,BCDBytes         ; 18 packed BCD digits = 9 total bytes
   MOV     WORD PTR DS:[DI+FieldWidth-2],'  ' ; spaces between BCD cycle counts

NextBCDDigit:

   ; PackedCount byte 0 holds the two least significant digits, so it
   ; is written to the end of the 18 digit field and the loop works
   ; backwards: source offset EDX counts up while ECX counts down.
   MOV     AL,DS:[PackedCount+EDX] ; ASCIIfy next nibble pair
   MOV     AH,AL
   SHR     AL,4                 ; AL := high digit (stored at lower address)
   AND     AH,00FH              ; AH := low digit  (stored at higher address)
   ADD     AX,03030H            ; convert both digits to ASCII '0'..'9'
   MOV     DS:[EDI+ECX*2-2],AX  ; save next pair ASCII digits
   INC     DX                   ; update packed BCD cycle count offset
   DEC     CX                   ; upper halves of ECX/EDX are zero here
   JNZ     NextBCDDigit         ; if more BCD digits to convert
   ADD     SI,4                 ; point to next 32 bit cycle count
   ADD     DI,FieldWidth        ; point to next ASCII output string
   DEC     BX
   JNZ     NextFullWord         ; if more cycle counts to convert

   MOV     CX,TextLineSize      ; how many ASCII characters to print
   LEA     DX,DS:[OutgoingText] ; get pointer to ASCII output buffer
   MOV     BX,00001H            ; ascii terminal display file handle
   MOV     AH,040H              ; print ASCII cycle counts
   INT     021H
   POP     BP                   ; cleanup and exit
   MOV     AX,04C00H            ; DOS terminate, return code 0
   INT     21H

   END     DAXPYP6
