/* * leon3perf.c, Joris van Rantwijk, 2011. * * Measure performance of various instruction sequences on a LEON3. * * This program runs on a LEON3 under either RTEMS or Snapgear Linux-2.6. * Measurements are more accurate when running under RTEMS. * * To compile this program with RTEMS 4.10, run: * sparc-rtems4.10-gcc \ * -B/opt/rtems-4.10/sparc-rtems-4.10/leon3/lib/ -specs bsp_specs -qrtems \ * -msoft-float -Wall -Os leon3perf.c -o leon3perf.dsu */ #include #include #include #include #include #ifdef __rtems__ #include #include #endif #define F_USE 0x0001 #define F_DEF 0x0002 #define F_ADDR 0x0004 #define F_DEFADDR 0x0008 #define F_USECC 0x0010 #define F_DEFCC 0x0020 #define F_USEY 0x0040 #define F_DEFY 0x0080 #define F_BRANCH 0x0100 #define F_TAKE 0x0200 #define F_JUMP 0x0400 struct instruction { unsigned int opcode; const char * name; unsigned int flags; }; #define N_INSTRUCTIONS 51 const struct instruction instructions[N_INSTRUCTIONS] = { { 0x01000000, "nop", 0 }, { 0x15000004, "set 4096, %o2", F_DEF }, { 0x94102064, "mov 100, %o2", F_DEF }, { 0x9610000a, "mov %o2, %o3", F_USE }, { 0x94100008, "mov %o0, %o2", F_DEF | F_DEFADDR }, { 0x940a2003, "and %o0, 3, %o2", F_DEF }, { 0x940a0009, "and %o0, %o1, %o2", F_DEF }, { 0x960a000a, "and %o0, %o2, %o3", F_USE }, { 0x940a000b, "and %o0, %o3, %o2", F_USE | F_DEF }, { 0x948a0009, "andcc %o0, %o1, %o2", F_DEF | F_DEFCC }, { 0xd4020000, "ld [%o0], %o2", F_DEF | F_DEFADDR }, { 0xd4022004, "ld [%o0+4], %o2", F_DEF | F_DEFADDR }, { 0xd40a0000, "ldub [%o0], %o2", F_DEF }, { 0xd40a0000, "ldub [%o0+1], %o2", F_DEF }, { 0xd41a0000, "ldd [%o0], %o2", F_DEF }, { 0xd6028000, "ld [%o2], %o3", F_USE | F_ADDR }, { 0xd4220000, "st %o2, [%o0]", F_USE }, { 0xd42a0000, "stb %o2, [%o0]", F_USE }, { 0xd43a0000, "std %o2, [%o0]", F_USE }, { 0xd2228000, "st %o1, [%o2]", F_USE | F_ADDR }, { 0x952a2003, "sll %o0, 3, %o2", F_DEF }, { 0x952a0009, "sll %o0, %o1, %o2", F_DEF }, { 0x972aa003, "sll %o2, 3, %o3", F_USE }, { 0x952aa003, "sll %o2, 3, %o2", F_USE | F_DEF }, { 0x94520009, "umul %o0, %o1, %o2", F_DEF | F_DEFY }, { 0x945a0009, "smul %o0, %o1, %o2", F_DEF | F_DEFY }, { 0x94522001, "umul %o0, 1, %o2", F_DEF | F_DEFY | F_DEFADDR }, { 0x9652000a, "umul %o0, %o2, %o3", F_USE | F_DEFY }, { 0x9452000a, "umul %o0, %o2, %o2", F_USE | F_DEF | F_DEFY }, { 0x94d20009, "umulcc %o0, %o1, %o2", F_DEF | F_DEFY | F_DEFCC }, { 0x94720009, "udiv %o0, %o1, %o2", F_DEF | F_USEY }, { 0x947a0009, "sdiv %o0, %o1, %o2", F_DEF | F_USEY }, { 0x96728009, "udiv %o2, %o1, %o3", F_USE | F_USEY }, { 0x94728009, "udiv %o2, %o1, %o2", F_USE | F_DEF | F_USEY }, { 0x94f20009, "udivcc %o0, %o1, %o2", F_DEF | F_USEY | F_DEFCC }, { 0x95400000, "rd %y, %o2", F_DEF | F_USEY }, { 0x94022003, "add %o0, 3, %o2", F_DEF }, { 0x94020009, "add %o0, %o1, %o2", F_DEF }, { 0x9602000a, "add %o0, %o2, %o3", F_USE }, { 0x9402000a, "add %o0, %o2, %o2", F_USE | F_DEF }, { 0x94820009, "addcc %o0, %o1, %o2", F_DEF | F_DEFCC }, { 0x94420009, "addx %o0, %o1, %o2", F_DEF | F_USECC }, { 0x94c20009, "addxcc %o0, %o1, %o2", F_DEF | F_USECC | F_DEFCC }, { 0x10800004, "ba +4", F_BRANCH | F_TAKE }, { 0x30800004, "ba,a +4", F_BRANCH | F_TAKE }, { 0x12800004, "bne +4", F_BRANCH | F_USECC | F_TAKE }, { 0x32800004, "bne,a +4", F_BRANCH | F_USECC | F_TAKE }, { 0x02800004, "be +4", F_BRANCH | F_USECC }, { 0x22800004, "be,a +4", F_BRANCH | F_USECC }, { 0x40000004, "call +4", F_BRANCH | F_TAKE }, { 0x81c3e058, "jmpl %o7+88, %g0", F_JUMP }, }; #define CODE_TEMPLATE_LEN 32 #define CODE_PREFIX_LEN 8 #define CODE_SUFFIX_LEN 8 #ifdef __linux__ #define LOOP_COUNT 262144 #else #define LOOP_COUNT 16384 #endif const unsigned int code_template[CODE_TEMPLATE_LEN] = { /* prefix */ 0x9a10000f, /* mov %o7, %o5 ; o5 := return pointer */ 0x19000008 + 8*((LOOP_COUNT-1)/8192), /* set N, %o4 ; o4 := LOOP_COUNT */ 0x40000002, /* call loop ; o7 := own address */ 0x01000000, /* nop */ /* loop: */ 0x81800000, /* wr %g0, %y*/ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ /* 16 slots for custom instructions */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ /* suffix */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x01000000, /* nop */ 0x98a32001, /* subcc %o4, 1, %o4 */ 0x12bfffe9, /* bne loop */ 0x01000000, /* nop */ 0x81c36008, /* jmpl %o5+8, %g0 */ 0x01000000, /* nop */ }; unsigned int scratch[LOOP_COUNT]; /* Flush all caches (instruction and data). */ static void flush_cache(void) { asm volatile ("flush ; nop"); usleep(1000); } /* Build code fragment based on template and up to three custom instructions. */ void build_code(unsigned int *codeptr, int insna, int insnb, int insnc, int nopsab, int nopsbc) { memcpy(codeptr, code_template, 4 * CODE_TEMPLATE_LEN); codeptr[CODE_PREFIX_LEN] = instructions[insna].opcode; codeptr[CODE_PREFIX_LEN+1+nopsab] = instructions[insnb].opcode; codeptr[CODE_PREFIX_LEN+2+nopsab+nopsbc] = instructions[insnc].opcode; } #ifdef __rtems__ struct timer_context { rtems_interrupt_level intlevel; unsigned int tstart; }; /* Start timer. */ static void timer_start(struct timer_context *tim) { /* Disable interrupts. */ rtems_interrupt_disable(tim->intlevel); /* Use second counter in GPTIMER for timing */ LEON3_Timer_Regs->timer[1].reload = 0xffffffff; LEON3_Timer_Regs->timer[1].conf = LEON3_GPTIMER_EN | LEON3_GPTIMER_LD; tim->tstart = LEON3_Timer_Regs->timer[1].value; } /* Stop timer and return number of cycles used. */ static unsigned int timer_stop(struct timer_context *tim) { unsigned int tend, elapsed; tend = LEON3_Timer_Regs->timer[1].value; /* Restore interrupts. */ rtems_interrupt_enable(tim->intlevel); elapsed = tim->tstart - tend; return elapsed * (LEON3_Timer_Regs->scaler_reload + 1); } #endif #ifdef __linux__ struct timer_context { struct timeval tvstart; }; /* Start timer. */ static inline void timer_start(struct timer_context *tim) { gettimeofday(&tim->tvstart, NULL); } /* Stop timer and return number of cycles used. */ static unsigned int timer_stop(struct timer_context *tim) { struct timeval tv; unsigned int elapsed; gettimeofday(&tv, NULL); elapsed = 1000000 * (tv.tv_sec - tim->tvstart.tv_sec) + tv.tv_usec - tim->tvstart.tv_usec; return elapsed * CPUSPEED_MHZ; } #endif /* Run code fragment and return number of cycles used per loop. */ unsigned int run_code(void (*codeptr)(int, int, int), int o0, int o1, int o2) { struct timer_context tim; int t; timer_start(&tim); codeptr(o0, o1, o2); t = timer_stop(&tim); #ifndef __linux__ if (t % LOOP_COUNT > LOOP_COUNT/10) { printf("NON-INTEGER NUMBER OF CYCLES PER LOOP\n"); exit(1); } #endif return t / LOOP_COUNT; } /* Build and run a fragment with specified instruction/NOP mix and return the number of cycles per loop. */ unsigned int run(int insna, int insnb, int insnc, int nopsab, int nopsbc) { typedef void func(int, int, int); static unsigned int code[CODE_TEMPLATE_LEN]; unsigned int t; int p; for (t = 0; t < 8; t++) scratch[t] = (unsigned int)scratch; p = ((((int)scratch) + 15) & (~15)) + 8; build_code(code, insna, insnb, insnc, nopsab, nopsbc); flush_cache(); t = run_code((func*)code, p, 31, p); return t; } static void load_loop(int ldaddr, int ldstep, int dum) { int loopcnt = LOOP_COUNT; int d1; asm volatile ( "1: \n" " nop \n" " nop \n" " nop \n" " ld [%1], %2 \n" " nop \n" " nop \n" " nop \n" " subcc %0, 1, %0 \n" " bne 1b \n" " add %1, %5, %1 \n" : "=&r" (loopcnt), "=&r" (ldaddr), "=&r" (d1) : "0" (loopcnt), "1" (ldaddr), "r" (ldstep) ); } static void store_load_loop(int ldaddr, int ldstep, int staddr) { int loopcnt = LOOP_COUNT; int d1; asm volatile ( "1: \n" " nop \n" " nop \n" " nop \n" " st %%g0, [%6] \n" " ld [%1], %2 \n" " nop \n" " nop \n" " nop \n" " subcc %0, 1, %0 \n" " bne 1b \n" " add %1, %5, %1 \n" : "=&r" (loopcnt), "=&r" (ldaddr), "=&r" (d1) : "0" (loopcnt), "1" (ldaddr), "r" (ldstep), "r" (staddr) ); } /* Measure the cost of a cache miss, and a store followed by cache miss. */ void cache_perf(void) { unsigned int t_ldhit, t_ldmiss, t_stldhit, t_stldmiss; int dum[1] = { 0 }; /* Measure time of load with cache hit. */ flush_cache(); t_ldhit = run_code(load_loop, (int)scratch, 0, 0); printf("load with cache hit = %u cycles/loop\n", t_ldhit); /* Measure time of load with cache miss. */ flush_cache(); t_ldmiss = run_code(load_loop, (int)scratch, 4, 0); printf("load with cache miss = %u cycles/loop\n", t_ldmiss); /* Measure time of store followed by load with cache hit. */ flush_cache(); t_stldhit = run_code(store_load_loop, (int)scratch, 0, (int)dum); printf("store followed by load with cache hit = %u cycles/loop\n", t_stldhit); /* Measure time of store followed by load with cache miss. */ flush_cache(); t_stldmiss = run_code(store_load_loop, (int)scratch, 4, (int)dum); printf("store followed by load with cache miss = %u cycles/loop\n", t_stldmiss); printf("-> cache miss penalty = %u cycles\n", t_ldmiss - t_ldhit); printf("-> stall on store buffer flush = %u cycles\n", (t_stldmiss - t_stldhit) - (t_ldmiss - t_ldhit)); } int main(void) { int insn_cost[N_INSTRUCTIONS]; unsigned int base_cycles, cycles; int cost, latency; int i, j, k; printf("\nMeasure LEON3 instruction cycle counts.\n\n"); #ifdef DISABLE_CACHE { uint32_t ctrl; asm volatile ( "lda [%%g0] 0x2, %0" : "=r" (ctrl) ); ctrl &= ~(3 << 2); asm volatile ( "sta %0, [%%g0] 0x2" : : "r" (ctrl) ); } #endif /* Measure all-NOP loop. */ base_cycles = run(0, 0, 0, 0, 0); printf("basic loop = %u cycles/loop\n\n", base_cycles); /* Measure individual instructions. */ for (i = 0; i < N_INSTRUCTIONS; i++) { cycles = run(i, 0, 0, 0, 0); cost = cycles + 1 - base_cycles; if (instructions[i].flags & F_JUMP) cost += 14; /* skips over 14 instructions */ if ((instructions[i].flags & (F_BRANCH|F_TAKE)) == (F_BRANCH|F_TAKE)) cost += 2; /* skips two instructions */ insn_cost[i] = cost; printf("%-32s %u cycles/loop, cost=%d\n", instructions[i].name, cycles, cost); } printf("\n"); /* Measure DEF-USE pairs. */ for (i = 0; i < N_INSTRUCTIONS; i++) { unsigned int fi = instructions[i].flags; if ((fi & (F_USE|F_ADDR|F_USECC|F_BRANCH|F_JUMP)) != 0) continue; for (j = 0; j < N_INSTRUCTIONS; j++) { unsigned int fj = instructions[j].flags; if ((fj & (F_BRANCH|F_JUMP)) != 0) continue; if ((fj & F_USE) && !(fi & F_DEF)) continue; if ((fj & F_ADDR) && !(fi & F_DEFADDR)) continue; if ((fj & F_USECC) && !(fi & F_DEFCC)) continue; if ((fj & F_USEY) && !(fi & F_DEFY)) continue; cycles = run(i, j, 0, 0, 0); cost = cycles + 2 - base_cycles; latency = cost - insn_cost[i] - insn_cost[j]; if (latency == 0) continue; printf("%s ; %s -> %d cycles/loop, latency=%d\n", instructions[i].name, instructions[j].name, cycles, latency); if (latency > 0 && latency < 10) { cycles = run(i, j, 0, latency, 0); cost = cycles + 2 - base_cycles; if (cost != insn_cost[i] + insn_cost[j]) { printf("%s ; %d*nop ; %s -> %d cycles/loop, UNEXPECTED\n", instructions[i].name, latency, instructions[j].name, cycles); } } } } printf("\n"); /* Measure DEFCC-BRANCH pairs. */ for (i = 0; i < N_INSTRUCTIONS; i++) { unsigned int fi = instructions[i].flags; if ((fi & (F_USE|F_ADDR|F_USECC|F_USEY|F_BRANCH|F_JUMP)) != 0) continue; if ((fi & F_DEFCC) == 0) continue; for (j = 0; j < N_INSTRUCTIONS; j++) { unsigned int fj = instructions[j].flags; if ((fj & (F_BRANCH|F_USECC)) != (F_BRANCH|F_USECC)) continue; cycles = run(i, j, 0, 0, 0); cost = cycles + 2 - base_cycles; if ((fj & (F_BRANCH|F_TAKE)) == (F_BRANCH|F_TAKE)) cost += 2; /* skips two instructions */ latency = cost - insn_cost[i] - insn_cost[j]; printf("%s ; %s -> %d cycles/loop, latency=%d\n", instructions[i].name, instructions[j].name, cycles, latency); for (k = 1; latency > 0 && k <= 3; k++) { cycles = run(i, j, 0, k, 0); cost = cycles + 2 - base_cycles; if ((fj & (F_BRANCH|F_TAKE)) == (F_BRANCH|F_TAKE)) cost += 2; /* skips two instructions */ printf("%s ; %d*nop ; %s -> %d cycles/loop\n", instructions[i].name, k, instructions[j].name, cycles); } } } printf("\n"); cache_perf(); printf("\n"); printf("done.\n"); return 0; } /* vim: expandtab softtabstop=4 */