





按照奇淫巧計的慣例,代碼的詳細講解將在續篇中給出。以下代碼的編譯方式為 `gcc -g -o test main.c`,僅限 64 位機器。

#include "stdio.h"
#include "stdlib.h"
#include "string.h"
/*
 * rdtsc() - read the processor's time-stamp counter.
 *
 * Returns the 64-bit counter value as an unsigned long long. Two
 * implementations are provided, selected at compile time by target:
 *   - i386:   the instruction is emitted as raw opcode bytes and the result
 *             is taken as one 64-bit value from the EDX:EAX pair ("=A").
 *   - x86-64: the two 32-bit halves are read separately from EAX and EDX
 *             and combined by hand.
 * Any other target is rejected at compile time.
 */
#if defined(__i386__)
static __inline__ unsigned long long rdtsc(void)
{
    unsigned long long int x;

    /* 0x0f 0x31 is the RDTSC opcode. */
    __asm__ volatile(".byte 0x0f, 0x31" : "=A"(x));
    return x;
}
#elif defined(__x86_64__)
static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;

    /* Low half arrives in EAX ("=a"), high half in EDX ("=d"). */
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
#else
#error "rdtsc() is only implemented for x86 and x86-64 targets"
#endif
/*
 * m_b_64(to, from, count) - copy `count` quadwords (8 bytes each) from
 * `from` to `to` using REP MOVSQ.
 *
 * Written as top-level assembly. Arguments arrive in registers: `to` in RDI,
 * `from` in RSI and `count` in RDX. REP MOVSQ takes its repeat count from
 * RCX and copies from [RSI] to [RDI], so the only setup needed is moving
 * RDX into RCX. No return value is produced.
 *
 * NOTE(review): no CLD is issued, so this relies on the direction flag being
 * clear on entry - confirm against the platform ABI.
 */
asm(".type  m_b_64, @function       ");
asm("m_b_64:push   %rbp                  "); /* standard frame setup */
asm("       mov    %rsp,%rbp             ");
asm("       mov    %rdx,%rcx"); /* repeat count: third argument -> RCX */
asm("       rep    movsq        "); /* copy RCX quadwords [RSI] -> [RDI] */
asm("       leaveq      "); /* tear the frame down */
asm("       retq        ");
/*
 * m_b_32(to, from, count) - copy `count` doublewords (4 bytes each) from
 * `from` to `to` using a REP-prefixed string move.
 *
 * Written as top-level assembly. Arguments arrive in registers: `to` in RDI,
 * `from` in RSI and `count` in RDX; the count is moved into RCX, which is
 * the repeat counter for the REP prefix. No return value is produced.
 *
 * NOTE(review): in AT&T syntax the doubleword string move is usually spelled
 * `movsl`; confirm the assembler in use accepts the operand-less `movsd`
 * spelling as the string instruction.
 * NOTE(review): no CLD is issued, so this relies on the direction flag being
 * clear on entry - confirm against the platform ABI.
 */
asm(".type  m_b_32, @function       ");
asm("m_b_32:push   %rbp                  "); /* standard frame setup */
asm("       mov    %rsp,%rbp             ");
asm("       mov    %rdx,%rcx"); /* repeat count: third argument -> RCX */
asm("       rep    movsd        "); /* copy RCX doublewords [RSI] -> [RDI] */
asm("       leaveq  "); /* tear the frame down */
asm("       retq    ");
/*
 * m_b_16(to, from, count) - copy `count` words (2 bytes each) from `from`
 * to `to` using REP MOVSW.
 *
 * Written as top-level assembly. Arguments arrive in registers: `to` in RDI,
 * `from` in RSI and `count` in RDX; the count is moved into RCX, which is
 * the repeat counter for the REP prefix. No return value is produced.
 *
 * NOTE(review): no CLD is issued, so this relies on the direction flag being
 * clear on entry - confirm against the platform ABI.
 */
asm(".type  m_b_16, @function       ");
asm("m_b_16:push   %rbp                  "); /* standard frame setup */
asm("       mov    %rsp,%rbp             ");
asm("       mov    %rdx,%rcx"); /* repeat count: third argument -> RCX */
asm("       rep    movsw        "); /* copy RCX words [RSI] -> [RDI] */
asm("       leaveq  "); /* tear the frame down */
asm("       retq    ");
/*
 * m_b_8(to, from, count) - copy `count` bytes from `from` to `to` using
 * REP MOVSB.
 *
 * Written as top-level assembly. Arguments arrive in registers: `to` in RDI,
 * `from` in RSI and `count` in RDX; the count is moved into RCX, which is
 * the repeat counter for the REP prefix. No return value is produced.
 *
 * NOTE(review): no CLD is issued, so this relies on the direction flag being
 * clear on entry - confirm against the platform ABI.
 */
asm(".type  m_b_8, @function       ");
asm("m_b_8:   push   %rbp                  "); /* standard frame setup */
asm("       mov    %rsp,%rbp             ");
asm("       mov    %rdx,%rcx"); /* repeat count: third argument -> RCX */
asm("       rep    movsb        "); /* copy RCX bytes [RSI] -> [RDI] */
asm("       leaveq  "); /* tear the frame down */
asm("       retq    ");
/*
 * Prototypes for the copy routines that this file defines in top-level
 * assembly. The count is declared as size_t because the assembly consumes
 * the whole 64-bit RDX register; passing a plain int through an implicit
 * declaration would leave the upper half unspecified.
 */
void m_b_8(void *to, const void *from, size_t count);
void m_b_16(void *to, const void *from, size_t count);
void m_b_32(void *to, const void *from, size_t count);
void m_b_64(void *to, const void *from, size_t count);

/* Signature shared by all four copy routines. */
typedef void (*copy_fn)(void *to, const void *from, size_t count);

/* Number of timed repetitions per copy routine. */
enum { BENCH_RUNS = 10 };

/*
 * Time `copy(to, from, count)` BENCH_RUNS times and print the elapsed
 * time-stamp-counter ticks of each run, prefixed with `label`.
 */
static void bench_copy(const char *label, copy_fn copy,
                       void *to, const void *from, size_t count)
{
    int i;

    for (i = 0; i < BENCH_RUNS; ++i) {
        unsigned long long start = rdtsc();
        copy(to, from, count);
        unsigned long long end = rdtsc();

        /* The tick delta is unsigned long long, so it needs %llu. */
        printf("%s%llu\n", label, end - start);
    }
}

/*
 * Benchmark driver: copies a 32 MiB buffer with each of the byte, word,
 * doubleword and quadword REP MOVS routines and prints the cycle counts.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffers cannot be allocated or
 * the destination does not match the source after the copies.
 */
int main(void)
{
    const size_t bytes_cnt = (size_t)32 * 1024 * 1024; /* 32M bytes */
    const size_t word_cnt = bytes_cnt / 2;             /* 16M words */
    const size_t dword_cnt = word_cnt / 2;             /* 8M double words */
    const size_t qdword_cnt = dword_cnt / 2;           /* 4M quad words */
    char *from = malloc(bytes_cnt);
    char *to = malloc(bytes_cnt);
    int rc = 0;

    if (from == NULL || to == NULL) {
        fprintf(stderr, "failed to allocate two %zu-byte buffers\n", bytes_cnt);
        free(from);
        free(to);
        return EXIT_FAILURE;
    }

    /* Touch every page up front so page faults are not part of the timing. */
    memset(from, 0x2, bytes_cnt);
    memset(to, 0x0, bytes_cnt);

    bench_copy("m_b_8 use time:\t\t", m_b_8, to, from, bytes_cnt);
    bench_copy("m_b_16 use time:\t", m_b_16, to, from, word_cnt);
    bench_copy("m_b_32 use time:\t", m_b_32, to, from, dword_cnt);
    bench_copy("m_b_64 use time:\t", m_b_64, to, from, qdword_cnt);

    /* Make sure the copy is ok: the destination must now equal the source. */
    if (memcmp(to, from, bytes_cnt) != 0) {
        fprintf(stderr, "copy verification failed\n");
        rc = EXIT_FAILURE;
    }

    free(to);
    free(from);
    return rc;
}


(Note: the following is the Intel manual's description of RDTSCP. Plain RDTSC, which the code above uses, loads EDX:EAX in the same way but does not write ECX.) Loads the current value of the processor's time-stamp counter (a 64-bit MSR) into the EDX:EAX registers and also loads the IA32_TSC_AUX MSR (address C000_0103H) into the ECX register. The EDX register is loaded with the high-order 32 bits of the IA32_TSC MSR; the EAX register is loaded with the low-order 32 bits of the IA32_TSC MSR; and the ECX register is loaded with the low-order 32 bits of the IA32_TSC_AUX MSR. On processors that support the Intel 64 architecture, the high-order 32 bits of each of RAX, RDX, and RCX are cleared.


/*
 * rdtsc() - read the processor's time-stamp counter (x86-64 version).
 *
 * The instruction delivers the low 32 bits in EAX ("=a") and the high
 * 32 bits in EDX ("=d"); the two halves are combined into one 64-bit value.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned hi, lo;

  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
  return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

64 位程序的參數傳遞方法和 32 位有很大不同,是通過寄存器來傳遞的。調用 m_b_64(to, from, qdword_cnt) 時,在函數執行前,首先將 to 指向的地址存放在 RDI 寄存器中,然後將 from 指向的地址存放在 RSI 寄存器中,qdword_cnt 則存放在 RDX 寄存器中。而 REP MOVSQ 的含義是:Move RCX quadwords from [RSI] to [RDI]。因此需要先將存放在 RDX 寄存器中的 qdword_cnt 複製到 RCX 寄存器內。

關於 REP MOVSQ/MOVSW/MOVSD/MOVSB 指令的詳細情況,可查詢 Intel 指令手冊。

