美文网首页
2020-07-23

2020-07-23

作者: Yao_0 | 来源:发表于2020-07-23 18:20 被阅读0次

使用 `!nsys profile --stats=true ./saxpy` 对程序进行性能分析,查看各 CUDA 内核与 API 调用的耗时。

优化前

#include <stdio.h>

#define N 2048 * 2048 // Number of elements in each vector

/*
 * Optimize this already-accelerated codebase. Work iteratively,
 * and use nsys to support your work.
 *
 * Aim to profile `saxpy` (without modifying `N`) running under
 * 20us.
 *
 * Some bugs have been placed in this codebase for your edification.
 */

// "Before optimization" saxpy kernel: each thread handles at most one element,
// writing c[tid] = 2 * a[tid] + b[tid] when tid < N.
// NOTE(review): this listing is the exercise's starting point; the comment
// above it says bugs were planted on purpose, so the code is kept verbatim.
__global__ void saxpy(int * a, int * b, int * c)
{
    // NOTE(review): threadIdx.x is multiplied in here, whereas the optimized
    // listing later on this page adds it. With the product, every tid is a
    // multiple of blockDim.x (and is 0 for all of block 0 and for every thread
    // with threadIdx.x == 0), so most elements of c are never written.
    int tid = blockIdx.x * blockDim.x * threadIdx.x;

    // Guard against indices at or past the end of the vectors.
    if ( tid < N )
        c[tid] = 2 * a[tid] + b[tid];
}

// "Before optimization" host driver: allocates three managed vectors of N
// elements, fills them on the host, launches saxpy once, prints the first and
// last five entries of c, and frees the vectors.
// NOTE(review): kept verbatim as the exercise's deliberately flawed starting
// point; the notes below mark where it differs from the optimized listing.
int main()
{
    // NOTE(review): declared as float *, but the saxpy kernel takes int * and
    // the printf calls below use %d. The optimized listing declares int *.
    float *a, *b, *c;

    int size = N * sizeof (int); // The total number of bytes per vector

    // NOTE(review): none of the CUDA return codes in this function are checked.
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // Initialize memory
    for( int i = 0; i < N; ++i )
    {
        a[i] = 2;
        b[i] = 1;
        c[i] = 0;
    }

    int threads_per_block = 128;
    // N is an exact multiple of 128, so the "+ 1" launches one extra block;
    // the kernel's tid < N guard is what keeps that block from writing.
    int number_of_blocks = (N / threads_per_block) + 1;

    saxpy <<< number_of_blocks, threads_per_block >>> ( a, b, c );

    // NOTE(review): there is no cudaDeviceSynchronize() between the launch and
    // the host reads of c below; the optimized listing adds one.

    // Print out the first and last 5 values of c for a quality check
    for( int i = 0; i < 5; ++i )
        printf("c[%d] = %d, ", i, c[i]);
    printf ("\n");
    for( int i = N-5; i < N; ++i )
        printf("c[%d] = %d, ", i, c[i]);
    printf ("\n");

    cudaFree( a ); cudaFree( b ); cudaFree( c );
}


优化后

#include <stdio.h>

#define N 2048 * 2048 // Number of elements in each vector

// Computes c[i] = 2 * a[i] + b[i] for every i in [0, N).
// Works for any 1D launch configuration: each thread starts at its own global
// index and then advances by the total number of launched threads until it
// runs past N, so the grid does not have to match the vector length.
__global__ void saxpy(int * a, int * b, int * c)
{
  // Number of threads in the whole grid; this is the hop between the
  // successive elements handled by one thread.
  const int totalThreads = gridDim.x * blockDim.x;

  // First element owned by this thread.
  int idx = threadIdx.x + blockDim.x * blockIdx.x;

  while (idx < N)
  {
    c[idx] = b[idx] + 2 * a[idx];
    idx += totalThreads;
  }
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the call for the message. Returns true on cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;

  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Requests a prefetch of `bytes` bytes starting at `ptr` to device `deviceId`.
// The prefetch only influences performance, so a failure is reported and then
// cleared instead of aborting the run.
static void prefetchOrWarn(const void *ptr, size_t bytes, int deviceId, const char *what)
{
  if (!cudaOk(cudaMemPrefetchAsync(ptr, bytes, deviceId), what))
    (void) cudaGetLastError(); // reset the recorded error so the post-launch check stays meaningful
}

// Host driver for the optimized saxpy: allocates three managed vectors of N
// ints, initializes them on the host, prefetches them to the current device,
// runs the kernel, and prints the first and last five results.
// Returns 0 on success and 1 if any required CUDA call fails.
int main()
{
  // Start as NULL so the cleanup at the bottom is safe on every path.
  int *a = NULL, *b = NULL, *c = NULL;

  const size_t size = N * sizeof (int); // The total number of bytes per vector

  int deviceId = 0;
  int numberOfSMs = 0;
  bool succeeded = false;

  // Single-pass block: `break` jumps straight to the cleanup code below.
  do
  {
    if (!cudaOk(cudaGetDevice(&deviceId), "cudaGetDevice"))
      break;
    if (!cudaOk(cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId),
                "cudaDeviceGetAttribute"))
      break;

    // Allocate memory
    if (!cudaOk(cudaMallocManaged(&a, size), "cudaMallocManaged(a)"))
      break;
    if (!cudaOk(cudaMallocManaged(&b, size), "cudaMallocManaged(b)"))
      break;
    if (!cudaOk(cudaMallocManaged(&c, size), "cudaMallocManaged(c)"))
      break;

    // Initialize memory
    for( int i = 0; i < N; ++i )
    {
      a[i] = 2;
      b[i] = 1;
      c[i] = 0;
    }

    // Move the vectors to the GPU ahead of the launch so the kernel does not
    // have to fault the pages in on first touch.
    prefetchOrWarn(a, size, deviceId, "cudaMemPrefetchAsync(a)");
    prefetchOrWarn(b, size, deviceId, "cudaMemPrefetchAsync(b)");
    prefetchOrWarn(c, size, deviceId, "cudaMemPrefetchAsync(c)");

    int threads_per_block = 256;
    // Grid size is tied to the SM count rather than to N; the kernel's strided
    // loop covers whatever elements are left over.
    int number_of_blocks = numberOfSMs * 32;

    saxpy <<<number_of_blocks, threads_per_block>>>( a, b, c );

    // A launch does not return a status itself: query it explicitly.
    if (!cudaOk(cudaGetLastError(), "saxpy launch"))
      break;

    // Wait for the GPU to finish; errors raised while the kernel ran are
    // reported here.
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
      break;

    // Print out the first and last 5 values of c for a quality check
    for( int i = 0; i < 5; ++i )
      printf("c[%d] = %d, ", i, c[i]);
    printf ("\n");
    for( int i = N-5; i < N; ++i )
      printf("c[%d] = %d, ", i, c[i]);
    printf ("\n");

    succeeded = true;
  } while (false);

  // Free all our allocated memory (cudaFree accepts NULL, so this is safe even
  // when an allocation above never happened).
  cudaFree( a ); cudaFree( b ); cudaFree( c );

  return succeeded ? 0 : 1;
}

相关文章

  • 2020-07-23 生产环境部署

    2020-07-23 生产环境部署 首先就是cookie问题

  • python--批处理--多进程

    2020-07-23 更新,增加线程自定义 初始

  • 心理学如何影响用户决策:吃瓜子效应

    心理学如何影响用户决策:吃瓜子效应 原木关注作者 2020-07-23 1 评论7702 浏览17 收藏10 分钟...

  • 2020-07-23

    2020-07-23 日精进打卡 姓名:彭新 宁波蓝天白云供应链管理有限公司 【日精进打卡第868天】 【知学习】...

  • 20200723宽基指数估值表

    估值日期:2020-07-23 ​ 颜色说明: 1、红色:说明当前指数估值比较高,不建议童鞋们去购买红色背景的相关...

  • 那些“月入十万”的自媒体人,你们还好吗?

    还在坚持写字的科技不谓侠2020-07-23 1 就像上知乎的朋友,总给人一种人均985本硕、人均年薪百万的错觉。...

  • 2020-07-23

    2020-07-23 我的目标是儿子今年的高考成绩超出预期,考取心仪的南京大学物理专业!我愿意成为那个榜样,带动更...

  • 喝酒聊天

    今晚喝酒了,白的,虽然也不是我喝… 2020-07-23 晚饭后葛弟弟问我想吃点儿啥,看那样子是要去商店整点吃的 ...

  • 经历过后收获的经验

    2020-07-23星期四 前语:不经历风雨怎么见彩虹,这首歌的歌词今天我才深深地理解它的真正含义。人生的每...

  • 今日复盘(Day179)-承诺

    2020-07-23 一、今日收获 按计划推进了备课内容,并陪伴了小朋友,此外完成了画笔记的作业,第一课的复习及作...

网友评论

      本文标题:2020-07-23

      本文链接:https://www.haomeiwen.com/subject/tkwhlktx.html