5.555a3.57 3.57 0 003.08-1.778l3.078 1.78A7.135 7.135 0 0112 19.11zm7.11-6.715h-.79v.79h-.79v-.79h-.79v-.79h.79v-.79h.79v.79h.79zm2.962 0h-.79v.79h-.79v-.79h-.79v-.79h.79v-.79h.79v.79h.79z\" fill=\"currentColor\" />","children":["$","$Laa",null,{"children":["$","code",null,{"children":[["$","span",null,{"className":"line","children":["$","span",null,{"style":{"--shiki-light":"#6A737D","--shiki-dark":"#6A737D"},"children":"// <<<1, N>>> means: 1 block, N threads"}]}],"\n",["$","span",null,{"className":"line","children":[["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"cuda_vector_add_simple"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":"<<<"}],["$","span",null,{"style":{"--shiki-light":"#005CC5","--shiki-dark":"#79B8FF"},"children":"1"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":", N"}],["$","span",null,{"style":{"--shiki-light":"#D73A49","--shiki-dark":"#F97583"},"children":">>>"}],["$","span",null,{"style":{"--shiki-light":"#24292E","--shiki-dark":"#E1E4E8"},"children":"(d_OUT, d_A, d_B, N);"}]]}]]}]}]}] 76:["$","p",null,{"children":["Here ",["$","code",null,{"children":"<<<1, N>>>"}]," is CUDA launch syntax. ",["$","code",null,{"children":"1"}]," is blocks, ",["$","code",null,{"children":"N"}]," is threads per block. A block can have at most 1024 threads, so this only handles small N. Next, we scale with multiple blocks."]}] 77:["$","h2",null,{"className":"flex scroll-m-28 flex-row items-center gap-2","id":"hardware-hierarchy-grid-block-thread","children":[["$","a",null,{"data-card":"","href":"#hardware-hierarchy-grid-block-thread","className":"peer","children":"Hardware Hierarchy: Grid, Block, Thread"}],["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide size-3.5 shrink-0 text-fd-muted-foreground opacity-0 transition-opacity peer-hover:opacity-100","aria-label":"Link to section","children":[[["$","path","1cjeqo",{"d":"M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"}],["$","path","19qd67",{"d":"M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"}]],"$undefined"]}]]}] 78:["$","p",null,{"children":"If you have 1 million elements, do you launch 1 million threads? Can the GPU handle it?"}] 79:["$","p",null,{"children":["GPU is not a flat thread pool. It is organized as ",["$","strong",null,{"children":"Grid → Block → Thread"}],"."]}] 7a:["$","h3",null,{"className":"flex scroll-m-28 flex-row items-center gap-2","id":"hierarchy-mapping","children":[["$","a",null,{"data-card":"","href":"#hierarchy-mapping","className":"peer","children":"Hierarchy Mapping"}],["$","svg",null,{"ref":"$undefined","xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide size-3.5 shrink-0 text-fd-muted-foreground opacity-0 transition-opacity peer-hover:opacity-100","aria-label":"Link to section","children":[[["$","path","1cjeqo",{"d":"M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"}],["$","path","19qd67",{"d":"M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"}]],"$undefined"]}]]}] 7b:["$","div",null,{"style":{"display":"flex","justifyContent":"center"},"children":["$","img",null,{"src":"/images/docs/systems/gpu-programming/grid-block-thread.png","alt":"Hierarchy of Grid, Block, Thread","style":{"maxWidth":"500px","width":"100%"}}]}] 7c:["$","ol",null,{"children":["\n",["$","li",null,{"children":[["$","strong",null,{"children":"Grid"}],": all blocks launched by a kernel call."]}],"\n",["$","li",null,{"children":[["$","strong",null,{"children":"Block"}],": scheduled onto an ",["$","strong",null,{"children":"SM (Streaming Multiprocessor)"}],". Threads in a block can cooperate via shared memory."]}],"\n",["$","li",null,{"children"

GPU Architecture Basics

Log in to continue reading

GPU Architecture Basics

Log in to continue reading

Table of Contents