An Introduction To C++ Coroutines Through A Thread Scheduling Demonstration
Code: https://github.com/dian-lun-lin/cpp_coroutine_examples
Bio: https://dian-lun-lin.github.io/
Agenda
• Learn what a coroutine is
Why Coroutine
[Figure: two everyday tasks on a timeline – 1. something heating on the stove (turn off the stove when it is done), 2. take a shower (enjoy your hot water). There is an overlap!]
Why Coroutine
• Coroutines are very useful if you have another computing resource (besides the stove!):
GPU, TPU, async I/O, …
Without coroutine

void cpu_work() {
  cpu_matmul(matA, matB, ...);
}

void gpu_work() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  gpu_matmul<<<8, 256, 0, stream>>>(matA, matB, ...);
  cudaStreamSynchronize(stream);   // blocks the CPU thread until the GPU finishes
  cudaStreamDestroy(stream);
}

// cpu_work and gpu_work are independent
// assume we only have one CPU thread
int main() {
  cpu_work();
  gpu_work();

  // alternatively
  gpu_work();
  cpu_work();
}
How to Define a Coroutine
Use co_await, co_yield, and/or co_return inside the body.
Return a coroutine object (Coro) specifying a promise.

void cpu_work() {
  cpu_matmul(matA, matB, ...);
}

Coro gpu_work() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  gpu_matmul<<<8, 256, 0, stream>>>(matA, matB);
  while(cudaStreamQuery(stream) != cudaSuccess) {
    co_await std::suspend_always{};   // give control back to the caller while the GPU is busy
  }
  cudaStreamDestroy(stream);
}

// cpu_work and gpu_work are independent of each other
// assume we only have one CPU thread
int main() {
  auto coro = gpu_work();
  cpu_work();

  while(!coro.done()) { coro.resume(); }
}
How to Define a Coroutine
struct promise_type {
  std::suspend_always initial_suspend() noexcept { return {}; }
  std::suspend_always final_suspend() noexcept { return {}; }

  Coro get_return_object() {
    return std::coroutine_handle<promise_type>::from_promise(*this);
  }
  void return_void() {}
  void unhandled_exception() {}
};
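For the main() above to compile, Coro also has to expose done() and resume(). Below is a minimal, self-contained sketch of such a wrapper (my illustration; the talk's Task type, shown later for the schedulers, follows the same pattern):

#include <coroutine>

struct Coro {
  struct promise_type {
    std::suspend_always initial_suspend() noexcept { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }
    Coro get_return_object() {
      return std::coroutine_handle<promise_type>::from_promise(*this);
    }
    void return_void() {}
    void unhandled_exception() {}
  };

  // The handle returned by get_return_object() converts into a Coro here.
  Coro(std::coroutine_handle<promise_type> h): handle{h} {}
  // final_suspend() is suspend_always, so the frame must be destroyed explicitly.
  ~Coro() { if(handle) { handle.destroy(); } }

  bool done() const { return handle.done(); }
  void resume() { handle.resume(); }

  std::coroutine_handle<promise_type> handle;
};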
How to Define a Promise

Promise

struct Coro {

  struct promise_type {
    std::suspend_always initial_suspend() noexcept { return {}; }   // the suspension point at the beginning of the coroutine
    std::suspend_always final_suspend() noexcept { return {}; }     // the suspension point at the end of the coroutine

    void unhandled_exception() {}                                    // exception handling
  };
};
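What these two functions return determines whether the coroutine starts lazily and whether its frame outlives completion. A minimal sketch (my example, not from the talk) of the eager alternative:

#include <coroutine>
#include <iostream>

struct EagerCoro {
  struct promise_type {
    std::suspend_never initial_suspend() noexcept { return {}; }  // run the body immediately
    std::suspend_never final_suspend() noexcept { return {}; }    // free the frame automatically
    EagerCoro get_return_object() { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
};

EagerCoro eager() {
  std::cout << "runs without any resume()\n";
  co_return;
}

int main() {
  eager();  // prints immediately; with suspend_always it would print nothing until resumed
}

With suspend_never at final_suspend(), the handle must not be used after completion; the suspend_always variant used throughout this talk keeps the frame alive so the driver or scheduler can call done() and destroy().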
Why Do We Need to Define a Promise?

Coro gpu_work() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  gpu_matmul<<<8, 256, 0, stream>>>(matA, matB);
  while(cudaStreamQuery(stream) != cudaSuccess) {
    co_await std::suspend_always{};
  }
  cudaStreamDestroy(stream);
}
Coroutine/Promise – Compiler's View

Coro gpu_work() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  gpu_matmul<<<8, 256, 0, stream>>>(matA, matB);
  while(cudaStreamQuery(stream) != cudaSuccess) {
    co_await std::suspend_always{};
  }
  cudaStreamDestroy(stream);
}

Compiler's view (simplified):

Coro gpu_work() {
  Coro::promise_type p;
  Coro coro_obj = p.get_return_object();

  try {
    co_await p.initial_suspend();
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    gpu_matmul<<<8, 256, 0, stream>>>(matA, matB);
    while(cudaStreamQuery(stream) != cudaSuccess) {
      co_await std::suspend_always{};
    }
    cudaStreamDestroy(stream);
  } catch(...) {
    p.unhandled_exception();
  }
  co_await p.final_suspend();
}
That's Why We Need to Define a Promise

The promise controls the behavior of the coroutine: its initial and final suspension points, the coroutine object handed back to the caller (get_return_object), and what happens on an uncaught exception (unhandled_exception).
Awaitable
• An awaitable controls the behavior of a specific suspension point.

  co_await std::suspend_always{};
How to Define an Awaitable
Compiler’s View (Simplified)
Built-in Awaitable
• await_ready()
• await_suspend()
• await_resume()

Compiler's view:

// compiler transform
auto&& awaiter = std::suspend_always{};
if(!awaiter.await_ready()) {
  awaiter.await_suspend(std::coroutine_handle<>...);
  // <suspend/resume>
}
awaiter.await_resume();
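Beyond the built-in std::suspend_always and std::suspend_never, any type with these three members can be co_awaited. A minimal custom awaitable sketch (my example, in the spirit of the polling tasks in this talk), assuming some flag the coroutine is waiting on:

#include <atomic>
#include <coroutine>

struct WaitForFlag {
  const std::atomic<bool>& flag;

  // true  -> the result is already available; skip suspension entirely
  bool await_ready() const noexcept { return flag.load(); }

  // Called with a handle to the suspended coroutine; a scheduler could stash it
  // here for later resumption. Returning void means "stay suspended".
  void await_suspend(std::coroutine_handle<>) const noexcept {}

  // Its return value becomes the value of the whole co_await expression.
  void await_resume() const noexcept {}
};

// Inside a coroutine:  co_await WaitForFlag{done_flag};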
Variants of await_suspend()
• await_ready()
• await_suspend() – may return void (always suspend), bool (return false to skip the suspension), or another std::coroutine_handle<> to resume next (symmetric transfer)
• await_resume()
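For example, the coroutine_handle-returning variant lets one coroutine hand control directly to another without bouncing through a scheduler loop. A sketch (my example, not used by the talk's schedulers):

#include <coroutine>

struct TransferTo {
  std::coroutine_handle<> next;          // assumed valid; use std::noop_coroutine() for "nothing to run"

  bool await_ready() const noexcept { return false; }
  std::coroutine_handle<> await_suspend(std::coroutine_handle<>) const noexcept {
    return next;                         // 'next' is resumed immediately after this coroutine suspends
  }
  void await_resume() const noexcept {}
};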
Coroutine tasks
Task TaskA(Scheduler& sch) {
  std::cout << "Hello from TaskA\n";
  co_await sch.suspend();
  std::cout << "Executing the TaskA\n";
  co_await sch.suspend();
  std::cout << "TaskA is finished\n";
}

Task TaskB(Scheduler& sch) {
  std::cout << "Hello from TaskB\n";
  co_await sch.suspend();
  std::cout << "Executing the TaskB\n";
  co_await sch.suspend();
  std::cout << "TaskB is finished\n";
}
Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/single-threaded
Coroutine Tasks and Scheduler APIs
Single-threaded scheduler
Definition of coroutine/promise_type
Single-threaded scheduler
struct Task {

  struct promise_type {
    std::suspend_always initial_suspend() noexcept { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }

    Task get_return_object() {
      return std::coroutine_handle<promise_type>::from_promise(*this);
    }
    void return_void() {}
    void unhandled_exception() {}
  };

  Task(std::coroutine_handle<promise_type> handle): handle{handle} {}

  auto get_handle() { return handle; }

  std::coroutine_handle<promise_type> handle;
};

Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/single-threaded
Scheduler Implementation
Single-threaded scheduler

emplace(…) : emplace a coroutine handle (task)
schedule(…) : schedule all emplaced tasks
get_handle() : get a coroutine handle

class Scheduler {

  std::queue<std::coroutine_handle<>> _tasks;

 public:

  void emplace(std::coroutine_handle<> task) {
    _tasks.push(task);
  }

  void schedule() {
    while(!_tasks.empty()) {
      auto task = _tasks.front();
      _tasks.pop();
      task.resume();            // run the task until its next co_await

      if(!task.done()) {
        _tasks.push(task);      // not finished yet: put it back in the queue
      }
      else { task.destroy(); }  // final_suspend() is suspend_always, so destroy explicitly
    }
  }

  auto suspend() {
    return std::suspend_always{};
  }
};

int main() {

  Scheduler sch;

  sch.emplace(TaskA(sch).get_handle());
  sch.emplace(TaskB(sch).get_handle());

  std::cout << "Start scheduling...\n";

  sch.schedule();
}
Results of Using Task Queue
Single-threaded scheduler

Results (excerpt):
  Start scheduling...
  Hello from TaskA

Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/multi-threaded
Definition of coroutine/promise_type
Multi-threaded scheduler
(The Task and promise_type definitions are identical to those of the single-threaded scheduler.)

Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/multi-threaded
Member Variables/Functions in the Scheduler
Multi-threaded scheduler

class Scheduler {

 public:

  Scheduler(size_t num_threads);

  void emplace(std::coroutine_handle<> task);
  auto suspend();
  void schedule();
  void wait();

 private:

  void _enqueue(std::coroutine_handle<> task);
  void _process(std::coroutine_handle<> task);

  std::vector<std::coroutine_handle<>> _tasks;
  std::queue<std::coroutine_handle<>> _pending_tasks;
  std::vector<std::thread> _workers;

  std::mutex _mtx;
  std::condition_variable _cv;
  bool _stop{false};
  std::atomic<size_t> _finished{0};
};

Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/multi-threaded
Member Variables/Functions in the Scheduler
Multi-threaded scheduler

_tasks : store all emplaced tasks
_pending_tasks : store tasks ready to resume()
_workers : store all worker threads
_mtx and _cv : block/unblock threads
_stop : signal threads to return
_finished : count finished tasks

_enqueue() : insert a task for execution
_process() : resume a task

Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/multi-threaded
Multi-threaded Scheduler Using Centralized Queue
Multi-threaded scheduler

Scheduler::Scheduler(size_t num_threads) {
  _workers.reserve(num_threads);

  for(size_t t = 0; t < num_threads; ++t) {
    _workers.emplace_back([this]() {
      while(true) {
        std::coroutine_handle<> task;
        {
          std::unique_lock<std::mutex> lock(_mtx);
          _cv.wait(lock, [this]{
            return _stop || (!_pending_tasks.empty());
          });
          if(_stop) {
            return;
          }

          task = _pending_tasks.front();
          _pending_tasks.pop();
        }                        // release the lock before resuming the task
        if(task) {
          _process(task);
        }
      }
    });
  }
}
The Difference

Multi-threaded scheduler

The only difference from a conventional thread pool is the task type: the workers pop and resume std::coroutine_handle<> tasks instead of invoking std::function<void()> callables.
Definition of _process() and _enqueue()
Multi-threaded scheduler

void Scheduler::_process(std::coroutine_handle<> task) {
  task.resume();

  if(!task.done()) {
    _enqueue(task);
  }
  else {
    task.destroy();
    if(_finished.fetch_add(1) + 1 == _tasks.size()) {
      {
        std::unique_lock<std::mutex> lock(_mtx);
        _stop = true;
      }
      _cv.notify_all();
    }
  }
}

• Resume a task
• If the task is not done, enqueue the task back to _pending_tasks
• If the task is done, destroy it and increase _finished by one
  - Check if all tasks are finished; if so, set _stop and wake up all workers
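The slides for the remaining members are not reproduced here; the definitions below are a plausible minimal sketch consistent with the class declaration, the worker loop, and _process() above. The actual implementations live in the repository (https://github.com/dian-lun-lin/cpp_coroutine_examples/multi-threaded).

void Scheduler::emplace(std::coroutine_handle<> task) {
  _tasks.push_back(task);                       // remember every task so _process() can compare against _tasks.size()
}

void Scheduler::_enqueue(std::coroutine_handle<> task) {
  {
    std::unique_lock<std::mutex> lock(_mtx);
    _pending_tasks.push(task);                  // make the task visible to the workers
  }
  _cv.notify_one();
}

void Scheduler::schedule() {
  for(auto task : _tasks) { _enqueue(task); }   // hand every emplaced task to the worker threads
}

auto Scheduler::suspend() {
  return std::suspend_always{};                 // same awaitable as in the single-threaded scheduler
}

void Scheduler::wait() {
  for(auto& w : _workers) { w.join(); }         // workers return once _stop is set in _process()
}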
Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/cpu-gpu
Definition of Coroutine Tasks and Scheduler API
CPU-GPU scheduler
Coroutine tasks

template <typename T>
__global__
void gpu_work() {
}

Task TaskA(Scheduler& sch) {

  std::cout << "Start TaskA\n";

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  gpu_work<<<8, 256, 0, stream>>>();

  while(cudaStreamQuery(stream) != cudaSuccess) {
    co_await sch.suspend();
  }

  std::cout << "TaskA is finished\n";

  cudaStreamDestroy(stream);
}

Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/cpu-gpu
Definition of Coroutine Tasks and Scheduler API
CPU-GPU scheduler

int main() {

  Scheduler sch;

  sch.emplace(TaskA(sch).get_handle());
  sch.emplace(TaskB(sch).get_handle());

  std::cout << "Start scheduling...\n";

  sch.schedule();
  sch.wait();
}

Code: https://github.com/dian-lun-lin/cpp_coroutine_examples/cpu-gpu
No need for another scheduler implementation! The multi-threaded scheduler is reused as-is; only the tasks change, polling cudaStreamQuery() and co_await-ing sch.suspend() while the GPU is busy.
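The same pattern works for any resource you can poll, not just CUDA streams. A hedged sketch (my example, not part of the talk's code) using a std::future in place of a CUDA stream:

#include <chrono>
#include <future>
#include <iostream>

Task AsyncTask(Scheduler& sch) {
  auto fut = std::async(std::launch::async, []{ return 42; });

  // Poll instead of blocking: suspend and let the scheduler run other tasks
  // while the asynchronous work has not finished yet.
  while(fut.wait_for(std::chrono::seconds(0)) != std::future_status::ready) {
    co_await sch.suspend();
  }

  std::cout << "result = " << fut.get() << '\n';
}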
Definition of Task
• Both cpu_loop() and cuda_loop() will loop for a certain amount of time
• All tasks are independent

Task for the scheduler without coroutines:

void wo_coro_work(
  dim3 dim_grid, dim3 dim_block,
  size_t BLOCK_SIZE, int cpu_ms, int gpu_ms
) {
  cpu_loop(cpu_ms);
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cuda_loop<<<dim_grid, dim_block, 0, stream>>>(gpu_ms);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}

Task for the coroutine scheduler:

cudaCoro::Task work(
  cudaCoro::Scheduler& sch, dim3 dim_grid, dim3 dim_block,
  size_t BLOCK_SIZE, int cpu_ms, int gpu_ms
) {
  cpu_loop(cpu_ms);
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cuda_loop<<<dim_grid, dim_block, 0, stream>>>(gpu_ms);
  while(cudaStreamQuery(stream) != cudaSuccess) {
    co_await sch.suspend();
  }
  cudaStreamDestroy(stream);
}
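cpu_loop() and cuda_loop() are benchmark helpers whose real definitions are in the repository. A rough sketch of what cpu_loop() might look like (my assumption), busy-spinning for the requested number of milliseconds:

#include <chrono>

void cpu_loop(int cpu_ms) {
  const auto deadline = std::chrono::steady_clock::now()
                      + std::chrono::milliseconds(cpu_ms);
  while(std::chrono::steady_clock::now() < deadline) {
    // spin: emulate CPU-bound work
  }
}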
Hardware Platform
• 4 CPU threads
- 3.6 GHz
- 32 GB memory
• Compiler
- CUDA v12.0
- g++ 12.2.1
- -O3 enabled
Experimental Results - Different #Tasks

Take-home messages:
• There is a certain cost to using coroutines!
• More tasks = more overlap
Closing
• We have presented what a coroutine is
Taskflow: https://taskflow.github.io/
Dian-Lun Lin: https://dian-lun-lin.github.io/