OpenMP API 笔记

内置函数

线程

  • 获取多处理器个数 omp_get_num_procs()

  • 返回当前区域线程个数 omp_get_num_threads()

  • 获取线程ID omp_get_thread_num()

  • 设置线程个数 omp_set_num_threads()

  • 初始化锁 omp_init_lock()

  • 上锁 omp_set_lock()

  • 解锁 omp_unset_lock()

  • 销毁锁 omp_destroy_lock()


并行化

在代码块中运行并行代码

1
2
3
4
5
/* Open a parallel region: the block below is run by the threads of the team. */
#pragma omp parallel
{
    /* Query the number of processors and report it. */
    const int processor_count = omp_get_num_procs();
    printf("%d procs \n", processor_count);
}
parallel for

自动对for循环进行并行化,注意特殊情况(即循环的各次迭代之间存在依赖关系时,不能直接并行化)

1
2
3
4
5
6
    /*
     * Distribute the iterations of the loop over the threads of the team.
     * OpenMP only accepts loops in canonical form, so the counter has to be
     * initialised in the for statement itself; an empty init expression is
     * rejected by OpenMP compilers.
     * n is expected to be defined by the surrounding code.
     */
    int i = 0;
    #pragma omp parallel for
    for (i = 0; i < n; i++) {
        int thread = omp_get_thread_num();
        printf("index %d thread %d\n", i, thread);
    }

parallel if

通过if判断,如果满足条件就并行执行,若不满足条件,就单线程执行

1
2
3
4
5
/*
 * Conditional parallel region: a team of 2 threads is requested only when
 * n > 10; otherwise the block is executed by a single thread.
 * n is expected to be defined by the surrounding code.
 */
#pragma omp parallel if (n > 10) num_threads(2)
{
    printf("n = %d thread %d\n", n, omp_get_thread_num());
}

sections && section

创建多个并行块,每个section同时执行。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Demonstrates the sections construct.
 *
 * Each section below is an independent unit of work; inside the parallel
 * region the sections may be executed concurrently by different threads,
 * so the order of the three printed lines is not guaranteed.
 */
void ompSection() {
    #pragma omp parallel sections
    {
        /* first unit of work */
        #pragma omp section
        {
            printf("1\n");
        }

        /* second unit of work */
        #pragma omp section
        {
            printf("2\n");
        }

        /* third unit of work */
        #pragma omp section
        {
            printf("3\n");
        }
    }
}

Single

指定某一块代码只由一个线程执行;除非使用nowait,否则其他线程必须在single块的末尾等待该线程执行完成,相当于在single块末尾设置了一个隐含的barrier

1
2
3
4
5
6
7
8
9
10
/* Start a team of 4 threads. */
#pragma omp parallel num_threads(4)
{
    const int tid = omp_get_thread_num();
    printf("do thread %d\n", tid);

    /*
     * Only one thread of the team runs this block; without a nowait clause
     * the remaining threads wait at the end of the block.
     */
    #pragma omp single
    {
        printf("single thread\n");
    }

    printf("do thread %d after single\n", tid);
}

输出

1
2
3
4
5
6
7
8
9
do thread 1
do thread 0
do thread 3
do thread 2
single thread
do thread 2 after single
do thread 0 after single
do thread 1 after single
do thread 3 after single

具体在哪个线程上运行该代码块是未定的。

Master

指定程序块在主线程中运行,其他线程可以继续运行,与single有区别,master中无barrier

1
2
3
4
5
6
7
8
9
10
 #pragma omp parallel num_threads(4)
{
    /* Every thread of the 4-thread team reports its id. */
    const int tid = omp_get_thread_num();
    printf("threads id %d\n", tid);

    /*
     * Only the master thread runs this block. There is no barrier attached
     * to master, so the other threads carry on without waiting.
     */
    #pragma omp master
    {
        printf("main thread %d\n", tid);
    }

    printf("threads out main thread %d\n", tid);
}

变量

private

将一个变量或者多个变量声明为线程私有,其他线程无法访问该变量,区域外同名的变量也不会在并行区域内起作用,并且private变量也不会影响区域外共享变量。

1
2
3
4
5
6
7
8
/*
 * Demonstrates the private clause.
 *
 * k is declared private for the parallel loop, so every thread works on its
 * own copy while iterating from 10 to 19. The k declared before the loop is
 * a different object from those copies; its value is printed once the loop
 * has finished.
 */
void ompPrivateVariable() {
    int k = 100;
    #pragma omp parallel for private(k)
    for (k = 10; k < 20; k++) {
        printf("k = %d from thread %d\n", k, omp_get_thread_num());
    }
    /* Report the outer k ("outsise" in the original message was a typo). */
    printf("k = %d from outside\n", k);
}

firstprivate

可以从并行区域外拿到某个变量的副本,但无法修改并行区域外的变量。

lastprivate

退出并行区域时,将最后一次迭代(或最后一个section)中该变量的值赋给外部的共享变量

1
2
3
4
5
6
7
8
9
10
/*
 * Demonstrates firstprivate combined with lastprivate on the same variable.
 *
 * firstprivate gives every thread its own copy of k that starts at the outer
 * value (100). Each thread adds the indices of the iterations it executes to
 * its copy. lastprivate then copies the value held by the thread that ran
 * the final iteration back into the outer k, which is printed at the end.
 */
void ompLastPrivateVar() {
    int k = 100;
    int i = 0;
    #pragma omp parallel for firstprivate(k), lastprivate(k)
    for (i = 0; i < 10; ++i) {
        k = k + i;
        printf("update k = %d from thread %d\n", k, omp_get_thread_num());
    }
    printf("last k = %d\n", k);
}

输出

1
2
3
4
5
6
7
8
9
10
11
update k = 100 from thread 0
update k = 106 from thread 2
update k = 103 from thread 1
update k = 108 from thread 3
update k = 101 from thread 0
update k = 113 from thread 2
update k = 107 from thread 1
update k = 117 from thread 3
update k = 103 from thread 0
update k = 112 from thread 1
last k = 117

语句

reduction

1
2
3
4
 #pragma omp parallel for reduction(+: sum)
for(i = 0; i < 10; i ++) {
sum += i;
}

default

schedule

任务调度 schedule (type, size) size可选

type:

  • dynamic 动态调度,动态分配迭代到每个线程,size可以指定每次给每个线程分配size个迭代
  • guided 每个线程开始会分配到较大的迭代块,之后分配的迭代块大小会逐渐减小,按指数下降到size大小,若没有size参数则默认下降到1。
  • runtime 调度方式在运行时由系统环境变量 OMP_SCHEDULE 决定
  • static 静态方式调度,n次循环,t个线程,则平均分配给每个线程约n/t次循环

线程间同步

barrier

等待所有线程都到达barrier处之后,各线程才能继续向下运行。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
int i = 0, sum = 0;
#pragma omp parallel num_threads(4)
{
    /*
     * Work-shared loop: each thread sums its share of 0..9 and the partial
     * results are combined into the shared sum when the loop completes.
     */
    #pragma omp for reduction(+: sum)
    for (i = 0; i < 10; i++) {
        sum += i;
        printf("thread %d sum\n", omp_get_thread_num());
    }
    printf("thread %d Barrier\n", omp_get_thread_num());

    /* No thread proceeds until every thread of the team has arrived here. */
    #pragma omp barrier

    /*
     * sum is shared, and every thread doubles it once. The read-modify-write
     * must be serialised: without the critical section the four threads
     * race on sum and the result is undefined.
     */
    #pragma omp critical
    {
        sum = sum * 2;
        printf("sum is %d\n", sum);
        printf("thread %d end\n", omp_get_thread_num());
    }
}

输出

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
thread 1 sum
thread 3 sum
thread 2 sum
thread 0 sum
thread 1 sum
thread 3 sum
thread 2 sum
thread 0 sum
thread 1 sum
thread 0 sum
thread 0 Barrier
thread 1 Barrier
thread 3 Barrier
thread 2 Barrier
sum is 90
thread 0 end
sum is 180
thread 1 end
sum is 360
thread 3 end
sum is 720
thread 2 end

nowait

消除一些隐含的barrier。

1
2
3
4
5
6
7
8
9
10
11
12
13
int i = 0;
#pragma omp parallel num_threads(4)
{
    /* nowait removes the implicit barrier at the end of this loop. */
    #pragma omp for nowait
    for (i = 0; i < 4; i++) {
        printf("thread %d i = %d\n", omp_get_thread_num(), i);
    }
    /* A thread gets here as soon as its own iterations are finished. */
    printf("no wait\n");

    /*
     * A work-shared loop must be in canonical form, so the counter is
     * initialised explicitly; an empty init expression is rejected by
     * OpenMP compilers. The loop covers i = 0..6.
     */
    #pragma omp for nowait
    for (i = 0; i < 7; i++) {
        printf("s thread %d i = %d\n", omp_get_thread_num(), i);
    }
}

输出,并不会等前一个for循环执行完才去执行下一个循环

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
thread 0 i = 0
no wait
s thread 0 i = 0
s thread 0 i = 1
thread 1 i = 1
no wait
s thread 1 i = 2
s thread 1 i = 3
thread 2 i = 2
no wait
s thread 2 i = 4
s thread 2 i = 5
thread 3 i = 3
no wait
s thread 3 i = 6

critical

临界区变量的保护,同一时间内只能有一个线程访问。

atomic

原子操作

1
2
3
4
5
6
int i = 0, nVar = 0;
#pragma omp parallel for shared(nVar)
for(i = 0; i < 10; i ++) {
#pragma omp atomic
nVar ++;
}