MPI Note[7]: Parallel Sorting (1)
Starting to write about algorithms gives me the vague illusion of being back at the informatics olympiad (OI) twenty years ago. Recently someone even asked me to run an OI class; how could I charge money for what is basically technical charity, even if the charity now extends to everyone's next generation...
Serial Sorting
Sorting is probably the first topic most people learn when studying algorithms, and all of those algorithms are serial sorts that run on a single machine.
Bubble Sort and Quicksort
These need no further explanation...
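One small detail: the PSRS code later in this post calls the C library qsort with a comparator named cmpfunc that is never shown in this excerpt. A minimal sketch, assuming it is a plain ascending comparator for int, could be:

#include <stdlib.h>

/* Comparator for qsort over ints; the name cmpfunc is taken from the
   later snippets, which use it without defining it here. */
int cmpfunc(const void *a, const void *b)
{
    int x = *(const int *)a;
    int y = *(const int *)b;
    return (x > y) - (x < y);   /* avoids the overflow risk of x - y */
}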
Merge Sort
Merge sort is a simple divide-and-conquer algorithm: split the data into subsequences, sort each subsequence, then merge the sorted subsequences to obtain a fully sorted sequence.
//merge A[head:mid] and A[mid+1:tail], then write back to A[head:tail]
void merge(int arr[], int head, int mid, int tail)
{
    int i, j, k;
    int l_size = mid - head + 1;
    int r_size = tail - mid;
    //create temp arrays
    int *L = (int *)malloc((l_size) * sizeof(int));
    int *R = (int *)malloc((r_size) * sizeof(int));
    for (i = 0; i < l_size; i++)
        L[i] = arr[head + i];
    for (j = 0; j < r_size; j++)
        R[j] = arr[mid + 1 + j];
    i = 0;
    j = 0;
    k = head;
    //scan L and R
    while (i < l_size && j < r_size)
    {
        if (L[i] < R[j])
        {
            arr[k] = L[i];
            i++;
        }
        else
        {
            arr[k] = R[j];
            j++;
        }
        k++;
    }
    //copy remaining
    while (i < l_size)
    {
        arr[k] = L[i];
        i++;
        k++;
    }
    while (j < r_size)
    {
        arr[k] = R[j];
        j++;
        k++;
    }
    free(L);
    free(R);
}

void merge_sort(int arr[], int head, int tail)
{
    if (head < tail)
    {
        int mid = (head + tail) / 2;
        merge_sort(arr, head, mid);
        merge_sort(arr, mid + 1, tail);
        merge(arr, head, mid, tail);
    }
}
PSRS Parallel Sorting
Parallel Sorting by Regular Sampling (PSRS). My pinyin IME auto-completes "PSRS" to 破碎人生 ("a shattered life"), which is rather amusing. The algorithm runs in four phases (a small worked sketch follows the list):
- Each process sorts its local partition with a serial sorting algorithm.
- Each process takes regular samples of its sorted local data as key values; the samples are collected, sorted, and regularly sampled again to obtain the global pivots.
- Each process partitions its local data with the new pivots and exchanges the partitions with the other processes.
- Each process merges the data it received with a merge sort.
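To make the sampling phase concrete, here is a small standalone (non-MPI) sketch that mimics phases 1 and 2 for three imaginary processes, using the same sample and pivot indices as the MPI code below; the data values are made up purely for illustration:

#include <stdio.h>
#include <stdlib.h>

static int cmp_int(const void *a, const void *b)
{
    int x = *(const int *)a, y = *(const int *)b;
    return (x > y) - (x < y);
}

int main(void)
{
    enum { P = 3, N = 9 };            /* 3 "processes", 9 elements each */
    /* already locally sorted, as after phase 1 */
    int local[P][N] = {
        { 2,  5,  9, 13, 17, 21, 30, 41, 55},
        { 1,  4,  8, 12, 16, 22, 28, 36, 48},
        { 3,  6, 10, 14, 19, 25, 33, 44, 60}
    };
    int samples[P * (P - 1)];
    int s = 0;
    /* phase 2: each "process" takes P-1 regular samples at index k*N/P */
    for (int p = 0; p < P; p++)
        for (int k = 1; k < P; k++)
            samples[s++] = local[p][k * N / P];
    /* gather and sort the samples, then pick P-1 pivots from them */
    qsort(samples, P * (P - 1), sizeof(int), cmp_int);
    for (int k = 1; k < P; k++)
        printf("pivot %d = %d\n", k, samples[k * (P - 1)]);
    return 0;
}

With these inputs the gathered samples are {12, 13, 14, 28, 30, 33} and the chosen pivots are 14 and 30, so every process will later split its local data into three buckets: values up to 14, values up to 30, and the rest.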
Distributing the data
//scatter source data to the other nodes
int num_ele_per_node = MAXN / world_size;
int mod = MAXN % world_size;
int *sub_array = (int *)malloc(sizeof(int) * (num_ele_per_node + mod));
int s_cnt[world_size];
int s_offset[world_size];
for (int i = 0; i < world_size; i++)
{
    s_cnt[i] = num_ele_per_node;
    s_offset[i] = i * num_ele_per_node;
}
//the remaining MAXN % world_size elements go to the last group
s_cnt[world_size - 1] = num_ele_per_node + mod;
MPI_Scatterv(n4, s_cnt, s_offset, MPI_INT, sub_array, num_ele_per_node + mod, MPI_INT, 0, MPI_COMM_WORLD);
Step 1: local quicksort
int group_len = s_cnt[world_rank];
qsort(sub_array, group_len, sizeof(int), cmpfunc);
int *samples = (int *)malloc(world_size * sizeof(int));
Selecting the regular samples
//step.1b select samples on each node
int sq_world_size = world_size * world_size;
for (int k = 1; k < world_size; k++)
    samples[k - 1] = sub_array[k * num_ele_per_node / world_size];
//step.1c collect all samples on rank 0
int *global_samples = (int *)malloc(sq_world_size * sizeof(int));
MPI_Gather(samples, world_size - 1, MPI_INT, global_samples, world_size - 1, MPI_INT, 0, MPI_COMM_WORLD);
Selecting the global pivots and broadcasting them to the other nodes
//step.1d select pivots on rank 0
int pivots[world_size];
if (world_rank == 0)
{
    qsort(global_samples, (world_size - 1) * world_size, sizeof(int), cmpfunc);
    for (int k = 1; k < world_size; k++)
        pivots[k - 1] = global_samples[k * (world_size - 1)];
}
//step.1e bcast pivots
MPI_Bcast(pivots, world_size - 1, MPI_INT, 0, MPI_COMM_WORLD);
Local partitioning
//step.2a split the local array into world_size segments
int index = 0;
int p_cnt[world_size];
for (int i = 0; i < world_size; i++)
    p_cnt[i] = 0;
//sentinel (INT_MAX) so the last bucket catches everything above the real pivots
pivots[world_size - 1] = 2147483647;
for (int i = 0; i < group_len; i++)
{
    if (sub_array[i] <= pivots[index])
        p_cnt[index]++;
    else
    {
        //re-test the same element against the next pivot
        i--;
        index++;
    }
}
Preparing the local buffers and the amount of data to receive
MPI_Alltoall is a classic trick here: every process passes in its per-destination counts p_cnt, and the call effectively transposes the counts matrix, so that r_cnt[i] on process j ends up holding process i's p_cnt[j], i.e. how many elements process j will receive from process i.
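As a standalone illustration (separate from the article's code), the toy below has every rank fill a send buffer with values tagged by destination and then calls MPI_Alltoall with one int per peer; afterwards slot i of the receive buffer holds what rank i sent to this rank, which is exactly how p_cnt turns into r_cnt above:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int *send = malloc(size * sizeof(int));
    int *recv = malloc(size * sizeof(int));
    /* rank r sends the value 100*r + d to destination d */
    for (int d = 0; d < size; d++)
        send[d] = 100 * rank + d;

    /* one int per peer: recv[i] ends up holding what rank i put in send[rank] */
    MPI_Alltoall(send, 1, MPI_INT, recv, 1, MPI_INT, MPI_COMM_WORLD);

    for (int i = 0; i < size; i++)
        printf("rank %d got %d from rank %d\n", rank, recv[i], i);

    free(send);
    free(recv);
    MPI_Finalize();
    return 0;
}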
//step.2b exchange each segment length from p_cnt to r_cnt
int r_cnt[world_size];
MPI_Alltoall(p_cnt, 1, MPI_INT, r_cnt, 1, MPI_INT, MPI_COMM_WORLD);
int r_offset[world_size];
r_offset[0] = 0;
s_offset[0] = 0;
for (int i = 1; i < world_size; i++)
{
    s_offset[i] = s_offset[i - 1] + p_cnt[i - 1];
    r_offset[i] = r_offset[i - 1] + r_cnt[i - 1];
}
int total_cnt = 0;
for (int i = 0; i < world_size; i++)
    total_cnt += r_cnt[i];
//exchange the actual segments, then sort the received data locally
int *sub_array2 = (int *)malloc(total_cnt * sizeof(int));
MPI_Alltoallv(sub_array, p_cnt, s_offset, MPI_INT, sub_array2, r_cnt, r_offset, MPI_INT, MPI_COMM_WORLD);
merge_sort(sub_array2, 0, total_cnt - 1);
Finally, gather all the data back on rank 0
MPI_Gather(&total_cnt, 1, MPI_INT, r_cnt, 1, MPI_INT, 0, MPI_COMM_WORLD);
r_offset[0] = 0;
for (int i = 1; i < world_size; i++)
    r_offset[i] = r_offset[i - 1] + r_cnt[i - 1];
MPI_Gatherv(sub_array2, total_cnt, MPI_INT, n5, r_cnt, r_offset, MPI_INT, 0, MPI_COMM_WORLD);
Parallel test results: once more than 2 processes are used, the MPI version is already faster than qsort.
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 1 ./qs
mpi time: 30741448ns
quick_sort: 13068417ns
merge_sort: 19825194ns
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 2 ./qs
mpi time: 21733170ns
quick_sort: 13721354ns
merge_sort: 19732058ns
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 4 ./qs
mpi time: 13931970ns
quick_sort: 16625924ns
merge_sort: 20261184ns
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 8 ./qs
mpi time: 8299586ns
quick_sort: 16050739ns
merge_sort: 21257035ns
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 16 ./qs
mpi time: 4717631ns
quick_sort: 14314263ns
merge_sort: 21430261ns
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 24 ./qs
mpi time: 3984703ns
quick_sort: 22020000ns
merge_sort: 22696114ns
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 32 ./qs
mpi time: 3203447ns
quick_sort: 16230325ns
merge_sort: 21357250ns
zartbot@zartbotWS:~/Desktop/mpi$ mpiexec -np 48 ./qs
mpi time: 3565351ns
quick_sort: 15976286ns
merge_sort: 21657906ns
Test code
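The full test program is not included in this excerpt. As a rough sketch of the driver around the snippets above: the names n4, n5, MAXN, cmpfunc and merge_sort come from the code earlier in the post, while the value of MAXN, the helper arrays n1/n2, the random fill, and the clock_gettime-based timing are my assumptions rather than the author's exact code.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAXN 1000000                 /* problem size: an assumption */

/* defined earlier in this post */
void merge_sort(int arr[], int head, int tail);
int cmpfunc(const void *a, const void *b);

static long long elapsed_ns(struct timespec a, struct timespec b)
{
    return (b.tv_sec - a.tv_sec) * 1000000000LL + (b.tv_nsec - a.tv_nsec);
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int world_rank, world_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int *n4 = NULL, *n5 = NULL;      /* PSRS source and result arrays on rank 0 */
    int *n1 = NULL, *n2 = NULL;      /* copies for the serial baselines on rank 0 */
    if (world_rank == 0)
    {
        n4 = malloc(MAXN * sizeof(int));
        n5 = malloc(MAXN * sizeof(int));
        n1 = malloc(MAXN * sizeof(int));
        n2 = malloc(MAXN * sizeof(int));
        srand(42);
        for (int i = 0; i < MAXN; i++)
            n1[i] = n2[i] = n4[i] = rand();
    }

    struct timespec t0, t1;
    MPI_Barrier(MPI_COMM_WORLD);
    clock_gettime(CLOCK_MONOTONIC, &t0);

    /* ... the PSRS steps shown above go here: scatter, local qsort,
       sampling, pivot bcast, all-to-all exchange, gather into n5 ... */

    MPI_Barrier(MPI_COMM_WORLD);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    if (world_rank == 0)
    {
        printf("mpi time: %lldns\n", elapsed_ns(t0, t1));

        clock_gettime(CLOCK_MONOTONIC, &t0);
        qsort(n1, MAXN, sizeof(int), cmpfunc);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        printf("quick_sort: %lldns\n", elapsed_ns(t0, t1));

        clock_gettime(CLOCK_MONOTONIC, &t0);
        merge_sort(n2, 0, MAXN - 1);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        printf("merge_sort: %lldns\n", elapsed_ns(t0, t1));
    }

    MPI_Finalize();
    return 0;
}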