Write an MPI program that implements a shell-sort like parallel algorithm that sorts an array of
integers. The initial array is partitioned into equal size sub-arrays which are distributed to the
processes (one per process). The parallel algorithm is described in Section 9.3.2 of the textbook
(pages 398-399). It consists of two phases: (i) the processes that are far away from each other
compare and split their assigned sub-arrays (using a hypercube pattern of communication); (ii)
perform odd-even transposition iterations as long as the sub-arrays are changing. The following
is a high-level pseudocode description of the algorithm:

Algorithm 1 Shell-sort like parallel algorithm
 1: {Phase I: Hypercube Compare-Exchange.}
 2: for i = (d - 1) to 0 do
 3:     if (i-th bit of rank) = 1 then
 4:         compare-split-hi(i);
 5:     else
 6:         compare-split-low(i);
 7: {Phase II: Odd-even Transposition Iterations.}
 8: done = FALSE
 9: while done = FALSE do
10:     {Perform odd-even iterations}
11:     if received items need to be passed further then
12:         broadcast FALSE to all processes;
13:     else
14:         broadcast TRUE to all processes;
15:     if all processes broadcast TRUE then
16:         done = TRUE

In the algorithm description:
d - the number of bits required to represent the IDs of the processes (d=3 for 8 processes).
compare-split-hi(i) - performs a compare-and-split operation so that processor i keeps half of the merged sub-arrays containing the greatest integers.
compare-split-low(i) - performs a compare-and-split operation so that processor i keeps half of the merged sub-arrays containing the smallest integers.
Test the program on 8 processes.
The input array should consist of 128 random integers from the range [0, 128]. The array is
generated at process 0 which is responsible for partitioning the array and sending the sub-arrays
to the other processors. Process 0 will keep its corresponding sub-array, so that it can participate
in the algorithm. At the end of the computation, process 0 collects all the sub-arrays and displays
the sorted array. Compare the execution times for your parallel shell-sort implementation with
those of the standard odd-even transposition sort (given in the textbook, section 6.3.5, pages
248-250) and the serial quicksort. For this performance comparison you should use 8 processors and
randomly generated integer arrays of sizes: $2^{16}$, $2^{20}$, $2^{24}$, and $2^{30}$. The random integers should
be in the range [0, 128]. Produce a plot showing the execution times of the three algorithms.
Produce another plot to show the speedup obtained by the parallel shell-sort with respect to the
sequential quicksort. Write a short (max. 2 pages) report describing the implementation and the
obtained results. The report should be typeset using Latex and the plots should be generated
using gnuplot.
Solution
Answer:
Assembly Language Code (x86-64, Intel syntax, for a serial shell sort; the listing contains no MPI calls):
# Reserves one zero-filled byte; no label is attached to it in this listing.
# NOTE(review): presumably the storage for std::__ioinit (referenced by the
# static-initialisation code later in this listing) whose label was lost when
# the listing was pasted - confirm against the original compiler output.
.zero 1
# Separator string (a single space) that print_ar writes after every element.
.LC0:
.string " "
# print_ar(int*, int): writes every element of an int array to std::cout,
# each followed by the " " string at .LC0, then ends the line via std::endl.
#   rdi - pointer to the first element
#   esi - number of elements to print
# Stack slots: [rbp-24] = array pointer, [rbp-28] = element count,
#              [rbp-4]  = loop index (starts at 0).
# NOTE(review): the call targets below read "std::basic_ostream >"; the
# template argument lists appear to have been stripped from the symbol names
# when the listing was extracted - confirm against the compiler output.
print_ar(int*, int):
# Prologue: set up the frame and reserve 32 bytes for locals.
push rbp
mov rbp, rsp
sub rsp, 32
# Spill both arguments to the stack and zero the loop index.
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov DWORD PTR [rbp-4], 0
# Loop head: leave the loop once index >= count (signed compare).
.L3:
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-28]
jge .L2
# Load element [index]: sign-extend the index, scale by 4 bytes per int,
# add the base pointer and read the 32-bit value.
mov eax, DWORD PTR [rbp-4]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
# Stream the element to std::cout through operator<<(int).
mov esi, eax
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >::operator<<(int)
# Stream the separator to the stream returned in rax by the previous call.
mov esi, OFFSET FLAT:.LC0
mov rdi, rax
call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*)
# index += 1, then re-test the loop condition.
add DWORD PTR [rbp-4], 1
jmp .L3
# Loop exit: apply the std::endl manipulator to std::cout.
.L2:
mov esi, OFFSET FLAT:std::basic_ostream >& std::endl >(std::basic_ostream >&)
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >::operator<<(std::basic_ostream >& (*)(std::basic_ostream >&))
# Epilogue: tear down the frame and return (no return value is set up).
nop
leave
ret
# shell_sort(int*, int): sorts an int array in place into ascending order
# using gapped insertion passes; the gap starts at n / 2 and is halved after
# every pass until it reaches 0.
#   rdi - pointer to the first element
#   esi - number of elements (n)
# Stack slots: [rbp-24] = array pointer, [rbp-28] = n,
#              [rbp-8]  = gap, [rbp-12] = i (outer index),
#              [rbp-16] = temp (value being inserted), [rbp-4] = j (inner index).
# No stack space is reserved with "sub rsp"; the locals are addressed below rbp.
# NOTE(review): presumably this relies on the System V red zone because the
# function makes no calls - confirm for the target ABI.
shell_sort(int*, int):
push rbp
mov rbp, rsp
# Spill both arguments to the stack.
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
# gap = n / 2 as a signed division: add the sign bit (n >> 31, logical)
# before the arithmetic shift right by one.
mov eax, DWORD PTR [rbp-28]
mov edx, eax
shr edx, 31
add eax, edx
sar eax
mov DWORD PTR [rbp-8], eax
# Outer loop over gap sizes: finish when gap <= 0.
.L10:
cmp DWORD PTR [rbp-8], 0
jle .L11
# i = gap
mov eax, DWORD PTR [rbp-8]
mov DWORD PTR [rbp-12], eax
# Middle loop: for i in [gap, n); leave to .L6 when i >= n.
.L9:
mov eax, DWORD PTR [rbp-12]
cmp eax, DWORD PTR [rbp-28]
jge .L6
# temp = array[i]
mov eax, DWORD PTR [rbp-12]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-16], eax
# j = i
mov eax, DWORD PTR [rbp-12]
mov DWORD PTR [rbp-4], eax
# Inner loop: continue while j >= gap and array[j - gap] > temp.
.L8:
# Stop shifting when j < gap (no element gap positions to the left).
mov eax, DWORD PTR [rbp-4]
cmp eax, DWORD PTR [rbp-8]
jl .L7
# Load array[j - gap] and stop shifting when it is <= temp.
mov eax, DWORD PTR [rbp-4]
sub eax, DWORD PTR [rbp-8]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
cmp eax, DWORD PTR [rbp-16]
jle .L7
# array[j] = array[j - gap]: rdx = &array[j], rax = &array[j - gap].
mov eax, DWORD PTR [rbp-4]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
mov eax, DWORD PTR [rbp-4]
sub eax, DWORD PTR [rbp-8]
cdqe
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rdx], eax
# j -= gap, then re-test the inner loop conditions.
mov eax, DWORD PTR [rbp-8]
sub DWORD PTR [rbp-4], eax
jmp .L8
# Inner loop exit: array[j] = temp, dropping the saved value into its slot.
.L7:
mov eax, DWORD PTR [rbp-4]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
mov eax, DWORD PTR [rbp-16]
mov DWORD PTR [rdx], eax
# i += 1, then continue the middle loop.
add DWORD PTR [rbp-12], 1
jmp .L9
# End of one pass: gap = gap / 2 (same signed-halving sequence as above).
.L6:
mov eax, DWORD PTR [rbp-8]
mov edx, eax
shr edx, 31
add eax, edx
sar eax
mov DWORD PTR [rbp-8], eax
jmp .L10
# All passes done: restore rbp and return (no return value is set up).
.L11:
nop
pop rbp
ret
# Label text that main prints before the unsorted array.
.LC1:
.string "Intial Array : "
# Label text that main prints before the sorted array.
.LC2:
.string "Sorted Array : "
# main: builds a fixed 10-element int array on the stack, prints it, sorts it
# with shell_sort, prints it again, and returns 0.
# The array occupies [rbp-48] .. [rbp-12], 4 bytes per element.
main:
# Prologue: set up the frame and reserve 48 bytes for locals.
push rbp
mov rbp, rsp
sub rsp, 48
# Initialise the array elements 0..9 with
# {1, 4, 16, 30, 29, 18, 100, 2, 43, 1}.
mov DWORD PTR [rbp-48], 1
mov DWORD PTR [rbp-44], 4
mov DWORD PTR [rbp-40], 16
mov DWORD PTR [rbp-36], 30
mov DWORD PTR [rbp-32], 29
mov DWORD PTR [rbp-28], 18
mov DWORD PTR [rbp-24], 100
mov DWORD PTR [rbp-20], 2
mov DWORD PTR [rbp-16], 43
mov DWORD PTR [rbp-12], 1
# Write the .LC1 label string to std::cout.
mov esi, OFFSET FLAT:.LC1
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*)
# print_ar(array, 10): show the array before sorting.
lea rax, [rbp-48]
mov esi, 10
mov rdi, rax
call print_ar(int*, int)
# shell_sort(array, 10): sort the array in place.
lea rax, [rbp-48]
mov esi, 10
mov rdi, rax
call shell_sort(int*, int)
# Write the .LC2 label string to std::cout.
mov esi, OFFSET FLAT:.LC2
mov edi, OFFSET FLAT:std::cout
call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*)
# print_ar(array, 10): show the array after sorting.
lea rax, [rbp-48]
mov esi, 10
mov rdi, rax
call print_ar(int*, int)
# Return 0.
mov eax, 0
leave
ret
# __static_initialization_and_destruction_0(int, int): when called with
# (1, 65535), constructs std::__ioinit and registers its destructor with
# __cxa_atexit; for any other argument pair it does nothing.
#   edi - first int argument, stored at [rbp-4]
#   esi - second int argument, stored at [rbp-8]
__static_initialization_and_destruction_0(int, int):
# Prologue: set up the frame and reserve 16 bytes for locals.
push rbp
mov rbp, rsp
sub rsp, 16
# Spill both arguments to the stack.
mov DWORD PTR [rbp-4], edi
mov DWORD PTR [rbp-8], esi
# Skip everything unless the first argument is 1 and the second is 65535.
cmp DWORD PTR [rbp-4], 1
jne .L16
cmp DWORD PTR [rbp-8], 65535
jne .L16
# Run std::ios_base::Init::Init() with the address of std::__ioinit in edi.
mov edi, OFFSET FLAT:std::__ioinit
call std::ios_base::Init::Init()
# __cxa_atexit(~Init, &std::__ioinit, &__dso_handle): register the
# destructor for std::__ioinit.
mov edx, OFFSET FLAT:__dso_handle
mov esi, OFFSET FLAT:std::__ioinit
mov edi, OFFSET FLAT:std::ios_base::Init::~Init()
call __cxa_atexit
# Common exit: tear down the frame and return.
.L16:
nop
leave
ret
# Unlabelled stub that calls __static_initialization_and_destruction_0(1, 65535).
# NOTE(review): no label precedes this code in the listing, and it directly
# follows a "ret", so as written nothing can reach it. Presumably this is the
# compiler-generated global-constructor entry point whose label was lost when
# the listing was pasted - confirm against the original compiler output.
push rbp
mov rbp, rsp
# Arguments: esi = 65535 (second), edi = 1 (first).
mov esi, 65535
mov edi, 1
call __static_initialization_and_destruction_0(int, int)
pop rbp
ret

More Related Content

PDF
Implement an MPI program to perform matrix-matrix multiplication AB .pdf
PDF
Write a program in MIPS that reads in a Roman form number from th.pdf
PDF
WCTF 2018 binja Editorial
PPTX
Advanced procedures in assembly language Full chapter ppt
PDF
Qemu JIT Code Generator and System Emulation
PPTX
Hadoop and HBase experiences in perf log project
PPTX
Operating System Engineering
PPTX
[ASM]Lab6
Implement an MPI program to perform matrix-matrix multiplication AB .pdf
Write a program in MIPS that reads in a Roman form number from th.pdf
WCTF 2018 binja Editorial
Advanced procedures in assembly language Full chapter ppt
Qemu JIT Code Generator and System Emulation
Hadoop and HBase experiences in perf log project
Operating System Engineering
[ASM]Lab6

Similar to Write an MPI program that implements a shell-sort like parallel algo.pdf (20)

PDF
Please convert the following C code to assembly Y86int j,k; .....pdf
PDF
Spark workshop
PPT
PPTX
Basic ASM by @binaryheadache
PDF
Real Time Big Data Management
PDF
Please convert the following C code to assembly Y86int i,j; ......pdf
PDF
Scale17x buffer overflows
PDF
Sergi Álvarez & Roi Martín - Radare2 Preview [RootedCON 2010]
PDF
Parallel and Distributed computing: why parallellismpdf
PPTX
Evgeniy Muralev, Mark Vince, Working with the compiler, not against it
PDF
Stata Programming Cheat Sheet
PPT
Basic concept of MATLAB.ppt
PPT
Introduction to Assembly Language
PDF
Vectorization in ATLAS
PPTX
C++ and Assembly: Debugging and Reverse Engineering
PDF
Stata cheatsheet programming
PDF
Redis Lua Scripts
PPTX
Python basics
PPTX
Python basics
PPTX
Python basics
Please convert the following C code to assembly Y86int j,k; .....pdf
Spark workshop
Basic ASM by @binaryheadache
Real Time Big Data Management
Please convert the following C code to assembly Y86int i,j; ......pdf
Scale17x buffer overflows
Sergi Álvarez & Roi Martín - Radare2 Preview [RootedCON 2010]
Parallel and Distributed computing: why parallellismpdf
Evgeniy Muralev, Mark Vince, Working with the compiler, not against it
Stata Programming Cheat Sheet
Basic concept of MATLAB.ppt
Introduction to Assembly Language
Vectorization in ATLAS
C++ and Assembly: Debugging and Reverse Engineering
Stata cheatsheet programming
Redis Lua Scripts
Python basics
Python basics
Python basics
Ad

More from bharatchawla141 (20)

PDF
Explain how HPV leads to cervical cancer and oral cancer. Include.pdf
PDF
Consider L = {a^nb^2nc^P p 0}. Prove L is not a context-free langu.pdf
PDF
Differentiate between analog and digital signals.SolutionIn ana.pdf
PDF
describe two properties of iPS and ES cells, including the meaning i.pdf
PDF
Coffman Company issued 1,000,000 10 year 10 percent bonds January 1,.pdf
PDF
A JAR file contains an image named images Fall.png. Java class na.pdf
PDF
A 71-year-old male patient comes to the hospital after having been p.pdf
PDF
You are carrying out experiments in cell fusion by fusing together ce.pdf
PDF
Which of the following are roles of proteins Pick ALL that apply..pdf
PDF
When are a persons fellowship, autonomy, and competence face threa.pdf
PDF
What is the function in homworts Where can one find homwortsS.pdf
PDF
This is the assignmentOBJECTIVESAfter finishing this lab, stude.pdf
PDF
Two paragraph opinion about the film editing of the movie LIFE OF .pdf
PDF
True or False The aDDM fixes the SV reference problem by assuming t.pdf
PDF
The number of visits to public libraries increased from 1.4 billion i.pdf
PDF
The first table contains data of the Student entry. Attributes are Si.pdf
PDF
Save for Later 12) Viruses, adware and spyware are referred to collec.pdf
PDF
Sustainability Sustainability is an important consideration for any.pdf
PDF
Research health data stewardship and in your post show why it is imp.pdf
PDF
Read carefully. Im not sure if the point class is correct but postin.pdf
Explain how HPV leads to cervical cancer and oral cancer. Include.pdf
Consider L = {a^nb^2nc^P p 0}. Prove L is not a context-free langu.pdf
Differentiate between analog and digital signals.SolutionIn ana.pdf
describe two properties of iPS and ES cells, including the meaning i.pdf
Coffman Company issued 1,000,000 10 year 10 percent bonds January 1,.pdf
A JAR file contains an image named images Fall.png. Java class na.pdf
A 71-year-old male patient comes to the hospital after having been p.pdf
You are carrying out experiments in cell fusion by fusing together ce.pdf
Which of the following are roles of proteins Pick ALL that apply..pdf
When are a persons fellowship, autonomy, and competence face threa.pdf
What is the function in homworts Where can one find homwortsS.pdf
This is the assignmentOBJECTIVESAfter finishing this lab, stude.pdf
Two paragraph opinion about the film editing of the movie LIFE OF .pdf
True or False The aDDM fixes the SV reference problem by assuming t.pdf
The number of visits to public libraries increased from 1.4 billion i.pdf
The first table contains data of the Student entry. Attributes are Si.pdf
Save for Later 12) Viruses, adware and spyware are referred to collec.pdf
Sustainability Sustainability is an important consideration for any.pdf
Research health data stewardship and in your post show why it is imp.pdf
Read carefully. Im not sure if the point class is correct but postin.pdf
Ad

Recently uploaded (20)

PDF
Environmental Education MCQ BD2EE - Share Source.pdf
PDF
The TKT Course. Modules 1, 2, 3.for self study
PDF
AI-driven educational solutions for real-life interventions in the Philippine...
PDF
fundamentals-of-heat-and-mass-transfer-6th-edition_incropera.pdf
PDF
LIFE & LIVING TRILOGY- PART (1) WHO ARE WE.pdf
PDF
International_Financial_Reporting_Standa.pdf
PDF
English Textual Question & Ans (12th Class).pdf
PDF
Literature_Review_methods_ BRACU_MKT426 course material
PDF
1.3 FINAL REVISED K-10 PE and Health CG 2023 Grades 4-10 (1).pdf
PDF
Skin Care and Cosmetic Ingredients Dictionary ( PDFDrive ).pdf
PDF
M.Tech in Aerospace Engineering | BIT Mesra
PDF
Fun with Grammar (Communicative Activities for the Azar Grammar Series)
PPTX
Macbeth play - analysis .pptx english lit
PPTX
What’s under the hood: Parsing standardized learning content for AI
PDF
1.Salivary gland disease.pdf 3.Bleeding and Clotting Disorders.pdf important
PDF
Journal of Dental Science - UDMY (2022).pdf
PDF
Farming Based Livelihood Systems English Notes
PDF
semiconductor packaging in vlsi design fab
PDF
CISA (Certified Information Systems Auditor) Domain-Wise Summary.pdf
PPTX
DRUGS USED FOR HORMONAL DISORDER, SUPPLIMENTATION, CONTRACEPTION, & MEDICAL T...
Environmental Education MCQ BD2EE - Share Source.pdf
The TKT Course. Modules 1, 2, 3.for self study
AI-driven educational solutions for real-life interventions in the Philippine...
fundamentals-of-heat-and-mass-transfer-6th-edition_incropera.pdf
LIFE & LIVING TRILOGY- PART (1) WHO ARE WE.pdf
International_Financial_Reporting_Standa.pdf
English Textual Question & Ans (12th Class).pdf
Literature_Review_methods_ BRACU_MKT426 course material
1.3 FINAL REVISED K-10 PE and Health CG 2023 Grades 4-10 (1).pdf
Skin Care and Cosmetic Ingredients Dictionary ( PDFDrive ).pdf
M.Tech in Aerospace Engineering | BIT Mesra
Fun with Grammar (Communicative Activities for the Azar Grammar Series)
Macbeth play - analysis .pptx english lit
What’s under the hood: Parsing standardized learning content for AI
1.Salivary gland disease.pdf 3.Bleeding and Clotting Disorders.pdf important
Journal of Dental Science - UDMY (2022).pdf
Farming Based Livelihood Systems English Notes
semiconductor packaging in vlsi design fab
CISA (Certified Information Systems Auditor) Domain-Wise Summary.pdf
DRUGS USED FOR HORMONAL DISORDER, SUPPLIMENTATION, CONTRACEPTION, & MEDICAL T...

Write an MPI program that implements a shell-sort like parallel algo.pdf

  • 1. Write an MPI program that implements a shell-sort like parallel algorithm that sorts an array of integers. The initial array is partitioned into equal size sub-arrays which are distributed to the processes (one per process). The parallel algorithm is described in Section 9.3.2 of the textbook (pages 398-399). It consists of two phases: (i) the processes that are far away from each other compare and split their assigned sub-arrays (using a hypercube pattern of communication); (ii) perform odd-even transposition iterations as long as the sub-arrays are changing. The following is a high-level pseudocode description of the algorithm: Algorithm 1 Shell-sort like parallel algorithm 1: {Phase I: Hypercube Compare-Exchange.} 2: for i = (d - 1) to 0 do 3: if (i-th bit of rank) = 1 then 4: compare-split-hi(i); 5: else 6: compare-split-low(i); 7: {Phase II: Odd-even Transposition Iterations.} 8: done = FALSE 9: while done = FALSE do 10: {Perform odd-even iterations} 11: if received items need to be passed further then 12: broadcast FALSE to all processes; 13: else 14: broadcast TRUE to all processes; 15: if all processes broadcast TRUE then 16: done = TRUE In the algorithm description: d - the number of bits required to represent the IDs of the processes (d=3 for 8 processes). compare-split-hi(i) - performs a compare-and-split operation so that processor i keeps half of the merged sub-arrays containing the greatest integers. compare-split-low(i) - performs a compare-and-split operation so that processor i keeps half of the merged sub-arrays containing the smallest integers. Test the program on 8 processes. The input array should consist of 128 random integers from the range [0, 128]. The array is generated at process 0 which is responsible for partitioning the array and sending the sub-arrays to the other processors. Process 0 will keep its corresponding sub-array, so that it can participate in the algorithm. 
At the end of the computation, process 0 collects all the sub-arrays and displays the sorted array. Compare the execution times for your parallel shell-sort implementation with those of the standard odd-even transposition sort (given in the textbook, section 6.3.5, pages 248-250) and the serial quicksort. For this performance comparison you should use 8 processors and randomly generated integer arrays of sizes: $2^{16}$, $2^{20}$, $2^{24}$, and $2^{30}$. The random integers should be in the range [0, 128]. Produce a plot showing the execution times of the three algorithms. Produce another plot to show the speedup obtained by the parallel shell-sort with respect to the sequential quicksort. Write a short (max. 2 pages) report describing the implementation and the obtained results. The report should be typeset using Latex and the plots should be generated using gnuplot. Solution Answer: Assembly Language Code :
  • 2. .zero 1 .LC0: .string " " print_ar(int*, int): push rbp mov rbp, rsp sub rsp, 32 mov QWORD PTR [rbp-24], rdi mov DWORD PTR [rbp-28], esi mov DWORD PTR [rbp-4], 0 .L3: mov eax, DWORD PTR [rbp-4] cmp eax, DWORD PTR [rbp-28] jge .L2 mov eax, DWORD PTR [rbp-4] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rax, rdx mov eax, DWORD PTR [rax] mov esi, eax mov edi, OFFSET FLAT:std::cout call std::basic_ostream >::operator<<(int) mov esi, OFFSET FLAT:.LC0 mov rdi, rax call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*) add DWORD PTR [rbp-4], 1 jmp .L3 .L2: mov esi, OFFSET FLAT:std::basic_ostream >& std::endl >(std::basic_ostream >&) mov edi, OFFSET FLAT:std::cout call std::basic_ostream >::operator<<(std::basic_ostream >& (*)(std::basic_ostream >&)) nop leave ret shell_sort(int*, int):
  • 3. push rbp mov rbp, rsp mov QWORD PTR [rbp-24], rdi mov DWORD PTR [rbp-28], esi mov eax, DWORD PTR [rbp-28] mov edx, eax shr edx, 31 add eax, edx sar eax mov DWORD PTR [rbp-8], eax .L10: cmp DWORD PTR [rbp-8], 0 jle .L11 mov eax, DWORD PTR [rbp-8] mov DWORD PTR [rbp-12], eax .L9: mov eax, DWORD PTR [rbp-12] cmp eax, DWORD PTR [rbp-28] jge .L6 mov eax, DWORD PTR [rbp-12] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rax, rdx mov eax, DWORD PTR [rax] mov DWORD PTR [rbp-16], eax mov eax, DWORD PTR [rbp-12] mov DWORD PTR [rbp-4], eax .L8: mov eax, DWORD PTR [rbp-4] cmp eax, DWORD PTR [rbp-8] jl .L7 mov eax, DWORD PTR [rbp-4] sub eax, DWORD PTR [rbp-8] cdqe lea rdx, [0+rax*4]
  • 4. mov rax, QWORD PTR [rbp-24] add rax, rdx mov eax, DWORD PTR [rax] cmp eax, DWORD PTR [rbp-16] jle .L7 mov eax, DWORD PTR [rbp-4] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rdx, rax mov eax, DWORD PTR [rbp-4] sub eax, DWORD PTR [rbp-8] cdqe lea rcx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rax, rcx mov eax, DWORD PTR [rax] mov DWORD PTR [rdx], eax mov eax, DWORD PTR [rbp-8] sub DWORD PTR [rbp-4], eax jmp .L8 .L7: mov eax, DWORD PTR [rbp-4] cdqe lea rdx, [0+rax*4] mov rax, QWORD PTR [rbp-24] add rdx, rax mov eax, DWORD PTR [rbp-16] mov DWORD PTR [rdx], eax add DWORD PTR [rbp-12], 1 jmp .L9 .L6: mov eax, DWORD PTR [rbp-8] mov edx, eax shr edx, 31 add eax, edx
  • 5. sar eax mov DWORD PTR [rbp-8], eax jmp .L10 .L11: nop pop rbp ret .LC1: .string "Intial Array : " .LC2: .string "Sorted Array : " main: push rbp mov rbp, rsp sub rsp, 48 mov DWORD PTR [rbp-48], 1 mov DWORD PTR [rbp-44], 4 mov DWORD PTR [rbp-40], 16 mov DWORD PTR [rbp-36], 30 mov DWORD PTR [rbp-32], 29 mov DWORD PTR [rbp-28], 18 mov DWORD PTR [rbp-24], 100 mov DWORD PTR [rbp-20], 2 mov DWORD PTR [rbp-16], 43 mov DWORD PTR [rbp-12], 1 mov esi, OFFSET FLAT:.LC1 mov edi, OFFSET FLAT:std::cout call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*) lea rax, [rbp-48] mov esi, 10 mov rdi, rax call print_ar(int*, int) lea rax, [rbp-48] mov esi, 10 mov rdi, rax call shell_sort(int*, int)
  • 6. mov esi, OFFSET FLAT:.LC2 mov edi, OFFSET FLAT:std::cout call std::basic_ostream >& std::operator<< >(std::basic_ostream >&, char const*) lea rax, [rbp-48] mov esi, 10 mov rdi, rax call print_ar(int*, int) mov eax, 0 leave ret __static_initialization_and_destruction_0(int, int): push rbp mov rbp, rsp sub rsp, 16 mov DWORD PTR [rbp-4], edi mov DWORD PTR [rbp-8], esi cmp DWORD PTR [rbp-4], 1 jne .L16 cmp DWORD PTR [rbp-8], 65535 jne .L16 mov edi, OFFSET FLAT:std::__ioinit call std::ios_base::Init::Init() mov edx, OFFSET FLAT:__dso_handle mov esi, OFFSET FLAT:std::__ioinit mov edi, OFFSET FLAT:std::ios_base::Init::~Init() call __cxa_atexit .L16: nop leave ret push rbp mov rbp, rsp mov esi, 65535 mov edi, 1 call __static_initialization_and_destruction_0(int, int) pop rbp
  • 7. ret