Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 61 additions & 26 deletions content/courses/parallel-computing-introduction/codes/mpi_io.f90
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,23 @@ program mpiwrite
integer :: N, M
integer :: i,j
character(len=80) :: arg
integer, allocatable, dimension(:,:) :: loc_u
integer, allocatable, dimension(:,:) :: u, gu
integer :: numargs

integer :: rank, nprocs, nrows, ncols
integer :: lrow, lcol, nrl, ncl
integer :: lrow, lcol
integer :: nrl, ncl, nr_total, nc_total, nghosts
integer, parameter :: root=0
type(MPI_Status) :: mpi_stat
type(MPI_Datatype) :: locarr
type(MPI_Datatype) :: locarr, fullarr
type(MPI_File) :: fh
integer :: amode
integer :: mpi_err, gmpi_err
integer(kind=MPI_OFFSET_KIND) :: disp=0
character(len=24) :: fname
integer :: ndims=2
integer, dimension(2) :: ldims, gdims, start_arr
integer, dimension(2) :: starts,sizes,subsizes
integer, dimension(2) :: gstarts,gsizes,gsubsizes
character(len=36) :: myfile


Expand Down Expand Up @@ -61,66 +64,98 @@ program mpiwrite
!Grid coordinates
lrow=rank/ncols
lcol=mod(rank,ncols)


!Hardcode each local array to be relatively small so we can see
!what we're doing
nrl=4
ncl=4

!Global array size
N=nrl*nrows
M=ncl*ncols

nghosts=2
nr_total=nrl+2*nghosts
nc_total=ncl+2*nghosts

! Set up values
allocate(loc_u(nrl,ncl))
do i=1,nrl
do j=1,ncl
loc_u(i,j)=(rank+1)*(i+j)
allocate(u(0:nr_total-1,0:nc_total-1))

u=-9
do i=nghosts,nrl+nghosts-1
do j=nghosts,ncl+nghosts-1
u(i,j)=rank
enddo
enddo

gdims=[N,M]
ldims=[nrl,ncl]
start_arr=[ncl*lrow,nrl*lcol]
print *, rank, lrow, lcol, start_arr
!array sizes
gsizes=[N,M]
sizes=[nr_total,nc_total]

write(myfile,'(a,i2.2)') trim(fname),rank
open(10,file=myfile)
write(10,*) rank
do i=1,nrl
write(10,*) loc_u(i,:)
write(10,*) u(i,:)
enddo

!Define a subarray for each local array within the global array
call MPI_TYPE_CREATE_SUBARRAY(ndims, gdims, ldims, start_arr, &
starts=[nghosts,nghosts]
subsizes=[nrl,ncl]
!Define a subarray for each local array
!Size includes ghost zones, starts picks out locations
call MPI_TYPE_CREATE_SUBARRAY(ndims, sizes, subsizes, starts, &
MPI_ORDER_FORTRAN, MPI_INTEGER, locarr)
call MPI_TYPE_COMMIT(locarr)

!Create the subarray for the global file view
!Excludes ghost zones
!Remember that subarray starts assume a lower bound of 0, as in C
gsizes=[N,M]
gstarts=[lrow*nrl,lcol*ncl]
gsubsizes=[nrl,ncl]
call MPI_TYPE_CREATE_SUBARRAY(ndims, gsizes, gsubsizes, gstarts, &
MPI_ORDER_FORTRAN, MPI_INTEGER, fullarr)
call MPI_TYPE_COMMIT(fullarr)

amode=ior(MPI_MODE_CREATE, MPI_MODE_WRONLY)
call MPI_FILE_OPEN(MPI_COMM_WORLD,trim(fname),amode,MPI_INFO_NULL,fh,mpi_err)

call MPI_Allreduce(mpi_err, gmpi_err,1,MPI_INTEGER, MPI_LOR, MPI_COMM_WORLD)
call MPI_Allreduce(mpi_err,gmpi_err,1,MPI_INTEGER, MPI_BOR, MPI_COMM_WORLD)

if ( gmpi_err /= MPI_SUCCESS ) then
stop "Unable to open MPI file, terminating"
endif

!Need a header for the sizes, only root should write this
if ( rank==0 ) then
call MPI_File_write(fh, [N, M], 2, MPI_INTEGER, MPI_STATUS_IGNORE)
endif

!Everybody write its section
disp=2*sizeof(N)
!if ( rank==0 ) then
! call MPI_File_write(fh, [N, M], 2, MPI_INTEGER, MPI_STATUS_IGNORE)
!endif

call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER,locarr,"native", MPI_INFO_NULL)
call MPI_FILE_WRITE_ALL(fh, loc_u, size(loc_u), MPI_INTEGER, mpi_stat)
call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER,fullarr,"native", MPI_INFO_NULL)
call MPI_FILE_WRITE_ALL(fh, u, 1, locarr, mpi_stat)

call MPI_FILE_CLOSE(fh)

!Read it back in on the root process only.
!MPI_FILE_OPEN is collective over the communicator it is given, so a call
!made by the root alone must use MPI_COMM_SELF. With MPI_COMM_WORLD the root
!would wait for ranks that never enter this branch.
if (rank==root) then
   allocate(gu(0:N-1,0:M-1))
   print *, 'allocated gu', size(gu)
   amode=MPI_MODE_RDONLY
   print *, 'Opening file'
   call MPI_FILE_OPEN(MPI_COMM_SELF,trim(fname),amode,MPI_INFO_NULL,fh,mpi_err)
   print *, 'Opened file ',trim(fname)
   if ( mpi_err /= MPI_SUCCESS) then
      stop "Unable to open MPI file for reading"
   endif
   print *, "Starting to read"

   !No file view is set on this handle, so this is a plain read of
   !size(gu) integers into gu
   call MPI_FILE_READ(fh, gu, size(gu), MPI_INTEGER, mpi_stat)

   !Release the file handle opened above
   call MPI_FILE_CLOSE(fh)

endif


call MPI_Type_free(locarr)
call MPI_Type_free(fullarr)

call MPI_Finalize()

end program
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
comm=MPI.COMM_WORLD
fh=MPI.File.Open(comm,filename,MPI.MODE_RDONLY)


dims=np.empty((2,),dtype='int')
fh.Read(dims)
N=dims[0]; M=dims[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

filename=sys.argv[1]

fh=open(filename)
fh=open(filename,'rb')
x=np.fromfile(fh,dtype='int')

print(type(x),x.shape,x.size)
Expand Down
3 changes: 0 additions & 3 deletions content/notes/bioinfo-intro/_index.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
---
title: Introduction to Bioinformatics Tools for HPC
date: 2025-08-23T03:19:53Z
authors: [mab]
categories: ["Bioinformatics"]
tags: ["Bioinformatics"]
type: docs
weight: 30

Expand Down
18 changes: 18 additions & 0 deletions content/notes/bioinfo-reproducibility/_index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
title: Reproducibility in Bioinformatics
date: 2026-03-25T19:08:46Z
type: docs
weight: 100
menu:
bioinfo-reproducibility:
---

## Tutorial Outline

* Difficulties in achieving reproducibility

* Potential problems with bioinformatics pipelines

* Some helpful tools

* Snakemake and Nextflow Examples
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
title: Version Control
date: 2026-03-25T19:08:46Z
type: docs
weight: 550
menu:
bioinfo-reproducibility:
---


{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_11.png >}}

GitHub: https://github.com

Track and manage changes to your code & files

Store and label changes at every step

Small or large projects

Collaborate on projects and minimize conflicting edits

Works on multiple platforms (MacOS, Windows, Linux)

Website for github, cutadapt repository

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
title: Environment Management
date: 2026-03-25T19:08:46Z
type: docs
weight: 600
menu:
bioinfo-reproducibility:
---

## Conda/Mamba environments

* Isolated spaces for each project with specific tool versions
* Manage Python versions and dependencies
* Install packages and software directly into environment
* Stable and reproducible place to run code and applications
* Not limited to Python, can run bash, Rscript
* YAML configuration file to create or export and transfer an environment

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
title: Storing Results
date: 2026-03-25T19:08:46Z
type: docs
weight: 650
menu:
bioinfo-reproducibility:
---

* Public repositories for sequence data - required for most journals
* NCBI: https://www.ncbi.nlm.nih.gov
* Ensembl: https://www.ensembl.org/index.html
* Always document and archive changes, especially if unpublished:
    * genome assembly versions
    * sequence data: SNPs, isoforms

Websites: NCBI, Ensembl, Santa Cruz

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
title: Containers
date: 2026-03-25T19:08:46Z
type: docs
weight: 700
menu:
bioinfo-reproducibility:
---

Containers are portable environments that run across different computing environments

They contain packages, software and dependencies that remain isolated from host infrastructure

A container is a standalone unit of software that can produce the same results on a different machine or server

Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
title: Bioinformatic Pipelines
date: 2026-03-25T19:08:46Z
type: docs
weight: 750
menu:
bioinfo-reproducibility:
---

## Typical bioinformatics workflows involve many steps:

* FASTQ → QC → Alignment → Sorting → Variant Calling → Annotation
    * FASTQ files need a quality check and trimming
* Cutadapt
* BWA
* Samtools
* Freebayes
* VCFtools
* Create a pipeline to string the software together for the “final” output

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
title: Bioinformatic Pipeline Challenges
date: 2026-03-25T19:08:46Z
type: docs
weight: 800
menu:
bioinfo-reproducibility:
---

Complex dependencies between steps

Formatting inconsistencies

Hard to reproduce results - scalability, parameters, version changes

Difficult to parallelize efficiently

Manual scripts often fail on HPC

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
title: Bioinformatic Pipelines on HPC
date: 2026-03-25T19:08:46Z
type: docs
weight: 850
menu:
bioinfo-reproducibility:
---

Which modules were loaded?

Where are scripts being run?

Tracking paths - hard-coded in scripts?

Out/error files - software vs Slurm conflicts

<span style="color:#002060">__Goal:__</span> Automate and track these workflows

Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
title: Snakemake
date: 2026-03-25T19:08:46Z
type: docs
weight: 900
menu:
bioinfo-reproducibility:
---

{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_19.png >}}

https://snakemake.github.io/

__Snakemake__ is a workflow management system designed for scientific pipelines

Created by Johannes Köster, first released in 2012

Based on UNIX make, which was originally created in 1976 but is still in standard use

Python-based - “_snake-make_”

Free and open source, available on Mac, Windows, Unix

https://snakemake.readthedocs.io/en/stable/

https://github.com/snakemake


`Make` is a command-line interface software tool that performs actions ordered by configured dependencies as defined in a configuration file called a makefile. It is commonly used for build automation to build executable code from source code. 

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
title: Snakemake Format
date: 2026-03-25T19:08:46Z
type: docs
weight: 950
menu:
bioinfo-reproducibility:
---

Similar to writing shell scripts, but a Snakefile contains sets of rules

Format is based on Python structure

Snakemake reads from a Snakefile that defines the rules

Snakefile rules have a target output

Snakemake uses pattern matching to follow the inputs, outputs and commands contained in the rules to reach the final target output

Loading
Loading