diff --git a/content/courses/parallel-computing-introduction/codes/mpi_io.f90 b/content/courses/parallel-computing-introduction/codes/mpi_io.f90
index a28884b5..507bd442 100644
--- a/content/courses/parallel-computing-introduction/codes/mpi_io.f90
+++ b/content/courses/parallel-computing-introduction/codes/mpi_io.f90
@@ -5,20 +5,23 @@ program mpiwrite
integer :: N, M
integer :: i,j
character(len=80) :: arg
- integer, allocatable, dimension(:,:) :: loc_u
+ integer, allocatable, dimension(:,:) :: u, gu
+ integer :: numargs
integer :: rank, nprocs, nrows, ncols
- integer :: lrow, lcol, nrl, ncl
+ integer :: lrow, lcol
+ integer :: nrl, ncl, nr_total, nc_total, nghosts
integer, parameter :: root=0
type(MPI_Status) :: mpi_stat
- type(MPI_Datatype) :: locarr
+ type(MPI_Datatype) :: locarr, fullarr
type(MPI_File) :: fh
integer :: amode
integer :: mpi_err, gmpi_err
integer(kind=MPI_OFFSET_KIND) :: disp=0
character(len=24) :: fname
integer :: ndims=2
- integer, dimension(2) :: ldims, gdims, start_arr
+ integer, dimension(2) :: starts,sizes,subsizes
+ integer, dimension(2) :: gstarts,gsizes,gsubsizes
character(len=36) :: myfile
@@ -61,66 +64,98 @@ program mpiwrite
!Grid coordinates
lrow=rank/ncols
lcol=mod(rank,ncols)
-
!Hardcode each local array to be relatively small so we can see
!what we're doing
nrl=4
ncl=4
- !Global array size
N=nrl*nrows
M=ncl*ncols
+ nghosts=2
+ nr_total=nrl+2*nghosts
+ nc_total=ncl+2*nghosts
+
! Set up values
- allocate(loc_u(nrl,ncl))
- do i=1,nrl
- do j=1,ncl
- loc_u(i,j)=(rank+1)*(i+j)
+ allocate(u(0:nr_total-1,0:nc_total-1))
+
+ u=-9
+ do i=nghosts,nrl+nghosts-1
+ do j=nghosts,ncl+nghosts-1
+ u(i,j)=rank
enddo
enddo
- gdims=[N,M]
- ldims=[nrl,ncl]
- start_arr=[ncl*lrow,nrl*lcol]
- print *, rank, lrow, lcol, start_arr
+ !array sizes
+ gsizes=[N,M]
+ sizes=[nr_total,nc_total]
write(myfile,'(a,i2.2)') trim(fname),rank
open(10,file=myfile)
write(10,*) rank
do i=1,nrl
- write(10,*) loc_u(i,:)
+ write(10,*) u(i,:)
enddo
- !Define a subarray for each local array within the global array
- call MPI_TYPE_CREATE_SUBARRAY(ndims, gdims, ldims, start_arr, &
+ starts=[nghosts,nghosts]
+ subsizes=[nrl,ncl]
+ !Define a subarray for each local array
+ !Size includes ghost zones, starts picks out locations
+ call MPI_TYPE_CREATE_SUBARRAY(ndims, sizes, subsizes, starts, &
MPI_ORDER_FORTRAN, MPI_INTEGER, locarr)
call MPI_TYPE_COMMIT(locarr)
+ !Create the subarray for the global file view
+ !Excludes ghost zones
+ !Remember that subarry starts assume 0 lower bound like C
+ gsizes=[N,M]
+ gstarts=[lrow*nrl,lcol*ncl]
+ gsubsizes=[nrl,ncl]
+ call MPI_TYPE_CREATE_SUBARRAY(ndims, gsizes, gsubsizes, gstarts, &
+ MPI_ORDER_FORTRAN, MPI_INTEGER, fullarr)
+ call MPI_TYPE_COMMIT(fullarr)
+
amode=ior(MPI_MODE_CREATE, MPI_MODE_WRONLY)
call MPI_FILE_OPEN(MPI_COMM_WORLD,trim(fname),amode,MPI_INFO_NULL,fh,mpi_err)
- call MPI_Allreduce(mpi_err, gmpi_err,1,MPI_INTEGER, MPI_LOR, MPI_COMM_WORLD)
+ call MPI_Allreduce(mpi_err,gmpi_err,1,MPI_INTEGER, MPI_BOR, MPI_COMM_WORLD)
if ( gmpi_err /= MPI_SUCCESS ) then
stop "Unable to open MPI file, terminating"
endif
!Need a header for the sizes, only root should write this
- if ( rank==0 ) then
- call MPI_File_write(fh, [N, M], 2, MPI_INTEGER, MPI_STATUS_IGNORE)
- endif
-
- !Everybody write its section
- disp=2*sizeof(N)
+ !if ( rank==0 ) then
+ ! call MPI_File_write(fh, [N, M], 2, MPI_INTEGER, MPI_STATUS_IGNORE)
+ !endif
- call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER,locarr,"native", MPI_INFO_NULL)
- call MPI_FILE_WRITE_ALL(fh, loc_u, size(loc_u), MPI_INTEGER, mpi_stat)
+ call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER,fullarr,"native", MPI_INFO_NULL)
+ call MPI_FILE_WRITE_ALL(fh, u, 1, locarr, mpi_stat)
call MPI_FILE_CLOSE(fh)
+ !Read it back in
+ if (rank==root) then
+ allocate(gu(0:N-1,0:M-1))
+ print *, 'allocated gu', size(gu)
+ amode=MPI_MODE_RDONLY
+ print *, 'Opening file'
+ call MPI_FILE_OPEN(MPI_COMM_WORLD,trim(fname),amode,MPI_INFO_NULL,fh,mpi_err)
+ print *, 'Opened file ',trim(fname)
+ if ( mpi_err /= MPI_SUCCESS) then
+ stop "Unable to open MPI file for reading"
+ endif
+ print *, "Starting to read"
+
+ call MPI_FILE_READ(fh, gu, size(gu), MPI_INTEGER, mpi_stat)
+
+ endif
+
call MPI_Type_free(locarr)
+ call MPI_Type_free(fullarr)
+
call MPI_Finalize()
end program
diff --git a/content/courses/parallel-computing-introduction/codes/read_mpi.py b/content/courses/parallel-computing-introduction/codes/read_mpi.py
index e136176a..04566894 100644
--- a/content/courses/parallel-computing-introduction/codes/read_mpi.py
+++ b/content/courses/parallel-computing-introduction/codes/read_mpi.py
@@ -7,7 +7,6 @@
comm=MPI.COMM_WORLD
fh=MPI.File.Open(comm,filename,MPI.MODE_RDONLY)
-
dims=np.empty((2,),dtype='int')
fh.Read(dims)
N=dims[0]; M=dims[1]
diff --git a/content/courses/parallel-computing-introduction/codes/readio.py b/content/courses/parallel-computing-introduction/codes/readio.py
index f27236dc..95a19372 100644
--- a/content/courses/parallel-computing-introduction/codes/readio.py
+++ b/content/courses/parallel-computing-introduction/codes/readio.py
@@ -3,7 +3,7 @@
filename=sys.argv[1]
-fh=open(filename)
+fh=open(filename,'rb')
x=np.fromfile(fh,dtype='int')
print(type(x),x.shape,x.size)
diff --git a/content/notes/bioinfo-intro/_index.md b/content/notes/bioinfo-intro/_index.md
index 2d6c4c29..ace65ee7 100644
--- a/content/notes/bioinfo-intro/_index.md
+++ b/content/notes/bioinfo-intro/_index.md
@@ -1,9 +1,6 @@
---
title: Introduction to Bioinformatics Tools for HPC
date: 2025-08-23T03:19:53Z
-authors: [mab]
-categories: ["Bioinformatics"]
-tags: ["Bioinformatics"]
type: docs
weight: 30
diff --git a/content/notes/bioinfo-reproducibility/_index.md b/content/notes/bioinfo-reproducibility/_index.md
new file mode 100644
index 00000000..fdf60375
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/_index.md
@@ -0,0 +1,18 @@
+---
+title: Reproducibility in Bioinformatics
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 100
+menu:
+ bioinfo-reproducibility:
+---
+
+## Tutorial Outline
+
+ * Difficulties in achieving reproducibility
+
+ * Potential problems with bioinformatics pipelines
+
+ * Some helpful tools
+
+ * Snakemake and Nextflow Examples
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_10.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_10.md
new file mode 100644
index 00000000..1a248e94
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_10.md
@@ -0,0 +1,26 @@
+---
+title: Version Control
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 550
+menu:
+ bioinfo-reproducibility:
+---
+
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_11.png >}}
+
+GitHub: https://github.com
+
+Track and manage changes to your code & files
+
+Store and label changes at every step
+
+Small or large projects
+
+Collaborate on projects and minimize conflicting edits
+
+Works on multiple platforms (MacOS, Windows, Linux)
+
+Website for github, cutadapt repository
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_11.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_11.md
new file mode 100644
index 00000000..f1c03f7c
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_11.md
@@ -0,0 +1,18 @@
+---
+title: Environment Management
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 600
+menu:
+ bioinfo-reproducibility:
+---
+
+## Conda/Mamba environments
+
+ * Isolated spaces for each project with specific tool versions
+ * Manage Python versions and dependencies
+ * Install packages and software directly into environment
+ * Stable and reproducible place to run code and applications
+ * Not limited to Python, can run bash, Rscript
+ * YAML configuration file to create or export and transfer an environment
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_12.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_12.md
new file mode 100644
index 00000000..854d31e5
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_12.md
@@ -0,0 +1,18 @@
+---
+title: Storing Results
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 650
+menu:
+ bioinfo-reproducibility:
+---
+
+* Public repositories for sequence data - required for most journals
+ * NCBI: https://www.ncbi.nlm.nih.gov
+ * Ensembl: https://www.ensembl.org/index.html
+ * Always document and archive changes, especially if unpublished:
+ * - genome assembly versions
+ * - sequence data: SNPs, isoforms
+
+Websites: NCBI, Ensembl, Santa Cruz
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_13.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_13.md
new file mode 100644
index 00000000..7ffda24d
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_13.md
@@ -0,0 +1,15 @@
+---
+title: Containers
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 700
+menu:
+ bioinfo-reproducibility:
+---
+
+Containers are portable environments that run across different computing environments
+
+They contain packages, software and dependencies that remain isolated from host infrastructure
+
+Standalone unit of software and can produce same results on different machine or server
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_14.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_14.md
new file mode 100644
index 00000000..68b0dbb0
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_14.md
@@ -0,0 +1,20 @@
+---
+title: Bioinformatic Pipelines
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 750
+menu:
+ bioinfo-reproducibility:
+---
+
+## Typical bioinformatics workflows involve many steps:
+
+* FASTQ → QC → Alignment → Sorting → Variant Calling → Annotation
+ * - FASTQ files need quality check and trimming
+ * Cutadapt
+ * BWA
+ * Samtools
+ * Freebayes
+ * VCFtools
+* Create pipeline to string software together for “final” output
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_15.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_15.md
new file mode 100644
index 00000000..e8e63356
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_15.md
@@ -0,0 +1,19 @@
+---
+title: Bioinformatic Pipeline Challenges
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 800
+menu:
+ bioinfo-reproducibility:
+---
+
+Complex dependencies between steps
+
+Formatting inconsistencies
+
+Hard to reproduce results - scalability, parameters, version changes
+
+Difficult to parallelize efficiently
+
+Manual scripts often fail on HPC
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_16.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_16.md
new file mode 100644
index 00000000..e33b3f23
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_16.md
@@ -0,0 +1,19 @@
+---
+title: Bioinformatic Pipelines on HPC
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 850
+menu:
+ bioinfo-reproducibility:
+---
+
+Which modules were loaded?
+
+Where are scripts being run?
+
+Tracking paths - hard-coded in scripts?
+
+Out/error files - software vs slurm conflicts
+
+ __Goal:__ Automate and track these workflows
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_17.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_17.md
new file mode 100644
index 00000000..e87cee4c
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_17.md
@@ -0,0 +1,30 @@
+---
+title: Snakemake
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 900
+menu:
+ bioinfo-reproducibility:
+---
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_19.png >}}
+
+https://snakemake.github.io/
+
+__Snakemake__ is a workflow management system designed for scientific pipelines
+
+Created by Johannes Köster, first released in 2012
+
+Based on UNIX make - originally created in 1976 but still in standard use
+
+Python based - “ _snake-make_ ”
+
+Free and open source, available on Mac, Windows, Unix
+
+https://snakemake.readthedocs.io/en/stable/
+
+https://github.com/snakemake
+
+
+`Make` is a command-line interface software tool that performs actions ordered by configured dependencies as defined in a configuration file called a makefile. It is commonly used for build automation to build executable code from source code.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_18.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_18.md
new file mode 100644
index 00000000..bf72df3d
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_18.md
@@ -0,0 +1,19 @@
+---
+title: Snakemake Format
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 950
+menu:
+ bioinfo-reproducibility:
+---
+
+Similar to writing shell scripts but snake files contains sets of rules
+
+Format is based on Python structure
+
+Snakemake reads from snakefile that defines the rules
+
+Snakefile rules have a target output
+
+Snakemake uses pattern matching to follow the inputs, outputs and commands contained in rules to reach final target output
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_19.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_19.md
new file mode 100644
index 00000000..c4d1620a
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_19.md
@@ -0,0 +1,29 @@
+---
+title: Snakemake Core Idea
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1000
+menu:
+ bioinfo-reproducibility:
+---
+
+Instead of defining _steps_ , you define __rules that produce files__ .
+
+rule align:
+
+input:
+
+"reads.fastq"
+
+output:
+
+"aligned.bam"
+
+shell:
+
+"bwa mem ref.fa {input} > {output}"
+
+Snakemake builds a __directed acyclic graph (DAG)__ automatically.
+
+Fastq → Cutadapt → BWA → Sorted BAM → Freebayes → VCF
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_20.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_20.md
new file mode 100644
index 00000000..dd7e2b60
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_20.md
@@ -0,0 +1,38 @@
+---
+title: Recommended Pipeline Directory Structure
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1050
+menu:
+ bioinfo-reproducibility:
+---
+
+
+Benefits:
+
+separates __workflow logic from data__
+
+easier debugging
+
+easier collaboration
+
+Common practice:
+
+config/ → parameters and sample tables
+
+envs/ → reproducible environments
+
+rules/ → modular workflow steps
+
+results/ → generated outputs
+
+Example:
+
+bioinformatics_pipeline/├── Snakefile├── config/│ └── config.yml├── envs/│ └── bwa.yml├── rules/│ ├── alignment.smk│ ├── qc.smk│ └── variant_calling.smk├── scripts/│ └── custom_processing.py├── data/│ └── raw/├── results/│ ├── bam/│ ├── qc/│ └── variants/└── logs/
+
+A clean directory structure makes pipelines easier to maintain and reproduce.
+
+---
+
+.yml file can indicate how to make conda environment and what packages and dependencies you need
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_21.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_21.md
new file mode 100644
index 00000000..b9f7e187
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_21.md
@@ -0,0 +1,20 @@
+---
+title: Snakefile Breakdown
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1100
+menu:
+ bioinfo-reproducibility:
+---
+
+
+Fastq files that need trimming - input: sample.fastq
+
+Cutadapt - output: sample-trimmed.fastq
+
+BWA - align trimmed fastq to assembly output: sample-aligned.sam
+
+Samtools sorting, indexing - output: sample-sorted.bam
+
+Freebayes variant calling - output: sample-variants.vcf
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_22.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_22.md
new file mode 100644
index 00000000..e3ff8bcf
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_22.md
@@ -0,0 +1,44 @@
+---
+title: Example Snakefile
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1150
+menu:
+ bioinfo-reproducibility:
+---
+
+```no-highlight
+
+__rule__ all: input: "variants/sample1.vcf"
+
+__rule__ trim:
+
+input:
+
+"reads/sample1.fastq"
+
+output:
+
+"trimmed_reads/sample1-trimmed.fastq"
+
+shell:
+
+cutadapt -A TCCGGGTS -o {output} {input}
+
+__rule__ align: input: "trimmed_reads/sample1-trimmed.fastq" output: "bam/sample1.bam" threads: 1 shell: "bwa mem -t {threads} ref.fa {input} | samtools view -Sb - > {output}"
+
+__rule__ call_variants: input: "bam/sample1.bam" output: "variants/sample1.vcf" shell: "freebayes -f ref.fa {input} > {output}"
+
+Snakemake takes first rule as the target
+
+then constructs graph of dependencies
+
+{wildcards} serve as placeholders within rules to operate
+
+on multiple files via pattern matching
+
+```
+
+Snakemake builds the entire pipeline graph automatically.
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_23.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_23.md
new file mode 100644
index 00000000..a40df77f
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_23.md
@@ -0,0 +1,30 @@
+---
+title: Snakemake Exercises on HPC
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1200
+menu:
+ bioinfo-reproducibility:
+---
+
+Class data:
+
+/project/hpc_training/reproducibility/snakemake
+
+$ cp -r /project/hpc_training/reproducibility/snakemake .
+
+ - GCF_000005845.2_ASM584v2_genomic.fna - genome assembly
+
+ - SRR2584863_1.fastq - fastq sequence file, paired-1
+
+ - SRR2584863_2.fastq - fastq sequence file, paired-2
+
+ - *. smk - snakemake files
+
+ - config_variant.yml - configuration file
+
+ - submit_snakemake.sh - sample slurm file
+
+
+YAML - originally "Yet Another Markup Language", now a backronym for "YAML Ain't Markup Language"
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_24.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_24.md
new file mode 100644
index 00000000..4da76d02
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_24.md
@@ -0,0 +1,20 @@
+---
+title: Running jobs on interactive node
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1250
+menu:
+ bioinfo-reproducibility:
+---
+
+Run interactively - good for testing
+
+$ ijob -c 1 -A hpc_training -p interactive -v -t 2:00:00
+
+$ cp -r /project/hpc_training/reproducibility/snakemake .
+
+
+Default execution here is local so everything is running in my ijob session on a compute node. If we wanted to have these processes run non-interactively we would want to make sure we are using the executor flag in our snakemake call: "--executor slurm"
+
+Work in scratch
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_25.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_25.md
new file mode 100644
index 00000000..49a39619
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_25.md
@@ -0,0 +1,23 @@
+---
+title: Modules
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1300
+menu:
+ bioinfo-reproducibility:
+---
+
+$ module spider
+
+- specifics and version of package available
+
+```bash
+
+$ module spider snakemake
+
+$ module load snakemake/9.8.1
+
+$ module list
+
+$ snakemake --help
+```
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_26.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_26.md
new file mode 100644
index 00000000..e25fee8b
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_26.md
@@ -0,0 +1,22 @@
+---
+title: Other Modules Needed
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1350
+menu:
+ bioinfo-reproducibility:
+---
+
+```bash
+
+$ module load bwa/0.7.17
+
+$ module load cutadapt/4.9
+
+$ module load snakemake/9.8.1
+
+$ module load freebayes/1.3.10
+
+$ module load samtools/1.21
+
+```
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_27.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_27.md
new file mode 100644
index 00000000..c3756c85
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_27.md
@@ -0,0 +1,26 @@
+---
+title: Running snakemake - genome alignment
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1400
+menu:
+ bioinfo-reproducibility:
+---
+
+Snakefile - file.smk, contains rules for snakemake
+
+```bash
+
+$ snakemake -c 1 -s align.smk
+
+--dry-run -np good to test first without producing output
+
+-n only show steps, don't run, -p print shell commands
+
+-c number of cores
+
+-s needed if using a named snakefile (if just called "snakefile", don't need the -s flag)
+
+$ snakemake --dag | dot -Tpng > dag_align.png
+```
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_28.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_28.md
new file mode 100644
index 00000000..bb611ddc
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_28.md
@@ -0,0 +1,22 @@
+---
+title: Running Snakemake - Variant Detection
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1450
+menu:
+ bioinfo-reproducibility:
+---
+
+Snakefile - file.smk, contains rules for snakemake
+
+```bash
+$ snakemake -c 1 -s variant-call.smk
+
+--dry-run
+
+-c number of cores
+
+-s needed if using a named snakefile (if just called "snakefile", don't need)
+
+$ snakemake --dag -s variant-call.smk | dot -Tpng > dag_variant.png
+```
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_29.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_29.md
new file mode 100644
index 00000000..048ac01f
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_29.md
@@ -0,0 +1,22 @@
+---
+title: Snakemake Examples on HPC
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1500
+menu:
+ bioinfo-reproducibility:
+---
+
+Not recommended to hard-code files within snake file
+
+Can organize sample names, file paths, and software parameters in a YAML configuration file
+
+YAML - serialization language that transforms data into a format that can be shared between systems
+
+With snakemake, configuration file is a reference for the workflow
+
+
+YAML - originally "Yet Another Markup Language", now a backronym for "YAML Ain't Markup Language"
+Easy to keep things organized within a single file
+While showing, good to have separate config files rather than one huge one and commenting sections out
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_30.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_30.md
new file mode 100644
index 00000000..13caad65
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_30.md
@@ -0,0 +1,22 @@
+---
+title: Running Snakemake with Config File
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1550
+menu:
+ bioinfo-reproducibility:
+---
+
+
+Snakefile - file.smk, contains rules for snakemake
+
+```bash
+$ snakemake -c 1 -s variant-yml.smk --configfile config_variant.yml
+```
+
+--configfile - directing snakemake to a config file
+
+-c number of cores
+
+-s needed if using a named snakefile
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_31.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_31.md
new file mode 100644
index 00000000..13e29fea
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_31.md
@@ -0,0 +1,19 @@
+---
+title: Reproducible Environments
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1600
+menu:
+ bioinfo-reproducibility:
+---
+
+## Snakemake supports reproducible environments.
+
+Example with Conda:
+
+rule fastqc: input: "reads.fastq" output: "qc.html" conda: "~/.conda/envs/fastqc_env" #path to conda environment shell: "fastqc {input}"
+
+Benefits: Easy dependency management, portable workflows
+
+.yml file can indicate how to make conda environment and what packages and dependencies you need
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_32.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_32.md
new file mode 100644
index 00000000..f7eb989f
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_32.md
@@ -0,0 +1,30 @@
+---
+title: Using Environments
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1650
+menu:
+ bioinfo-reproducibility:
+---
+
+
+├── Snakefile├── config/│ └── config.yml├── envs/│ └── bwa.yml├── rules/│ ├── alignment.smk│ ├── qc.smk│ └── variant_calling.smk├── scripts/│ └── custom_processing.py├── data/│ └── raw/├── results/│ ├── bam/│ ├── qc/│ └── variants/└── logs/
+
+Can also create an environment.yml file, list conda envs and what to install
+
+__name__ : bwa.yml
+
+__channels:__
+
+- conda-forge
+
+- bioconda
+
+__dependencies__ :
+
+- bwa=0.7.17
+
+---
+
+.yml file can indicate how to make conda environment and what packages and dependencies you need
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_33.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_33.md
new file mode 100644
index 00000000..521b67f0
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_33.md
@@ -0,0 +1,29 @@
+---
+title: Snakemake with Conda Environment
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1700
+menu:
+ bioinfo-reproducibility:
+---
+
+```bash
+$ module load miniforge
+
+$ conda create
+
+$ conda activate
+
+$ snakemake command
+
+$ screen/tmux
+```
+
+- keeps session running when disconnected
+
+- make sure to connect to same login node,
+
+- confirm login node with: hostname
+
+Can create different conda environment for different rules
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_34.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_34.md
new file mode 100644
index 00000000..186f4b90
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_34.md
@@ -0,0 +1,21 @@
+---
+title: Snakemake and Containers
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1750
+menu:
+ bioinfo-reproducibility:
+---
+
+## Snakemake also supports containers:
+
+rule align: container: "docker://biocontainers/bwa"
+
+Advantages:
+
+identical software environments
+
+portable across HPC systems
+
+easier collaboration
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_35.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_35.md
new file mode 100644
index 00000000..3ed82850
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_35.md
@@ -0,0 +1,13 @@
+---
+title: Best Practices for HPC
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1800
+menu:
+ bioinfo-reproducibility:
+---
+
+## Recommendations:
+
+Use threads and resources properly. Avoid huge single jobs. Break workflows into modular rules. Use conda or containers. Use --dry-run before submitting large workflows. Store configuration in YAML files.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_36.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_36.md
new file mode 100644
index 00000000..e0ff90a3
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_36.md
@@ -0,0 +1,19 @@
+---
+title: Common HPC Pitfalls with Workflow Managers
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1850
+menu:
+ bioinfo-reproducibility:
+---
+
+**Examples:**
+
+requesting too many cores per rule
+
+forgetting to specify memory
+
+submitting thousands of tiny jobs
+
+running Snakemake or Nextflow themselves on a login node
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_37.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_37.md
new file mode 100644
index 00000000..ff2e1ca2
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_37.md
@@ -0,0 +1,21 @@
+---
+title: Key Takeaways with Workflow Managers
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1900
+menu:
+ bioinfo-reproducibility:
+---
+
+Snakemake & Nextflow provide:
+
+reproducible pipelines
+
+automatic dependency tracking
+
+scalable HPC execution
+
+environment management
+
+workflow portability
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_38.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_38.md
new file mode 100644
index 00000000..f430414f
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_38.md
@@ -0,0 +1,21 @@
+---
+title: Nextflow
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 1950
+menu:
+ bioinfo-reproducibility:
+---
+
+Snakemake & Nextflow provide:
+
+* reproducible pipelines
+
+* automatic dependency tracking
+
+* scalable HPC execution
+
+* environment management
+
+* workflow portability
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_39.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_39.md
new file mode 100644
index 00000000..f4ea6322
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_39.md
@@ -0,0 +1,13 @@
+---
+title: What is Nextflow?
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2000
+menu:
+ bioinfo-reproducibility:
+---
+
+Nextflow is a workflow management system that helps automate and organize multi-step computational pipelines.
+
+At a high level, it connects software steps together, manages how data moves between them, and handles execution across local machines, HPC schedulers like SLURM, or cloud platforms.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_4.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_4.md
new file mode 100644
index 00000000..f236cf8b
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_4.md
@@ -0,0 +1,16 @@
+---
+title: Reproducibility in Science
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 250
+menu:
+ bioinfo-reproducibility:
+---
+
+
+* Reproducibility - redo a scientific experiment & generate similar results
+ * Same sample, software, data, code - same result?
+* Replication - different data, same methods - conclusions consistent?
+* Reusability - Will someone be able to use your pipeline in the future?
+ * - Will you be able to use it?
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_40.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_40.md
new file mode 100644
index 00000000..18347a50
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_40.md
@@ -0,0 +1,21 @@
+---
+title: Nextflow Pipelines
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2050
+menu:
+ bioinfo-reproducibility:
+---
+
+* Key concepts:
+ * Processes, workflows, and parameters
+* In general, we are going to:
+ * Create processes to execute desired commands
+ * Specify parameters to represent workflow settings
+ * Define a workflow to execute processes in a specific order
+* Key files:
+ * main.nf and nextflow.config
+
+
+Parameters are user-adjustable values that control how a workflow runs. They can specify input files, output locations, software options, reference files, or general pipeline behavior.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_41.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_41.md
new file mode 100644
index 00000000..392fd364
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_41.md
@@ -0,0 +1,36 @@
+---
+title: A Toy Example
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2100
+menu:
+ bioinfo-reproducibility:
+---
+
+**Example**
+
+Let's start with a very simple toy example for echoing the text "Hello World!", and then we'll build up to our bioinformatics example.
+
+First, create a process called HELLO with our shell command:
+
+process HELLO {
+
+script:
+
+"""
+
+echo "Hello World!"
+
+"""
+
+}
+
+Then we execute this process in our workflow:
+
+workflow {
+
+HELLO()
+
+}
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_42.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_42.md
new file mode 100644
index 00000000..c03afef2
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_42.md
@@ -0,0 +1,32 @@
+---
+title: Create a New File called main.nf
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2150
+menu:
+ bioinfo-reproducibility:
+---
+
+We can create a new file called main.nf with these lines.
+
+process HELLO {
+
+script:
+
+"""
+
+echo "Hello World!"
+
+"""
+
+}
+
+workflow {
+
+HELLO()
+
+}
+
+Show and execute main.nf in terminal. Show where the file goes. Went to .command.out file in 'work' directory
+ for the specific process
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_43.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_43.md
new file mode 100644
index 00000000..f378ed48
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_43.md
@@ -0,0 +1,17 @@
+---
+title: Make Some Changes
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2200
+menu:
+ bioinfo-reproducibility:
+---
+
+__process__ hello { __output__ : path 'hello.txt' script: """ echo 'Hello world!' > hello.txt """}
+
+We want to send the text to a file called 'hello.txt'. Now we can update our shell command to send the text to a file, and we can add an output block to our process to define our file name. Since our output is a file, we'll specify the type of output as a path.
+
+Run `main.nf` in terminal and show it still went to 'work' directory
+
+This was better, but we still have to dig around for the file, so let's add one more thing to our process.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_44.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_44.md
new file mode 100644
index 00000000..78634f52
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_44.md
@@ -0,0 +1,16 @@
+---
+title: Add a publishDir
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2250
+menu:
+ bioinfo-reproducibility:
+---
+
+Now let's try sending our output to a directory called 'results' - we can add a publishDir to our process and specify the mode. "copy" is safest, but you can do other things like move the file or even create links to it.
+Re-run the main.nf in the terminal and show where the file goes to results but since we did copy, it still does go to work. Point out that we need to be mindful of any extra data we're creating so we don't unnecessarily have duplicates for everything.
+
+__process__ hello { publishDir "results/" , mode: "copy"
+
+__ output__ : path 'hello.txt' script: """ echo 'Hello world!' > hello.txt """}
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_45.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_45.md
new file mode 100644
index 00000000..804d869b
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_45.md
@@ -0,0 +1,30 @@
+---
+title: Look at a Trim Rule
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2300
+menu:
+ bioinfo-reproducibility:
+---
+
+Let's look at our snakemake "trim" rule from earlier:
+
+__rule__ trim:
+
+input:
+
+"reads/sample1.fastq"
+
+output:
+
+"trimmed_reads/sample1-trimmed.fastq"
+
+shell:
+
+cutadapt -A TCCGGGTS -o {output} {input}
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_41.png >}}
+
+
+Here we specified our inputs/outputs and our shell command.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_46.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_46.md
new file mode 100644
index 00000000..31eb5d2f
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_46.md
@@ -0,0 +1,16 @@
+---
+title: What to Update in Nextflow?
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2350
+menu:
+ bioinfo-reproducibility:
+---
+
+So, looking at our HELLO process, what do we need to add? We already have a publishDir, an output, and script, so let's update those for cutadapt.
+
+__process__ HELLO { publishDir "results/" , mode: "copy"
+
+__ __ output: path 'hello.txt' script: """ echo 'Hello world!' > hello.txt """}
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_47.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_47.md
new file mode 100644
index 00000000..91e3be8c
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_47.md
@@ -0,0 +1,24 @@
+---
+title: Update for Running Cutadapt
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2400
+menu:
+ bioinfo-reproducibility:
+---
+
+
+__process__ CUTADAPT { publishDir "results/" , mode: "copy"
+
+output: path 'trimmed.fastq' script: """ cutadapt -a AACCGGTT -o trimmed.fastq ~/sample1.fastq """}
+
+workflow {
+
+CUTADAPT()
+
+}
+
+
+We can keep 'results' as our publishDir for this example, but we'll need to change our output to trimmed.fastq and we'll change the command for cutadapt with our adapter and our input and output file names. Because Nextflow executes each task in its own work directory, we need to provide the full path. Our workflow just becomes running the CUTADAPT process.
+Does this work? Yes, it does. However, we are hard-coding everything, and this is not really flexible and does not really allow us to scale.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_48.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_48.md
new file mode 100644
index 00000000..57c20740
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_48.md
@@ -0,0 +1,25 @@
+---
+title: A More Common Approach
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2450
+menu:
+ bioinfo-reproducibility:
+---
+
+A better approach is to pass the file into the process with Channel.fromPath() and use "input: path reads". The "input:" declares an input variable, not a literal source file location. We then use this variable in our shell command, where $reads refers to the local process input variable — the actual input file that was provided to Nextflow for this task via our workflow. We can also use the reads variable for other things, like dynamically naming files.
+
+__process__ CUTADAPT { publishDir "results/" , mode: "copy"
+
+input: path reads_var
+
+output: path 'trimmed.fastq' script: """ cutadapt -a AACCGGTT -o trimmed.fastq $reads_var """}
+
+workflow {
+
+CUTADAPT(Channel.fromPath('~/sample1.fastq', checkIfExists: true))
+
+}
+
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_49.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_49.md
new file mode 100644
index 00000000..7bc6d555
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_49.md
@@ -0,0 +1,17 @@
+---
+title: Dynamically Scaling to Many Samples
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2500
+menu:
+ bioinfo-reproducibility:
+---
+
+Now we can start to use the flexibility nextflow provides to name our output files dynamically based on sample name and we also can start to scale up by using the wildcard to grab all the fastq files in our example 'reads' directory. Here nextflow is going to create a new separate process for each of our samples.
+
+
+process CUTADAPT { publishDir "results/", mode: "copy"  input: path reads_var  output: path "${reads_var.simpleName}_trimmed.fastq"  script: """ cutadapt -a AACCGGTT -o ${reads_var.simpleName}_trimmed.fastq $reads_var """ }  workflow { CUTADAPT(Channel.fromPath('*.fastq', checkIfExists: true)) }
+
+
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_5.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_5.md
new file mode 100644
index 00000000..f475bf6e
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_5.md
@@ -0,0 +1,18 @@
+---
+title: The Reproducibility Problem
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 300
+menu:
+ bioinfo-reproducibility:
+---
+
+
+Where did you do the analysis - laptop, server, lab computer, environment
+
+Are you using the most recent version (scripts, datasets, analyses)
+
+We just used the default settings!
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_4.png >}}
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_50.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_50.md
new file mode 100644
index 00000000..3db5e3ae
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_50.md
@@ -0,0 +1,20 @@
+---
+title: Parameter Options for Input Files
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2550
+menu:
+ bioinfo-reproducibility:
+---
+As with many things in Nextflow, we have multiple different ways we can accomplish this. We'll talk about nextflow.config shortly.
+
+Add a parameter for '--reads' in your 'nextflow run' command
+
+Add a params.reads at the top of your main.nf file
+
+Add a params.reads to a nextflow.config file
+
+Works for one file ('reads/sample1.fastq') or many ('reads/*.fastq')
+
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_51.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_51.md
new file mode 100644
index 00000000..71065c63
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_51.md
@@ -0,0 +1,19 @@
+---
+title: Less Hard-coding = More Reproducibility
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2600
+menu:
+ bioinfo-reproducibility:
+---
+
+If we use one of those parameter methods, instead of our workflow having a hard-coded path for our inputs, we can dynamically provide our input file names and clean things up in our workflow even further.
+
+From: workflow { CUTADAPT(Channel.fromPath('~/sample1.fastq', checkIfExists: true))}
+
+To:
+
+workflow { CUTADAPT(Channel.fromPath(params.reads, checkIfExists: true))}
+
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_52.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_52.md
new file mode 100644
index 00000000..b1f1e2d7
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_52.md
@@ -0,0 +1,24 @@
+---
+title: Loading Software
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2650
+menu:
+ bioinfo-reproducibility:
+---
+
+main.nf
+
+Use a 'beforeScript' in the CUTADAPT process in main.nf
+
+beforeScript runs specified shell command(s) before running the script command
+
+Load the cutadapt module: beforeScript 'module load cutadapt'
+
+Can also do other things like export variables or create directories
+
+beforeScript """ module purge module load cutadapt mkdir results export PATH="$PATH:/opt/tools"' """
+
+
+We can definitely load the software in our process, but we just cleaned that thing up, so let's put it somewhere better to keep our main.nf focused on workflow logic. To do this, let's go ahead and start to build a nextflow.config file.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_53.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_53.md
new file mode 100644
index 00000000..3aff08ce
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_53.md
@@ -0,0 +1,18 @@
+---
+title: Loading Software – nextflow.config
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2700
+menu:
+ bioinfo-reproducibility:
+---
+
+
+Again, we use a 'beforeScript' specific to the CUTADAPT process
+
+process { withName: CUTADAPT { beforeScript = ''' module purge module load cutadapt
+
+'''
+
+Now when we specifically run our CUTADAPT process, these commands will run before our script command and set up our process environment. Ok, so now we have our software dialed in for cutadapt. But we need to think about where we are running these processes. By default, nextflow is running shell commands locally, so that means if we're just at the command line, we'd be running the processes on the login nodes, which is a no-no.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_54.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_54.md
new file mode 100644
index 00000000..99678174
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_54.md
@@ -0,0 +1,22 @@
+---
+title: Adding SLURM Options
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2750
+menu:
+ bioinfo-reproducibility:
+---
+
+So, we need to let Nextflow know that we want to use SLURM to execute our processes, and we do this by specifying SLURM as our executor. We can also use this to specify various other options – nextflow doesn't have explicit options for all possible slurm commands, so we can supplement with any additional options we need with 'clusterOptions.' Again, there's multiple ways to configure everything – you can also do global slurm options, but often different parts of the workflow are going to need different resources. And we could potentially specify these slurm options in the CUTADAPT process in our main.nf, but we're trying to keep that tidy and focused on the workflow logic.
+
+
+process { withName: CUTADAPT { beforeScript = ''' module purge module load cutadapt
+
+'''
+
+executor = 'slurm' queue = 'standard' cpus = 2 memory = '16 GB' time = '1h' clusterOptions = '--account=hpc_build'
+
+}
+
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_55.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_55.md
new file mode 100644
index 00000000..8a0d56b2
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_55.md
@@ -0,0 +1,16 @@
+---
+title: What We Have
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2800
+menu:
+ bioinfo-reproducibility:
+---
+
+Workflow logic in main.nf
+
+Software and slurm options in nextflow.config
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_51.png >}}
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_56.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_56.md
new file mode 100644
index 00000000..df17e220
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_56.md
@@ -0,0 +1,21 @@
+---
+title: Extend to CUTADAPT → BWA_ALIGN → FREEBAYES
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2850
+menu:
+ bioinfo-reproducibility:
+---
+
+
+* Same rules apply – largely rinse and repeat for additional processes
+* Create processes for each step: inputs/outputs, commands, etc.
+* Software and slurm options in nextflow.config
+* Main difference is our workflow - more processes and channels
+ * Send channel into process
+ * Process produces output
+ * Output becomes new channel for next process.
+
+
+With Nextflow, channels carry data and processes do work on that data. You link them together by sending a channel into a process, and if that process produces output, its output can become a new channel for the next process.
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_57.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_57.md
new file mode 100644
index 00000000..8b33d092
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_57.md
@@ -0,0 +1,24 @@
+---
+title: Workflow for CUTADAPT → BWA_ALIGN → FREEBAYES
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2900
+menu:
+ bioinfo-reproducibility:
+---
+
+Here's how we could link the trim, align and variant calling together. So now we'll put it all together and run the entire workflow from end to end on the system.
+
+workflow {
+
+reads_ch = Channel.fromPath("${params.reads_dir}/*.fastq", checkIfExists: true)
+
+trimmed_ch = CUTADAPT(reads_ch)
+
+aligned_ch = BWA_ALIGN(trimmed_ch)
+
+FREEBAYES(aligned_ch)
+
+}
+
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_58.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_58.md
new file mode 100644
index 00000000..56b980de
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_58.md
@@ -0,0 +1,14 @@
+---
+title: Additional Links
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 2950
+menu:
+ bioinfo-reproducibility:
+---
+
+[https://nf-co.re/rnaseq/3.23.0/](https://nf-co.re/rnaseq/3.23.0/)
+
+[https://training.nextflow.io](https://training.nextflow.io)
+
+https://github.com/nextflow-io/nextflow
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_6.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_6.md
new file mode 100644
index 00000000..706b9986
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_6.md
@@ -0,0 +1,20 @@
+---
+title: Studies in Reproducibility
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 350
+menu:
+ bioinfo-reproducibility:
+---
+
+
+Nature (2016) - Found that 70% of researchers have failed in reproducing another researcher’s results & >50% failed to reproduce their own
+
+PLoS Biology (2024) - Biomedical researchers - 72% reported “reproducibility crisis”
+
+Genome Biol (2024) - Reproducibility in bioinformatics era
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_5.png >}}
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_6.png >}}
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_7.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_7.md
new file mode 100644
index 00000000..42899115
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_7.md
@@ -0,0 +1,18 @@
+---
+title: Challenges of Bioinformatics
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 400
+menu:
+ bioinfo-reproducibility:
+---
+
+
+* So many tools, often with:
+ * Multiple versions & releases
+ * Complex dependencies & hidden parameters, starting seeds
+ * Running tools locally vs on HPC
+ * Formatting conversions between software
+ * Scalability - how tools handle datasets increasing in size
+ * Keeping codes organized!
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_8.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_8.md
new file mode 100644
index 00000000..1c8f06af
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_8.md
@@ -0,0 +1,19 @@
+---
+title: Aspects of Reproducibility
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 450
+menu:
+ bioinfo-reproducibility:
+---
+
+Version control
+
+Environment management
+
+Data storage
+
+Containers
+
+Tool/software maintenance
+
diff --git a/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_9.md b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_9.md
new file mode 100644
index 00000000..469b1270
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/bioinfo-reproducibility_9.md
@@ -0,0 +1,14 @@
+---
+title: Saving Document Versions
+date: 2026-03-25T19:08:46Z
+type: docs
+weight: 500
+menu:
+ bioinfo-reproducibility:
+---
+
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_9.gif >}}
+
+{{< figure src=/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_10.png >}}
+
diff --git a/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_10.png b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_10.png
new file mode 100644
index 00000000..885acdc4
Binary files /dev/null and b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_10.png differ
diff --git a/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_11.png b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_11.png
new file mode 100644
index 00000000..3900ccf8
Binary files /dev/null and b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_11.png differ
diff --git a/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_4.png b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_4.png
new file mode 100644
index 00000000..3727c414
Binary files /dev/null and b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_4.png differ
diff --git a/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_5.png b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_5.png
new file mode 100644
index 00000000..f795ad1e
Binary files /dev/null and b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_5.png differ
diff --git a/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_6.png b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_6.png
new file mode 100644
index 00000000..937e9a87
Binary files /dev/null and b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_6.png differ
diff --git a/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_9.gif b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_9.gif
new file mode 100644
index 00000000..721323e9
Binary files /dev/null and b/content/notes/bioinfo-reproducibility/img/Triant-Bobar_Reproducibility_9.gif differ
diff --git a/content/notes/bioinfo-reproducibility/src/Triant-Bobar_Reproducibility.pptx b/content/notes/bioinfo-reproducibility/src/Triant-Bobar_Reproducibility.pptx
new file mode 100644
index 00000000..0b1ce505
Binary files /dev/null and b/content/notes/bioinfo-reproducibility/src/Triant-Bobar_Reproducibility.pptx differ
diff --git a/content/notes/bioinfo-reproducibility/src/out.md b/content/notes/bioinfo-reproducibility/src/out.md
new file mode 100644
index 00000000..3426c31e
--- /dev/null
+++ b/content/notes/bioinfo-reproducibility/src/out.md
@@ -0,0 +1,973 @@
+# Reproducibility in Bioinformatics
+
+# Deb Triant & Marcus Bobar
+Research Computing, University of Virginia
+dtriant@virginia.edu, mb5wt@virginia.edu
+
+
+
+---
+
+Introductions
+
+
+# Workshop outline
+
+Difficulties in achieving reproducibility
+
+Potential problems with bioinformatics pipelines
+
+Some helpful tools
+
+Snakemake & Nextflow examples
+
+
+
+# Reproducibility in science
+
+
+
+* Reproducibility \- redo a scientific experiment & generate similar results
+ * Same sample\, software\, data\, code \- same result?
+* Replication \- different data\, same methods \- conclusions consistent?
+* Reusability \- Will someone be able to use your pipeline in the future?
+ * \- Will you be able to use it?
+
+
+
+
+# Reproducibility Problem
+
+Where did you do the analysis \- laptop\, server\, lab computer\, environment
+
+Are you using the most recent version \(scripts\, datasets\, analyses\)
+
+We just used the default settings\!
+
+
+
+
+
+# Studies in reproducibility
+
+Nature \(2016\) \- Found that 70% of researchers have failed in reproducing another researcher’s results & >50% failed to reproduce their own
+
+PLoS Biology \(2024\) \- Biomedical researchers \- 72% reported “reproducibility crisis”
+
+Genome Biol \(2024\) \- Reproducibility in bioinformatics era
+
+
+
+
+
+# Challenges of Bioinformatics
+
+
+
+* So many tools\, often with:
+ * Multiple versions & releases
+ * Complex dependencies & hidden parameters\, starting seeds
+ * Running tools locally vs on HPC
+ * Formatting conversions between software
+ * Scalability \- how tools handle datasets increasing in size
+ * Keeping codes organized\!
+
+
+
+
+# Aspects of reproducibility
+
+Version control
+
+Environment management
+
+Data storage
+
+Containers
+
+Tool/software maintenance
+
+
+
+# Saving document versions
+
+
+
+
+
+# Version Control
+
+
+
+GitHub: https://github\.com
+
+Track and manage changes to your code & files
+
+Store and label changes at every step
+
+Small or large projects
+
+Collaborate on projects and minimize conflicting edits
+
+Works on multiple platforms \(MacOS\, Windows\, Linux\)
+
+
+
+---
+
+Website for github, cutadapt repository
+
+# Environment Management
+
+
+
+* Conda/Mamba environments
+ * Isolated spaces for each project with specific tool versions
+ * Manage Python versions and dependencies
+ * Install packages and software directly into environment
+ * Stable and reproducible place to run code and applications
+ * Not limited to Python\, can run bash\, Rscript
+ * YAML configuration file to create or export and transfer an environment
+
+
+
+
+# Storing results
+
+
+
+* Public repositories for sequence data \- required for most journals
+ * NCBI: https://www\.ncbi\.nlm\.nih\.gov
+ * Ensembl: https://www\.ensembl\.org/index\.html
+ * Always document and archive changes\, especially if unpublished:
+ * \- genome assembly versions
+ * \- sequence data: SNPs\, isoforms
+
+
+
+
+---
+
+Websites: NCBI, Ensembl, Santa Cruz
+
+# Containers
+
+Portable environments that run across different computing environments
+
+Contain packages\, software and dependencies that remain isolated from host infrastructure
+
+Standalone unit of software and can produce same results on different machine or server
+
+ __Ruoshi Sun \- Research Computing Workshop Series__
+
+1\. Using Containers on HPC \- Monday\, March 30\, 2026 \- 9:00AM
+
+2\. Building Containers on HPC \- Monday April 6\, 2026 \- 9:00AM
+
+
+
+# Bioinformatic Pipelines
+
+
+
+* Typical bioinformatics workflows involve many steps:
+* FASTQ → QC → Alignment → Sorting → Variant Calling → Annotation
+ * \- FASTQ files need quality check and trimming
+ * Cutadapt
+ * BWA
+ * Samtools
+ * Freebayes
+ * VCFtools
+* Create pipeline to string software together for “final” output
+
+
+
+
+# Bioinformatic Pipeline challenges
+
+Complex dependencies between steps
+
+Formatting inconsistencies
+
+Hard to reproduce results \- scalability\, parameters\, version changes
+
+Difficult to parallelize efficiently
+
+Manual scripts often fail on HPC
+
+
+
+# Bioinformatic Pipelines on HPC
+
+Which modules were loaded?
+
+Where are scripts being run
+
+Tracking paths \- hard\-coded in scripts?
+
+Out/error files \- software vs slurm conflicts
+
+ __Goal:__ Automate and track these workflows
+
+
+
+
+
+# Snakemake
+
+https://snakemake\.github\.io/
+
+__Snakemake__ is a workflow management system designed for scientific pipelines
+
+Created by Johannes Köster\, first released in 2012
+
+Based on UNIX make \- originally created in 1976 but still standard use
+
+Python based \- “ _snake\-make_ ”
+
+Free and open source\, available on Mac\, Windows\, Unix
+
+https://snakemake\.readthedocs\.io/en/stable/
+
+https://github\.com/snakemake
+
+---
+
+Make is a command-line interface software tool that performs actions ordered by configured dependencies as defined in a configuration file called a makefile. It is commonly used for build automation to build executable code from source code.
+
+# Snakemake format
+
+Similar to writing shell scripts but snake files contains sets of rules
+
+Format is based on Python structure
+
+Snakemake reads from snakefile that defines the rules
+
+Snakefile rules have a target output
+
+Snakemake uses pattern matching to follow the inputs\, outputs and commands contained in rules to reach final target output
+
+
+
+# Snakemake Core Idea
+
+Instead of defining _steps_ \, you define __rules that produce files__ \.
+
+rule align:
+
+input:
+
+"reads\.fastq"
+
+output:
+
+"aligned\.bam"
+
+shell:
+
+"bwa mem ref\.fa \{input\} > \{output\}"
+
+Snakemake builds a __directed acyclic graph \(DAG\)__ automatically\.
+
+Fastq → Cutadapt → BWA → Sorted BAM → Freebayes → VCF
+
+
+
+# Recommended Pipeline Directory Structure
+
+Benefits:
+
+separates __workflow logic from data__
+
+easier debugging
+
+easier collaboration
+
+Common practice:
+
+config/ → parameters and sample tables
+
+envs/ → reproducible environments
+
+rules/ → modular workflow steps
+
+results/ → generated outputs
+
+Example:
+
+bioinformatics\_pipeline/├── Snakefile├── config/│ └── config\.yml├── envs/│ └── bwa\.yml├── rules/│ ├── alignment\.smk│ ├── qc\.smk│ └── variant\_calling\.smk├── scripts/│ └── custom\_processing\.py├── data/│ └── raw/├── results/│ ├── bam/│ ├── qc/│ └── variants/└── logs/
+
+A clean directory structure makes pipelines easier to maintain and reproduce\.
+
+---
+
+.yml file can indicate how to make conda environment and what packages and dependencies you need
+
+# Snakefile breakdown
+
+Fastq files that need trimming \- input: sample\.fastq
+
+Cutadapt \- output: sample\-trimmed\.fastq
+
+BWA \- align trimmed fastq to assembly output: sample\-aligned\.sam
+
+Samtools sorting\, indexing \- output: sample\-sorted\.bam
+
+Freebayes variant calling \- output: sample\-variants\.vcf
+
+# Example snakefile
+
+__rule__ all: input: "variants/sample1\.vcf”
+
+__rule__ trim:
+
+input:
+
+”reads/sample1\.fastq”
+
+output:
+
+”trimmed\_reads/sample1\-trimmed\.fastq”
+
+shell:
+
+cutadapt \-A TCCGGGTS \-o \{output\} \{input\}
+
+__rule__ align: input: "trimmed\_reads/sample1\-trimmed\.fastq" output: "bam/sample1\.bam" threads: 1 shell: "bwa mem \-t \{threads\} ref\.fa \{input\} | samtools view \-Sb \- > \{output\}”
+
+__rule__ call\_variants: input: "bam/sample1\.bam" output: "variants/sample1\.vcf" shell: "freebayes \-f ref\.fa \{input\} > \{output\}”
+
+Snakemake takes first rule as the target
+
+then constructs graph of dependencies
+
+\{wildcards\} serve as placeholders within rules to operate
+
+on multiple files via pattern matching
+
+---
+
+Snakemake builds the entire pipeline graph automatically.
+
+
+# Snakemake exercises on HPC
+
+Class data:
+
+/project/hpc\_training/reproducibility/snakemake
+
+$ cp \-r /project/hpc\_training/reproducibility/snakemake \.
+
+ \- GCF\_000005845\.2\_ASM584v2\_genomic\.fna \- genome assembly
+
+ \- SRR2584863\_1\.fastq \- fastq sequence file\, paired\-1
+
+ \- SRR2584863\_2\.fastq \- fastq sequence file\, paired\-2
+
+ \- \*\. smk \- snakemake files
+
+ \- config\_variant\.yml \- configuration file
+
+ \- submit\_snakemake\.sh \- sample slurm file
+
+
+
+---
+
+Yet another markup language- YAML Ain't Markup Language
+
+# Running jobs on interactive node
+
+Run interactively \- good for testing
+
+$ ijob \-c 1 \-A hpc\_training \-p interactive \-v \-t 2:00:00
+
+$ cp \-r /project/hpc\_training/reproducibility/snakemake \.
+
+
+
+---
+
+Default execution here is local so everything is running in my ijob session on a compute node. If we wanted to have these processes run non-interactively we would want to make sure we are using the executor flag in our snakemake call: "--executor slurm"
+
+Work in scratch
+
+# Modules
+
+$ module spider \
+
+\- specifics and version of package available
+
+$ module spider snakemake
+
+$ module load snakemake/9\.8\.1
+
+$ module list
+
+$ snakemake \-help
+
+
+
+# Other modules needed for today
+
+$ module load bwa/0\.7\.17
+
+$ module load cutadapt/4\.9
+
+$ module load snakemake/9\.8\.1
+
+$ module load freebayes/1\.3\.10
+
+$ module load samtools/1\.21
+
+
+
+# Running snakemake - genome alignment
+
+Snakefile \- file\.smk\, contains rules for snakemake
+
+$ snakemake \-c 1 \-s align\.smk
+
+\-\-dry\-run \-np good to test first without producing output
+
+\-n only show steps\, don't run\, \-p print shell commands
+
+\-c number of cores
+
+\-s needed if using a named snakefile \(if just called "snakefile"\, don't need the \-s flag\)
+
+$ snakemake \-\-dag | dot \-Tpng > dag\_align\.png
+
+
+
+# Running snakemake - variant detection
+
+Snakefile \- file\.smk\, contains rules for snakemake
+
+$ snakemake \-c 1 \-s variant\-call\.smk
+
+\-\-dry\-run
+
+\-c number of cores
+
+\-s needed if using a named snakefile \(if just called "snakefile"\, don't need\)
+
+$ snakemake \-\-dag \-s variant\-call\.smk | dot \-Tpng > dag\_variant\.png
+
+
+
+# Snakemake Examples on HPC
+
+Not recommended to hard\-code files within snake file
+
+Can organize sample names\, file paths\, and software parameters in a YAML configuration file
+
+YAML \- serialization language that transforms data into a format that can be shared between systems
+
+With snakemake\, configuration file is a reference for the workflow
+
+
+
+---
+
+Yet another markup language- YAML Ain't Markup Language
+Easy to keep things organized within a single file
+While showing, good to have separate config files rather than one huge one and commenting sections out
+
+# Running snakemake with config file
+
+Snakefile \- file\.smk\, contains rules for snakemake
+
+$ snakemake \-c 1 \-s variant\-yml\.smk \-\-configfile config\_variant\.yml
+
+\-\-configfile \- directing snakemake to a config file
+
+\-c number of cores
+
+\-s needed if using a named snakefile
+
+
+
+# Reproducible environments
+
+Snakemake supports reproducible environments\.
+
+Example with Conda:
+
+rule fastqc: input: "reads\.fastq" output: "qc\.html" conda: "~/\.conda/envs/fastqc\_env" \#path to conda environment shell: "fastqc \{input\}"
+
+Benefits: Easy dependency management\, portable workflows
+
+---
+
+.yml file can indicate how to make conda environment and what packages and dependencies you need
+
+# Using Environments
+
+├── Snakefile
+├── config/
+│   └── config\.yml
+├── envs/
+│   └── bwa\.yml
+├── rules/
+│   ├── alignment\.smk
+│   ├── qc\.smk
+│   └── variant\_calling\.smk
+├── scripts/
+│   └── custom\_processing\.py
+├── data/
+│   └── raw/
+├── results/
+│   ├── bam/
+│   ├── qc/
+│   └── variants/
+└── logs/
+
+Can also create a environment\.yml file\, list conda envs and what to install
+
+__name__ : bwa\.yml
+
+__channels:__
+
+\- conda\-forge
+
+\- bioconda
+
+__dependencies__ :
+
+\- bwa=0\.7\.17
+
+
+
+---
+
+.yml file can indicate how to make conda environment and what packages and dependencies you need
+
+# Snakemake with conda environment
+
+$ module load miniforge
+
+$ conda create
+
+$ conda activate
+
+$ snakemake command
+
+$ screen/tmux
+
+\- keeps session running when disconnected
+
+\- make sure to connect to same login node\,
+
+\- confirm login node with: hostname
+
+Can create different conda environment for different rules
+
+# Snakemake and containers
+
+Snakemake also supports containers:
+
+rule align: container: "docker://biocontainers/bwa"
+
+Advantages:
+
+identical software environments
+
+portable across HPC systems
+
+easier collaboration
+
+ __Ruoshi Sun \- Research Computing Workshop Series__
+
+1\. Using Containers on HPC \- Monday\, March 30\, 2026 \- 9:00AM
+
+2\. Building Containers on HPC \- Monday April 6\, 2026 \- 9:00AM
+
+# Best Practices for HPC
+
+Recommendations:
+
+Use threads and resources properly\. Avoid huge single jobs\. Break workflows into modular rules\. Use conda or containers\. Use \-\-dry\-run before submitting large workflows\. Store configuration in YAML files\.
+
+
+
+# Common HPC Pitfalls with workflow managers
+
+Examples:
+
+requesting too many cores per rule
+
+forgetting to specify memory
+
+submitting thousands of tiny jobs
+
+running Snakemake or Nextflow themselves on a login node
+
+
+
+# Key Takeaways with workflow managers
+
+Snakemake & Nextflow provide:
+
+reproducible pipelines
+
+automatic dependency tracking
+
+scalable HPC execution
+
+environment management
+
+workflow portability
+
+
+
+# Nextflow
+
+Snakemake & Nextflow provide:
+
+reproducible pipelines
+
+automatic dependency tracking
+
+scalable HPC execution
+
+environment management
+
+workflow portability
+
+
+
+# What is Nextflow?
+
+Nextflow is a workflow management system that helps automate and organize multi\-step computational pipelines\.
+
+At a high level\, it connects software steps together\, manages how data moves between them\, and handles execution across local machines\, HPC schedulers like SLURM\, or cloud platforms\.
+
+
+
+# Nextflow Pipelines
+
+
+
+* Key concepts:
+ * Processes\, workflows\, and parameters
+* In general\, we are going to:
+ * Create processes to execute desired commands
+ * Specify parameters to represent workflow settings
+ * Define a workflow to execute processes in a specific order
+* Key files:
+ * main\.nf and nextflow\.config
+
+
+
+
+---
+
+Parameters are user-adjustable values that control how a workflow runs. They can specify input files, output locations, software options, reference files, or general pipeline behavior.
+
+# Toy example: print the text "Hello World!"
+
+First\, create a process called HELLO with our shell command:
+
+process HELLO \{
+
+script:
+
+"""
+
+echo "Hello World\!"
+
+"""
+
+\}
+
+Then we execute this process in our workflow:
+
+workflow \{
+
+HELLO\(\)
+
+\}
+
+
+
+---
+
+Let's start with a very simple toy example for echo'ing the text "Hello World!" And then we'll build to our bioinformatics example.
+
+# Create a new file called main.nf
+
+process HELLO \{
+
+script:
+
+"""
+
+echo "Hello World\!"
+
+"""
+
+\}
+
+workflow \{
+
+HELLO\(\)
+
+\}
+
+
+
+---
+
+We can create a new file called main.nf with these lines.
+
+Show and execute main.nf in terminal. Show where the file goes. Went to .command.out file in 'work' directory
+ for the specific process
+
+# Let's make some changes
+
+__process__ hello \{ __output__ : path 'hello\.txt' script: """ echo 'Hello world\!' > hello\.txt """\}
+
+
+
+---
+
+We want to send the text to a file called 'hello.txt.' Now we can update our shell command to send the text to a file, and we can add an output in our process to define our file name and since out output is a file, we'll specify the type of output as a path.
+
+Run main.nf in terminal and show it still went to 'work' directory
+
+This was better, but we still have to dig around for the file, so let's add one more thing to our process.
+
+# Add a publishDir for output file destination
+
+__process__ hello \{ publishDir "results/" \, mode: "copy"
+
+__ output__ : path 'hello\.txt' script: """ echo 'Hello world\!' > hello\.txt """\}
+
+
+
+---
+
+Now let's try sending our output to a directory called 'results' - we can add a publishDir to our process and specify the mode "copy" is safest, but you can do other things like move or even create links to the file.
+Re-run the main.nf in the terminal and show where the file goes to results but since we did copy, it still does go to work. Point out that we need to be mindful of any extra data we're creating so we don't unnecessarily have duplicates for everything.
+
+# Let's look at our snakemake "trim" rule from earlier
+
+__rule__ trim:
+
+input:
+
+”reads/sample1\.fastq”
+
+output:
+
+”trimmed\_reads/sample1\-trimmed\.fastq”
+
+shell:
+
+cutadapt \-A TCCGGGTS \-o \{output\} \{input\}
+
+
+
+---
+
+Here we specified our inputs/outputs and our shell command.
+
+# What do we need to update in Nextflow?
+
+__process__ HELLO \{ publishDir "results/" \, mode: "copy"
+
+__ __ output: path 'hello\.txt' script: """ echo 'Hello world\!' > hello\.txt """\}
+
+
+
+---
+
+So, looking at our HELLO process, what do we need to add? We already have a publishDir, an output, and script, so let's update those for cutadapt.
+
+# Update for running cutadapt
+
+__process__ CUTADAPT \{ publishDir "results/" \, mode: "copy"
+
+output: path 'trimmed\.fastq' script: """ cutadapt \-a AACCGGTT \-o trimmed\.fastq ~/sample1\.fastq """\}
+
+workflow \{
+
+CUTADAPT\(\)
+
+\}
+
+
+
+---
+
+We can keep 'results' as our publishDir for this example, but we'll need to change our output to trimmed.fastq and we'll change the command for cutadapt with our adapter and our input and output file names. Because Nextflow executes each task in its own work directory, we need to provide the full path. Our workflow just becomes running the CUTADAPT process.
+Does this work? Yes, it does. However, we are hard-coding everything, and this is not really flexible and does not allow us to scale.
+
+# More common approach for input files
+
+__process__ CUTADAPT \{ publishDir "results/" \, mode: "copy"
+
+input: path reads\_var
+
+output: path 'trimmed\.fastq' script: """ cutadapt \-a AACCGGTT \-o trimmed\.fastq $reads\_var """\}
+
+workflow \{
+
+CUTADAPT\(Channel\.fromPath\('~/sample1\.fastq'\, checkIfExists: true\)\)
+
+\}
+
+
+
+---
+
+A better approach is to pass the file into the process with Channel.fromPath() and use input: path reads. The "input:" declares an input variable, not a literal source file location. We then use this variable "reads" in our shell command, where $reads refers to the local process input variable — the actual input file that was provided to Nextflow for this task via our workflow. We can also use the reads variable for other things, like dynamically naming output files.
+
+
+# Dynamically scaling to many samples
+
+__process__ CUTADAPT \{ publishDir "results/"\, mode: "copy"
+
+input: path reads\_var
+
+output: path "$\{reads\_var\.simpleName\}\_trimmed\.fastq"
+
+script: """ cutadapt \-a AACCGGTT \-o $\{reads\_var\.simpleName\}\_trimmed\.fastq $reads\_var """\}
+
+workflow \{
+
+CUTADAPT\(Channel\.fromPath\('\*\.fastq'\, checkIfExists: true\)\)
+
+\}
+
+
+
+---
+
+Now we can start to use the flexibility nextflow provides to name our output files dynamically based on sample name and we also can start to scale up by using the wildcard to grab all the fastq files in our example 'reads' directory. Here nextflow is going to create a new separate process for each of our samples.
+
+
+# Parameter options for input files
+
+Add a parameter for '\-\-reads' in your 'nextflow run' command
+
+Add a params\.reads at the top of your main\.nf file
+
+Add a params\.reads to a nextflow\.config file
+
+Works for one file \('reads/sample1\.fastq'\) or many \('reads/\*\.fastq'\)
+
+
+
+---
+
+As with many things with Nextflow, we have multiple different ways we can accomplish this. We will talk about nextflow.config shortly.
+
+# Less hard-coding = more reproducibility
+
+From: workflow \{ CUTADAPT\(Channel\.fromPath\('~/sample1\.fastq'\, checkIfExists: true\)\)\}
+
+To:
+
+workflow \{ CUTADAPT\(Channel\.fromPath\(params\.reads\, checkIfExists: true\)\)\}
+
+
+
+---
+
+And if we use one of those parameter methods, instead of our workflow having a hard-coded path for our inputs, we can dynamically provide our input file names and clean things up in our workflow even further.
+
+# Loading software – main.nf
+
+Use a 'beforeScript' in the CUTADAPT process in main\.nf
+
+beforeScript runs specified shell command\(s\) before running the script command
+
+Load the cutadapt module: beforeScript 'module load cutadapt'
+
+Can also do other things like export variables or create directories
+
+beforeScript """ module purge module load cutadapt mkdir results export PATH="$PATH:/opt/tools" """
+
+
+
+---
+
+We can definitely load the software in our process, but we just cleaned that thing up, so let's put it somewhere better to keep our main.nf focused on workflow logic. To do this, let's go ahead and start to build a nextflow.config file.
+
+# Loading software – nextflow.config
+
+Again\, we use a 'beforeScript' specific to the CUTADAPT process
+
+process \{ withName: CUTADAPT \{ beforeScript = ''' module purge module load cutadapt
+
+'''
+
+
+
+---
+
+Now when we specifically run our CUTADAPT process, these commands will run before our script command and set up our process environment. Ok, so now we have our software dialed in for cutadapt. But we need to think about where we are running these processes. By default, nextflow is running shell commands locally, so that means if we're just at the command line, we'd be running the processes on the login nodes, which is a no-no.
+
+# Adding SLURM options – nextflow.config
+
+process \{ withName: CUTADAPT \{ beforeScript = ''' module purge module load cutadapt
+
+'''
+
+executor = 'slurm' queue = 'standard' cpus = 2 mem = '16 GB' time = '1h' clusterOptions = '\-\-account=hpc\_build'
+
+\}
+
+
+
+---
+
+So, we need to let Nextflow know that we want to use SLURM to execute our processes and with do this by specifying SLURM as our executor. We can also use this to specify various other options – nextflow doesn't have explicit options for all possible slurm commands, so we can supplement with any additional options we need with 'clusterOptions.' Again, there's multiple ways to configure everything – you can also do global slurm options, but often different parts of the workflow are going to need different resources. And we could potentially specify these slurm options in the CUTADAPT process in our main.nf, but we're trying to keep that tidy and focused on the workflow logic.
+
+
+# Now we have:
+
+Workflow logic in main\.nf
+
+Software and slurm options in nextflow\.config
+
+
+
+---
+
+As you can imagine, there are also multiple ways to set this up.
+
+# Extend to CUTADAPT → BWA_ALIGN → FREEBAYES
+
+
+
+* Same rules apply – largely rinse and repeat for additional processes
+* Create processes for each step: inputs/outputs\, commands\, etc\.
+* Software and slurm options in nextflow\.config
+* Main difference is our workflow \- more processes and channels
+ * Send channel into process
+ * Process produces output
+ * Output becomes new channel for next process\.
+
+
+
+
+---
+
+With Nextflow, channels carry data and processes do work on that data. You link them together by sending a channel into a process, and if that process produces output, its output can become a new channel for the next process.
+
+# Workflow for CUTADAPT → BWA_ALIGN → FREEBAYES
+
+workflow \{
+
+reads\_ch = Channel\.fromPath\("$\{params\.reads\_dir\}/\*\.fastq"\, checkIfExists: true\)
+
+trimmed\_ch = CUTADAPT\(reads\_ch\)
+
+aligned\_ch = BWA\_ALIGN\(trimmed\_ch\)
+
+FREEBAYES\(aligned\_ch\)
+
+\}
+
+
+
+---
+
+Here's how we could link the trim, align and variant calling together. So now we'll put it all together and run the entire workflow from end to end on the system.
+
+# Additional links
+
+[https](https://nf-co.re/rnaseq/3.23.0/)[://nf\-co\.re/rnaseq/3\.23\.0/](https://nf-co.re/rnaseq/3.23.0/)
+
+[https://training\.nextflow\.io](https://training.nextflow.io)
+
+https://github\.com/nextflow\-io/nextflow
+
+
+
+# Workflows for computational data analysis
+
+
+
+* https://github\.com/common\-workflow\-language/common\-workflow\-language/wiki/Existing\-Workflow\-systems
+* https://github\.com/pditommaso/awesome\-pipeline
+* Galaxy platform \- bioinformatic software\, pipeline and workflows:
+ * https://usegalaxy\.org
+
+
+
+
diff --git a/content/notes/containers-for-hpc/_index.md b/content/notes/containers-for-hpc/_index.md
index 2dcd0a32..1356039a 100644
--- a/content/notes/containers-for-hpc/_index.md
+++ b/content/notes/containers-for-hpc/_index.md
@@ -1,6 +1,5 @@
---
title: "Software Containers for HPC"
-authors: [rs]
toc: true
type: docs
date: "2025-05-15T00:00:00"
diff --git a/content/notes/containers-for-hpc/using.md b/content/notes/containers-for-hpc/using.md
index 13b73770..c976f500 100755
--- a/content/notes/containers-for-hpc/using.md
+++ b/content/notes/containers-for-hpc/using.md
@@ -215,10 +215,10 @@ Currently Loaded Modules:
```bash
module purge
module list
- module load tensorflow
+ module load pytorch
```
-1. Check the versions of tensorflow via `module spider tensorflow`. How would you load a non-default version?
-1. What is the default command of the tensorflow container? Where was it pulled from?
+1. Check the versions of pytorch via `module spider pytorch`. How would you load a non-default version?
+1. What is the default command of the pytorch container? Where was it pulled from?
---
diff --git a/content/notes/matlab-parallel-programming/_index.md b/content/notes/matlab-parallel-programming/_index.md
index 164eb22c..6cdcbdef 100644
--- a/content/notes/matlab-parallel-programming/_index.md
+++ b/content/notes/matlab-parallel-programming/_index.md
@@ -1,9 +1,6 @@
---
title: Matlab Parallel Programming
date: 2024-11-16T20:28:40Z
-authors: [teh]
-categories: [matlab, parallel computing]
-tags: [matlab, Parallel_Computing]
type: docs
weight: 1
diff --git a/content/notes/multigpu-inference/_index.md b/content/notes/multigpu-inference/_index.md
index c7d48fde..639f3d32 100644
--- a/content/notes/multigpu-inference/_index.md
+++ b/content/notes/multigpu-inference/_index.md
@@ -2,8 +2,6 @@
title: Multi-GPU LLM Inference
date: 2025-07-08T20:40:54Z
authors: [uvarc]
-categories: ["HPC"]
-tags: ["HPC"]
type: docs
weight: 1
diff --git a/content/notes/python-high-performance/_index.md b/content/notes/python-high-performance/_index.md
index 60806e31..a00a1d5c 100644
--- a/content/notes/python-high-performance/_index.md
+++ b/content/notes/python-high-performance/_index.md
@@ -3,8 +3,6 @@ title: "High Performance Programming in Python"
type: docs
toc: true
date: "2020-11-17T00:00:00"
-tags: ["Programming","Python","HPC"]
-categories: ["Programming","Python","HPC"]
weight: 1
menu:
hp-python:
diff --git a/content/notes/pytorch-hpc/_index.md b/content/notes/pytorch-hpc/_index.md
index 8b5e00c0..2cb144f8 100644
--- a/content/notes/pytorch-hpc/_index.md
+++ b/content/notes/pytorch-hpc/_index.md
@@ -2,8 +2,6 @@
date : "2025-02-26T00:00:00-05:00"
title : "Introduction to PyTorch for HPC"
#summary: "An introduction to using PyTorch on an HPC system."
-categories: ["Deep Learning","Python","HPC","Machine Learning"]
-tags: [Deep_learning,Machine_learning,Python,HPC]
toc: true
type: docs
weight: 1
diff --git a/content/notes/qiime2/_index.md b/content/notes/qiime2/_index.md
index cedaab3c..2659a8ea 100644
--- a/content/notes/qiime2/_index.md
+++ b/content/notes/qiime2/_index.md
@@ -1,7 +1,6 @@
---
title: Introduction to QIIME 2
date: 2025-06-13T00:00:00-05:00
-authors: []
type: docs
weight: 1
diff --git a/content/notes/rio-intro/_index.md b/content/notes/rio-intro/_index.md
index 8b0f6f34..f65e72f4 100644
--- a/content/notes/rio-intro/_index.md
+++ b/content/notes/rio-intro/_index.md
@@ -2,9 +2,6 @@
title: Intro to High Security HPC (Rio)
authors: [as, pbo, cmd]
date: 2025-11-12T03:53:56Z
-authors: [as, pbo, cmd]
-categories: [HPC Intro]
-tags: [HPC_Intro]
type: docs
weight: 1
diff --git a/content/notes/seurat-bioinformatics/_index.md b/content/notes/seurat-bioinformatics/_index.md
index a45588bd..6257d34f 100644
--- a/content/notes/seurat-bioinformatics/_index.md
+++ b/content/notes/seurat-bioinformatics/_index.md
@@ -1,9 +1,6 @@
---
title: Bioinformatics Data Visualization with Seurat
date: 2025-07-16T17:52:00Z
-authors: [mab, gka]
-categories: ["Bioinformatics"]
-tags: ["Bioinformatics"]
type: docs
weight: 1
diff --git a/content/tutorials/bioinfo-intro/index.md b/content/tutorials/bioinfo-intro/index.md
index 03c55a6a..7aba9b27 100644
--- a/content/tutorials/bioinfo-intro/index.md
+++ b/content/tutorials/bioinfo-intro/index.md
@@ -1,7 +1,7 @@
---
title: "Introduction to Bioinformatics Tools for HPC"
date: "2025-05-14T00:00:00"
-authors: [mab]
+authors: [mab, dat]
categories: ["Bioinformatics"]
tags: ["Bioinformatics"]
summary: "An introduction to bioinformatics concepts, sequencing technologies, common analysis tools, and their use on high-performance computing systems."
diff --git a/content/tutorials/bioinfo-reproducibility/bioinfo-reproducibility.pdf b/content/tutorials/bioinfo-reproducibility/bioinfo-reproducibility.pdf
new file mode 100644
index 00000000..98cce589
Binary files /dev/null and b/content/tutorials/bioinfo-reproducibility/bioinfo-reproducibility.pdf differ
diff --git a/content/tutorials/bioinfo-reproducibility/index.md b/content/tutorials/bioinfo-reproducibility/index.md
new file mode 100644
index 00000000..d609fe37
--- /dev/null
+++ b/content/tutorials/bioinfo-reproducibility/index.md
@@ -0,0 +1,18 @@
+---
+title: Reproducibility in Bioinformatics
+summary: "This tutorial introduces strategies to improve reusability and transparency in bioinformatics research. It presents workflow tools, including Nextflow and Snakemake, that can help to address challenges of reproducibility."
+
+# Schedule page publish date (NOT talk date).
+publishDate: "2025-01-30T00:00:00"
+
+authors: [mab, dat]
+tags: [Bioinformatics]
+categories: ["Bioinformatics"]
+
+weight: 540
+
+notes: bioinfo-reproducibility
+
+pdf: bioinfo-reproducibility
+
+---
diff --git a/content/tutorials/pytorch-hpc/index.md b/content/tutorials/pytorch-hpc/index.md
index e99d4e1e..0831279b 100755
--- a/content/tutorials/pytorch-hpc/index.md
+++ b/content/tutorials/pytorch-hpc/index.md
@@ -4,8 +4,8 @@ summary: "This tutorial provides a practical introduction to building artificial
date: 2024-06-27T21:13:14-05:00
authors: [abd, bmr]
-categories: ["GPU","Machine Learning","Deep Learning"]
-tags: ["GPU","Machine_Learning","Deep_Learning"]
+categories: ["GPU","Machine Learning","Deep Learning", "HPC"]
+tags: ["GPU","Machine_Learning","Deep_Learning", "HPC"]
weight: 330