# iMOKA
# My scratch version of the pipeline
#!/usr/bin/env bash
#SBATCH --export=NONE # required when using 'module' IN THIS SCRIPT OR ANY THAT ARE CALLED

# iMOKA scratch pipeline driver: preprocess -> create -> reduce ->
# aggregate -> random_forest, run inside a Singularity image, staging
# data through the node-local ${TMPDIR} scratch space.

hostname

set -e # exit if any command fails
set -u # Error on usage of unset variables
set -o pipefail

if [ -n "$( declare -F module )" ] ; then
	echo "Loading required modules"
	#module load CBI samtools
fi
set -x

# Defaults; all overridable via command-line options below.
threads=${SLURM_NTASKS:-1}
img=/francislab/data2/refs/singularity/iMOKA_extended-1.1.4.img
k=31
mem=7 # GB per thread (keep 7)
step="preprocess"
source_file="${PWD}/source.tsv"

# Unrecognized options are collected and passed through.
SELECT_ARGS=""
while [ $# -gt 0 ] ; do
	case $1 in
		--dir)
			shift; dir=$1; shift;;
		--k)
			shift; k=$1; shift;;
		--source_file)
			shift; source_file=$1; shift;;
		--step)
			shift; step=$1; shift;;
		--threads)
			shift; threads=$1; shift;;
		*)
			SELECT_ARGS="${SELECT_ARGS} $1"; shift;;
	esac
done

# Derived environment is computed AFTER option parsing so a --threads
# override is reflected in OMP_NUM_THREADS / IMOKA_MAX_MEM_GB.
# (Previously these were exported before parsing and went stale.)
export SINGULARITY_BINDPATH=/francislab,/scratch
export OMP_NUM_THREADS=${threads}
export IMOKA_MAX_MEM_GB=$((threads*(mem-1)))
# On ANY exit (success or failure) make scratch contents writable so the
# scheduler's scratch cleanup can remove them. Single quotes defer
# ${TMPDIR} expansion until the trap fires and keep the path safe.
trap 'chmod -R a+w "${TMPDIR}"' EXIT

date

# Stage the mapper config into scratch; the aggregate step reads it there.
cp "${dir}/config.json" "${TMPDIR}/"

if [ "${step}" == "preprocess" ] ; then
	echo "Preprocessing"
	cp "${source_file}" "${TMPDIR}/"
	# Copy raw data defined in source file???
	singularity exec "${img}" preprocess.sh \
		--input-file "${source_file}" \
		--kmer-length "${k}" \
		--ram $((threads*mem)) \
		--threads "${threads}"
	cp -r "${TMPDIR}/preprocess" "${dir}/"
	## create_matrix.tsv will include the temp scratch path, so rewrite it
	## to the permanent dir before saving. (sed uses ' as the s/// delimiter
	## because the paths contain slashes.)
	sed "s'${TMPDIR}'${dir}'" "${TMPDIR}/create_matrix.tsv" > "${dir}/create_matrix.tsv"
	step="create"
else
	echo "Skipping preprocessing. Copying in data."
	# Rewrite the saved permanent paths back to this job's scratch dir.
	sed "s'${dir}'${TMPDIR}'" "${dir}/create_matrix.tsv" > "${TMPDIR}/create_matrix.tsv"
	cp -Lr "${dir}/preprocess" "${TMPDIR}/"
fi
date

if [ "${step}" == "create" ] ; then
	echo "Creating"
	# Build the k-mer count matrix from the preprocessed per-sample counts.
	singularity exec "${img}" iMOKA_core create \
		--input "${TMPDIR}/create_matrix.tsv" \
		--output "${TMPDIR}/matrix.json"
	cp "${TMPDIR}/matrix.json" "${dir}/"
	step="reduce"
else
	echo "Skipping create. Copying in matrix.json"
	# matrix.json embeds absolute scratch paths from the job that created it;
	# rewrite any /scratch/gwendt/<jobid>/ prefix to this job's TMPDIR.
	# NOTE(review): the username 'gwendt' is hard-coded here — this rewrite
	# silently does nothing for any other user. Confirm before reuse.
	sed "s'/scratch/gwendt/[[:digit:]]*/'${TMPDIR}/'g" "${dir}/matrix.json" > "${TMPDIR}/matrix.json"
fi
date

if [ "${step}" == "reduce" ] ; then
	echo "Reducing"
	# Feature reduction: filter the matrix down to informative k-mers.
	singularity exec "${img}" iMOKA_core reduce \
		--input "${TMPDIR}/matrix.json" \
		--output "${TMPDIR}/reduced.matrix"
	# Glob deliberately unquoted: copies reduced.matrix and its sidecars.
	cp "${TMPDIR}"/reduced.matrix* "${dir}/"
	step="aggregate"
else
	echo "Skipping reduce. Copying in reduced matrix"
	# NOTE(review): '/scratch/gwendt/' is a hard-coded username path — the
	# rewrite is a no-op for other users. Confirm before reuse.
	sed "s'/scratch/gwendt/[[:digit:]]*/'${TMPDIR}/'g" "${dir}/reduced.matrix.json" > "${TMPDIR}/reduced.matrix.json"
	# Only line 1 of reduced.matrix carries the path reference ("1s").
	sed "1s'/scratch/gwendt/[[:digit:]]*/'${TMPDIR}/'g" "${dir}/reduced.matrix" > "${TMPDIR}/reduced.matrix"
fi
date

if [ "${step}" == "aggregate" ] ; then
	echo "Aggregating"
	# Aggregate overlapping k-mers into graphs/sequences and map them.
	singularity exec "${img}" iMOKA_core aggregate \
		--input "${TMPDIR}/reduced.matrix" \
		--count-matrix "${TMPDIR}/matrix.json" \
		--mapper-config "${TMPDIR}/config.json" \
		--output "${TMPDIR}/aggregated" \
		--origin-threshold 95
	# Glob deliberately unquoted: copies all aggregated.* outputs.
	cp "${TMPDIR}"/aggregated* "${dir}/"
	step="random_forest"
#else
	# NOTE(review): no else-branch stages aggregated* back into TMPDIR, so
	# starting directly at --step random_forest is currently unsupported.
fi
date

if [ "${step}" == "random_forest" ] ; then
	echo "Modeling"
	# Train random forest models (-r 50 rounds) on the aggregated k-mer matrix.
	singularity exec "${img}" random_forest.py \
		--threads "${threads}" \
		-r 50 \
		"${TMPDIR}/aggregated.kmers.matrix" "${TMPDIR}/output"
	# Glob deliberately unquoted: copies output and output.* files.
	cp -r "${TMPDIR}"/output* "${dir}/"
#else
fi

echo "Complete"
date

# Stop here — everything below is unreachable example usage kept as notes.
exit
# ---------------------------------------------------------------------------
# UNREACHABLE: example sbatch submission commands for each comparison,
# kept below the 'exit' above as copy/paste notes.
# Each requests node-local scratch (--gres=scratch) and emails on failure;
# the address is taken from the last line of ~/.forward.
# ---------------------------------------------------------------------------
date=$( date "+%Y%m%d%H%M%S" )
sbatch="sbatch --mail-user=$(tail -1 ~/.forward) --mail-type=FAIL "
# gender_test: smaller allocation (32 tasks / 240G), full run from preprocess.
${sbatch} --job-name=TiMOKAscratch --time=11520 --nodes=1 --ntasks=32 --mem=240G --gres=scratch:1500G --output=${PWD}/iMOKA_scratch.31.gender_test.${date}.txt ${PWD}/iMOKA_scratch.bash --dir ${PWD}/31.gender_test
date=$( date "+%Y%m%d%H%M%S" )
sbatch="sbatch --mail-user=$(tail -1 ~/.forward) --mail-type=FAIL "
# primary_diagnosis: larger allocation, resumes at the 'create' step.
${sbatch} --job-name=TiMOKAscratch --time=20160 --nodes=1 --ntasks=64 --mem=499G --gres=scratch:1500G --output=${PWD}/31.primary_diagnosis/iMOKA_scratch.${date}.txt ${PWD}/iMOKA_scratch.bash --dir ${PWD}/31.primary_diagnosis --step create
date=$( date "+%Y%m%d%H%M%S" )
sbatch="sbatch --mail-user=$(tail -1 ~/.forward) --mail-type=FAIL "
# WHO_groups: resumes at the 'create' step.
${sbatch} --job-name=WHO_groups --time=20160 --nodes=1 --ntasks=64 --mem=499G --gres=scratch:1500G --output=${PWD}/31.WHO_groups/iMOKA_scratch.${date}.txt ${PWD}/iMOKA_scratch.bash --dir ${PWD}/31.WHO_groups --step create
date=$( date "+%Y%m%d%H%M%S" )
sbatch="sbatch --mail-user=$(tail -1 ~/.forward) --mail-type=FAIL "
# IDH: resumes at the 'create' step.
${sbatch} --job-name=IDH --time=20160 --nodes=1 --ntasks=64 --mem=499G --gres=scratch:1500G --output=${PWD}/31.IDH/iMOKA_scratch.${date}.txt ${PWD}/iMOKA_scratch.bash --dir ${PWD}/31.IDH --step create
date=$( date "+%Y%m%d%H%M%S" )
sbatch="sbatch --mail-user=$(tail -1 ~/.forward) --mail-type=FAIL "
# IDH_1p19q_status: resumes at the 'create' step.
${sbatch} --job-name=IDH_1p19q_status --time=20160 --nodes=1 --ntasks=64 --mem=499G --gres=scratch:1500G --output=${PWD}/31.IDH_1p19q_status/iMOKA_scratch.${date}.txt ${PWD}/iMOKA_scratch.bash --dir ${PWD}/31.IDH_1p19q_status --step create
20230802 Note
I am not sure what the repercussions of doing or not doing this are.
If the data is stranded paired-end sequencing, the user can reverse complement one or both of the files using SeqKit.
As iMOKA is a k-mer analysis, pairedness is irrelevant; however, I think it treats k-mers as distinct even if they are reverse complements of each other. (The term for collapsing a k-mer and its reverse complement into one representative is "canonical" k-mers.)
This is unlike MetaGO and jellyfish, where reverse complements of reads are taken into consideration: a k-mer and its reverse complement are considered the same object.
It seems that passing both files, semicolon separated, is the way to go. The above concern is irrelevant and likely applied to an older version.