Skip to the content.

VCF

How to create

#!/usr/bin/env bash
#SBATCH --export=NONE   # required when using 'module'

hostname
echo "Slurm job id:${SLURM_JOBID}:"
date

#		NOPE. Will be "slurm_script" at runtime
if [ -n "${SLURM_JOB_NAME}" ] ; then
	script=${SLURM_JOB_NAME}
else
	script=$( basename $0 )
fi

#	PWD preserved by slurm for where job is run? I guess so.
arguments_file=${PWD}/${script}.arguments

if [ -n "${SLURM_ARRAY_TASK_ID}" ] ; then

	set -e	#	exit if any command fails
	set -u	#	Error on usage of unset variables
	set -o pipefail
	if [ -n "$( declare -F module )" ] ; then
		echo "Loading required modules"
		module load CBI bcftools/1.15.1
	fi
	set -x	#	print expanded command before executing it

	line=${SLURM_ARRAY_TASK_ID:-1}
	echo "Running line :${line}:"

	#	Use a 1 based index since there is no line 0.

	args=$( sed -n "$line"p ${arguments_file} )
	echo $args

	if [ -z "${args}" ] ; then
		echo "No line at :${line}:"
		exit
	fi

	echo $args


	outdir=${PWD}/vcf
	mkdir -p ${outdir}
	bam=${PWD}/in/${args}
	basename=$( basename ${bam} .bam )

	#ref=/francislab/data1/refs/sources/hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.chrXYM_alts.fa
	ref=/francislab/data1/refs/sources/hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/latest/hg19.chrXYMT_alts.fa


	ntasks=${SLURM_NTASKS:-2}

	#	should probably add -q 60

	f=${outdir}/${basename}.vcf.gz
	if [ -f $f ] && [ ! -w $f ] ; then
		echo "Write-protected $f exists. Skipping."
	else
		bcftools mpileup --threads $[ntasks/2] -q 60 -Ou -f ${ref} ${bam} \
			| bcftools call --threads $[ntasks/2] -m -Oz -o ${f}	#${outdir}/${basename}.vcf.gz

		#	want all positions. This is just variants.
		#		| bcftools call -mv -Oz -o ${outdir}/${basename}.vcf.gz

		chmod a-w ${f}	#${outdir}/${basename}.vcf.gz
	fi

	f=${outdir}/${basename}.vcf.gz.csi
	if [ -f $f ] && [ ! -w $f ] ; then
		echo "Write-protected $f exists. Skipping."
	else
		bcftools index --threads ${ntasks} ${outdir}/${basename}.vcf.gz

		chmod a-w ${f}	#	${outdir}/${basename}.vcf.gz.csi
	fi

	echo "Complete"

else

	date=$( date "+%Y%m%d%H%M%S" )

	cat <<- EOF > ${arguments_file}
SFHH011AC
SFHH011BB
SFHH011BZ
SFHH011CH
SFHH011I
SFHH011S
SFHH011Z
SFHH011BO
	EOF

	max=$( cat ${arguments_file} | wc -l )

	mkdir ${PWD}/logs/
	date=$( date "+%Y%m%d%H%M%S%N" )
	sbatch --mail-user=$(tail -1 ~/.forward)  --mail-type=FAIL \
		--array=1-${max}%1 --job-name=${script} \
		--output="${PWD}/logs/${script}.${date}.%A_%a.out" \
		--time=1440 --nodes=1 --ntasks=8 --mem=60G \
		$( realpath ${0} )

fi

#module load CBI bcftools/1.11

#	Can fail on weird CIGAR strings.
#	bcftools: sam.c:3948: resolve_cigar2: Assertion `k < c->n_cigar' failed.
#	use module load bcftools/1.10.2 until 1.12 is released

How to compare

for a in out/*.vcf.gz; do echo $a
for b in out/*.vcf.gz; do echo $b
if [ ${a} != ${b} ] ; then
bcftools isec -n =2 ${a} ${b} | wc -l > ${a}.$(basename ${b}).shared_isec_count
fi
done ; done

for a in out/*.vcf.gz; do echo $a
for b in out/*.vcf.gz; do echo $b
if [ ${a} != ${b} ] ; then
bcftools isec -n -1 ${a} ${b} | wc -l > ${a}.$(basename ${b}).diff_isec_count
fi
done ; done

They can be merged into 1 giant vcf file or compared with something like ... bedtools intersect ... or bcftools isec ...

put in decreasing order. WHY?

nohup bcftools merge -o 120207.vcf.gz -Oz newvcf/120207.100.vcf.gz newvcf/120207.80?.vcf.gz newvcf/120207.60?.vcf.gz newvcf/120207.50?.vcf.gz &
nohup bcftools merge -o 122997.vcf.gz -Oz newvcf/122997.100.vcf.gz newvcf/122997.80?.vcf.gz newvcf/122997.60?.vcf.gz newvcf/122997.50?.vcf.gz &
nohup bcftools merge -o 186069.vcf.gz -Oz newvcf/186069.100.vcf.gz newvcf/186069.80?.vcf.gz newvcf/186069.60?.vcf.gz newvcf/186069.50?.vcf.gz &
for f in newvcf/*.*.vcf.gz; do echo $f ; bcftools view --no-header $f | wc -l > $f.count ; done

for f in newvcf/*.*.vcf.gz; do echo $f ; bcftools isec -C ${f%%.*}.100.vcf.gz ${f} | wc -l > $f.100_isec_count ; done
for f in newvcf/*.*.vcf.gz; do echo $f ; bcftools isec -C ${f} ${f%%.*}.100.vcf.gz | wc -l > $f.isec_100_count ; done



for f in newvcf/*.*.vcf.gz; do echo $f ; bcftools isec -n =2 ${f} ${f%%.*}.100.vcf.gz | wc -l > $f.shared_100_isec_count ; done
for f in newvcf/*.*.vcf.gz; do echo $f ; bcftools isec -n -1 ${f} ${f%%.*}.100.vcf.gz | wc -l > $f.diff_100_isec_count ; done