GWAS
Use plink 1.9???
Newer versions produce different output.
Using 1000genomes, for example
aws s3 sync s3://1000genomes/release/20130502/ /francislab/data1/raw/1000genomes/release/20130502/ --exclude "*supporting/*"
Sample,Family ID,Population,Population Description,Gender,Relationship,Unexpected Parent/Child ,Non Paternity,Siblings,Grandparents,Avuncular,Half Siblings,Unknown Second Order,Third Order,Other Comments
HG00096,HG00096,GBR,British in England and Scotland,male,,,,,,,,,,
HG00097,HG00097,GBR,British in England and Scotland,female,,,,,,,,,,
HG00098,HG00098,GBR,British in England and Scotland,male,,,,,,,,,,
HG00099,HG00099,GBR,British in England and Scotland,female,,,,,,,,,,
HG00100,HG00100,GBR,British in England and Scotland,female,,,,,,,,,,
HG00101,HG00101,GBR,British in England and Scotland,male,,,,,,,,,,
Population Description Population Code Super Population DNA from Blood Offspring available from trios Pilot Samples Phase1 Samples Final Phase Samples Total
Chinese Dai in Xishuangbanna, China CDX EAS no yes 0 0 99 99
Han Chinese in Bejing, China CHB EAS no no 91 97 103 106
Japanese in Tokyo, Japan JPT EAS no no 94 89 104 105
Description Population Code
East Asian EAS
South Asian SAS
African AFR
Create own metadata with Sample, Gender, Population, Population Description, Super Population, Super Population Description by merging 20130606_sample_info\ -\ Sample\ Info.csv, 20131219.populations.tsv and 20131219.superpopulations.tsv
awk -F"\t" '( ARGIND == 1 ){FS="\t";s[$2]=$1} ( ARGIND == 2 ){FS="\t";p[$2]=$3; print($1,$2,$3,s[$3])}' \
20131219.superpopulations.tsv 20131219.populations.tsv
echo -e "Sample\tFamily\tGender\tPopulation\tPopulation Description\tSuper Population\tSuper Population Description" > metadata.tsv
awk '( ARGIND == 1 ){FS="\t";s[$2]=$1} ( ARGIND == 2 ){FS="\t";p[$2]=$3} ( ARGIND == 3 ){FPAT="([^,]*)|(\"[^\"]+\")";OFS="\t"; print( $1, $2, $5, $3, $4, p[$3], s[p[$3]]) } ' 20131219.superpopulations.tsv 20131219.populations.tsv 20130606_sample_info\ -\ Sample\ Info.csv | tail -n +2 | sed 's/"//g' >> metadata.tsv
#
# 1=male, 2=female, 0=unknown
#
# Use sample id as family id
awk 'BEGIN{FS=OFS="\t"}(NR>1){ s=($3=="male")?1:2; print $1,$1,s >> $6"_ids.txt" }' \
/francislab/data1/raw/1000genomes/metadata.tsv
awk 'BEGIN{FS=OFS="\t"}(NR>1){ s=($3=="male")?1:2; print $1,$1,s >> $6"_"$4"_ids.txt" }' \
/francislab/data1/raw/1000genomes/metadata.tsv
#!/usr/bin/env bash
date=$( date "+%Y%m%d%H%M%S" )
basedir=/francislab/data1/raw/1000genomes/gwas
dir=${basedir}/20200402
for population in afr amr eas eur sas ; do
echo $population
outdir="${dir}/${population}"
mkdir -p "${outdir}"
for vcf in /francislab/data1/raw/1000genomes/release/20130502/ALL.chr*vcf.gz ; do
echo $vcf
base=$( basename $vcf )
base=${base%.phase3*}
base=${base#ALL.}
echo $base
# -l nodes=1:ppn=${threads} -l vmem=${vmem}gb \
outbase="${outdir}/${base}.plink-make-bed"
qsub -N ${population}:${base} \
-o ${outbase}.${date}.out.txt -e ${outbase}.${date}.err.txt \
~/.local/bin/plink.bash \
-F "--check ${outdir}/${base}.bed \
--make-bed \
--keep ${basedir}/${population^^}_ids.txt \
--update-sex ${basedir}/${population^^}_ids.txt \
--vcf ${vcf} \
--out ${outdir}/${base}"
done ; done
#!/usr/bin/env bash
hostname
set -e # exit if any command fails
set -u # Error on usage of unset variables
set -o pipefail
set -x
plot_prefix=""
while [ $# -gt 0 ] ; do
case $1 in
--out)
shift; out=$1; shift;;
--pheno)
shift; pheno=$1; shift;;
--covar)
shift; covar=$1; shift;;
--beddir)
shift; beddir=$1; shift;;
--plot_prefix)
shift; plot_prefix=$1; shift;;
esac
done
## 0. Create job-specific scratch folder that ...
SCRATCH_JOB=/scratch/$USER/job/$PBS_JOBID
mkdir -p $SCRATCH_JOB
## ... is automatically removed upon exit
## (regardless of success or failure)
trap "{ cd /scratch/; chmod -R +w $SCRATCH_JOB/; \rm -rf $SCRATCH_JOB/ ; }" EXIT
pheno_name=$(basename ${pheno} )
f="${out}/${pheno_name}.for.manhattan.plot.png"
if [ -f $f ] && [ ! -w $f ] ; then
echo "Write-protected $f exists. Skipping."
else
echo "Creating $f"
mkdir -p ${SCRATCH_JOB}/bed
cp --archive ${beddir}/*.{bed,bim,fam} ${SCRATCH_JOB}/bed/
scratch_beddir=${SCRATCH_JOB}/bed/
cp --archive ${pheno} ${SCRATCH_JOB}/
scratch_pheno=${SCRATCH_JOB}/${pheno_name}
cp --archive ${covar} ${SCRATCH_JOB}/
scratch_covar=${SCRATCH_JOB}/$( basename ${covar} )
scratch_out=${SCRATCH_JOB}/out
mkdir -p ${scratch_out}
for bedfile in ${scratch_beddir}/*.bed ; do
echo $bedfile
# drop the shortest suffix match to ".*" (the .bed extension)
bedfile_noext=${bedfile%.bed}
# drop the longest prefix match to "*/" (the path)
bedfile_core=${bedfile_noext##*/}
# mkdir -p ${OUT}/${population}/${pheno_dir}
# Initial run was with PLINK v1.90b3.38 64-bit (7 Jun 2016)
# This output produced .no.covar.assoc.logistic
# This is different than now with different columns
# Not sure how to resolve. Will try downloading older version.
# $ head test.no.covar.assoc.logistic
# CHR SNP BP A1 TEST NMISS OR STAT P
# 6 rs561313667 63979 T ADD 489 NA NA NA
# 6 rs530120680 63980 G ADD 489 NA NA NA
# 6 rs540888038 73938 G ADD 489 NA NA NA
# head gwas/pheno_files_1/10298_Human_alphaherpesvirus_1.ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.afr.pruned.no.covar.PHENO1.glm.logistic
# #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE Z_STAT P
# 13 19020047 rs186129910 A T T ADD 661 NA NA NA NA
# 13 19020078 rs527875342 C T T ADD 661 NA NA NA NA
# 13 19020095 rs140871821 C T T ADD 661 1.78567 0.259021 2.2384 0.0251947
# 13 19020165 rs550529448 T A A ADD 661 1.25648 1.21356 0.188137 0.85077
#plink2 --covar-variance-standardize \
#--out /scratch/gwendt/job/1775728.cclc01.som.ucsf.edu/pheno_files_1/NC_000898_e2e.chr1.no.covar
#Error: Failed to open /scratch/gwendt/job/1775728.cclc01.som.ucsf.edu/pheno_files_1/NC_000898_e2e.chr1.no.covar.log. Try changing the --out parameter.
plink --allow-no-sex \
--snps-only \
--logistic hide-covar \
--covar-name C1,C2,C3,C4,C5,C6 \
--bfile ${bedfile_noext} \
--pheno ${scratch_pheno} \
--out ${scratch_out}/${pheno_name}.${bedfile_core}.no.covar \
--covar ${scratch_covar}
# plink produces .assoc.logistic and .log
# plink2 produces .PHENO1.glm.logistic and .log
if [ -f ${scratch_out}/${pheno_name}.${bedfile_core}.no.covar.assoc.logistic ] ; then
# PLINK v1.90b6.16 64-bit (19 Feb 2020)
awk '{print $1,$2,$3,$9,$4,$7}' \
${scratch_out}/${pheno_name}.${bedfile_core}.no.covar.assoc.logistic \
> ${scratch_out}/${pheno_name}.${bedfile_core}.for.plot.txt
else
touch ${scratch_out}/${pheno_name}.${bedfile_core}.for.plot.txt
fi
# plink2 output
# PLINK v2.00a2LM 64-bit Intel (10 Aug 2019)
# Newer version uses different columns in different file
# awk '{print $1,$3,$2,$12,$6,$9}' \
# ${OUT}/${population}/${pheno_dir}/${pheno_name}.${bedfile_core}.no.covar.PHENO1.glm.logistic \
# > ${OUT}/${population}/${pheno_dir}/${pheno_name}.${bedfile_core}.for.plot.txt
done # for bedfile in ${REFS}/pruned_vcfs/${population}/ALL.chr*.bed ; do
# Not keeping for.plot.all.txt so doesn't need a header
#echo "CHR SNP BP P A1 OR" > ${pheno_name}.for.plot.all.txt
#grep -v "CHR" *.for.plot.txt >> ${pheno_name}.for.plot.all.txt
#grep --invert-match --no-filename "CHR" *.for.plot.txt >> ${pheno_name}.for.plot.all.txt
grep --invert-match --no-filename "CHR" \
${scratch_out}/${pheno_name}.*.for.plot.txt \
| sed -e 's/^X/23/' -e 's/^Y/24/' \
> ${scratch_out}/${pheno_name}.for.plot.all.txt
# No wildcards, so don't need to specify --no-filename
grep --invert-match "NA" ${scratch_out}/${pheno_name}.for.plot.all.txt \
| shuf -n 200000 > ${scratch_out}/${pheno_name}.for.qq.plot
# If not keeping header, don't need to skip the first line anymore!
#tail -n +2 ${pheno_name}.for.plot.all.txt | grep -v "NA" | shuf -n 200000 > ${pheno_name}.for.qq.plot
# Keeping the NA rows now
grep "NA" ${scratch_out}/${pheno_name}.for.plot.all.txt \
> ${scratch_out}/${pheno_name}.NA.txt
awk '$4 < 0.10' ${scratch_out}/${pheno_name}.for.plot.all.txt \
> ${scratch_out}/${pheno_name}.for.manhattan.plot
if [ -s ${scratch_out}/${pheno_name}.for.manhattan.plot ] && \
[ -s ${scratch_out}/${pheno_name}.for.qq.plot ] ; then
manhattan_qq_plot.r \
--plot_prefix "${plot_prefix}" \
--manhattan ${scratch_out}/${pheno_name}.for.manhattan.plot \
--qq ${scratch_out}/${pheno_name}.for.qq.plot \
--outpath ${scratch_out}
# produces ${pheno_name}.for.manhattan.plot.png
fi
chmod a-w ${scratch_out}/*
mkdir -p ${out}/ # just in case
mv --update ${scratch_out}/* ${out}/
fi
Learn GWAS / Manhattan plots https://en.wikipedia.org/wiki/Genome-wide_association_study https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6001694/ https://elifesciences.org/articles/32920 http://bioinformatics.org.au/ws09/presentations/Day3_JStankovich.pdf Try Hail or other for GWAS?