Three sets of samples were used :
Remarks :
Summary of genomes used for building bwa index:
index is located in /omaha-beach/cguyomar/ref_genome/ref_genome_review
#!/bin/bash
source /local/env/envbwa-0.7.10.sh
source /local/env/envsamtools.sh
# $1 and $2 : paired end libraries
# $3 : outname
# $4 number of threads
echo "processed file : " $3
bwa mem -t $4 /omaha-beach/cguyomar/ref_genome/ref_genome_review $1 $2 | samtools view -b - > $3.bam
samtools sort -@ $4 $3.bam $3_sorted
mv $3_sorted.bam $3.bam
samtools index $3.bam
samtools flagstat $3.bam > $3.flagstat
/omaha-beach/cguyomar/fof_speciaphid/
)Script for batch submission
#!/bin/bash
# $1 : fof location
# $2 : number of threads for each job
while read line
do
read file1 <<< $(echo $line | cut -d " " -f1)
read matefile <<< $(echo $line | cut -d " " -f2)
read ID <<< $(echo $line | cut -d " " -f3)
qsub -pe make $2 -cwd ~/scripts/mapping_bwa_gz_multi.sh $file1 $matefile $ID $2
done < $1
Warning : High disk usage when submitting all mapping jobs in parrallel.
Samples with technical replicates are merged at this point :
#!/bin/bash
source /local/env/envsamtools.sh
samtools merge Pea.bam Pea_L*.bam
samtools merge Alfalfa Alfalfa_L*.bam
samtools merge Clover.bam Clover_L*.bam
samtools merge Cast.bam Cast*.bam
samtools merge Gers.bam Gers*.bam
samtools merge Lus.bam Lus*.bam
samtools merge Mil.bam Mil*.bal
samtools merge Vl.bam Vl*.bam
samtools merge Sl.bam Sl*.bam
rm Alfalfa_L*
rm Clover_L*
rm Pea_L*
rm Cast04*
rm Cast10*
rm Gers05*
rm Gers11*
rm Lus06*
rm Lus12*
rm Mil01*
rm Mil07*
rm Vl03*
rm Vl09*
rm Sl02*
rm Sl09*
samtools index Pea.bam
samtools index Alfalfa
samtools index Clover.bam
samtools index Cast.bam
samtools index Gers.bam
samtools index Lus.bam
samtools index Mil.bam
samtools index Vl.bam
samtools index Sl.bam
Resulting bam files are stored in /groups/bipaa/ref/Hemiptera/Acyrthosiphon_pisum/Speciaphid/BAMs_bwa/MAPPED_BWA
comptage.sh
is used to get the coverage of a given bam file for each reference organism - Each indexed bam file is querried using samtools idxstats
to get the number of reads mapping on each contig of the reference set - The results is joined with a custom table indicating the organisms associated to each contig and formatted
#!/bin/bash
source /local/env/envsamtools.sh
filename=$(basename $1)
outname=$(echo $filename | cut -d "." -f1)
samtools idxstats $1 | sort -k 1b,1 > $outname.idxstats
join $outname.idxstats /omaha-beach/cguyomar/scaffolds_review.list > $outname.scaff.comptage
awk '{SUM+=$4} END {print "unmapped "SUM}' $outname.idxstats >> $outname.scaff.comptage
awk '{a[$5]+=$3}END{for (i in a) print i,a[i]}' $outname.scaff.comptage | sed 1d > $outname.symb.comptage
awk '{SUM+=$4} END {print "unmapped "SUM}' $outname.idxstats >> $outname.symb.comptage
batch submission :
for file in /groups/bipaa/ref/Hemiptera/Acyrthosiphon_pisum/Speciaphid/BAMs_bwa/MAPPED_BWA/33genomes_review/*.bam
do qsub -cwd ~/scripts/comptage.sh $file
done
for file in /groups/bipaa/ref/Hemiptera/Acyrthosiphon_pisum/Speciaphid/BAMs_bwa/MAPPED_BWA/poolseq_review/*.bam
do qsub -cwd ~/scripts/comptage.sh $file
done
for file in /groups/bipaa/ref/Hemiptera/Acyrthosiphon_pisum/Speciaphid/BAMs_bwa/MAPPED_BWA/sex_asex_review/*.bam
do qsub -cwd ~/scripts/comptage.sh $file
done
for file in /groups/bipaa/ref/Hemiptera/Acyrthosiphon_pisum/Speciaphid/BAMs_bwa/MAPPED_BWA/japonais_review/*.bam
do qsub -cwd ~/scripts/comptage.sh $file
done
Counts for each sample are then merged in a single tab, and sorted in the paper order
#!/bin/bash
source /local/env/envR.sh
echo Symbiote > left
cut -f1 -d " " ArPo28.symb.comptage >> left
for file in *.symb.comptage
do echo ${file%%.*} > $file.symb.temp
cut -d " " -f2 $file >> $file.symb.temp
done
paste -d ";" left *.symb.temp > symb.comptages
rm left
rm *.temp
Rscript sort_tab.R
Output : A csv table giving the number of reads mapped on each reference genome for each sample (symb.comptages.sorted)
split_bam.sh
can be used to generate a bam file of reads mapping on a specific reference genome for each sample
#!/bin/bash
source /local/env/envsamtools.sh
filename=$(basename $1)
outname=$(echo $filename | cut -d "." -f1)
samtools view -b $1 APSE1_ENA\|AF157835\|AF157835.1 > /omaha-beach/cguyomar/genomecov/APSE1/$outname.APSE1.bam
samtools view -b $1 Buchnera_gi15616630refNC_002528.1 > /omaha-beach/cguyomar/genomecov/buchnera/$outname.buchnera.bam
samtools view -b $1 Hamiltonella_gi238897251refNC_012751.1> /omaha-beach/cguyomar/genomecov/hamiltonella/$outname.hamiltonella.bam
samtools view -b $1 Mitochondrion_gi213948225refNC_011594.1 > /omaha-beach/cguyomar/genomecov/mitochondrie/$outname1.mito.bam
samtools view -b $1 $(cat /omaha-beach/cguyomar/genomecov/regiella/regiella.contigs) > /omaha-beach/cguyomar/genomecov/regiella/$outname.regiella.bam
samtools view -b $1 $(cat /omaha-beach/cguyomar/ref_genome/genomes/rickettsia.contigs) > /omaha-beach/cguyomar/genomecov/rickettsia/$outname.rickettsia.
bam
samtools view -b $1 Rickettsiella_viridis > /omaha-beach/cguyomar/genomecov/rickettsiella/$outname.rickettsiella.bam
samtools view -b $1 $(cat /omaha-beach/cguyomar/genomecov/serratia/serratia.contigs) > /omaha-beach/cguyomar/genomecov/serratia/$outname.serratia.bam
samtools view -b $1 $(cat /omaha-beach/cguyomar/genomecov/spiroplasma/spiro.contigs) > /omaha-beach/cguyomar/genomecov/spiroplasma/$outname.spiroplasma.
bam
samtools view -b $1 $(cat /omaha-beach/cguyomar/ref_genome/genomes/fukatsuia.contigs) > /omaha-beach/cguyomar/genomecov/fukatsuia/$outname.fukatsuia.bam
samtools view -b $1.bam Mitochondrion_gi213948225refNC_011594.1 > /omaha-beach/cguyomar/genomecov/mitochondrie/$1.mito.bam
Samples with insufficient coverage are removed manually.
samtools depth $sample | awk '$3 > 2' | wc -l
Samples with coverage greater than 10X but with less than 70% of the genome covered are removed.