BIOMED2
Here we will discuss how to process BCR cDNA libraries obtained with the BIOMED2 kit.
Data libraries
This tutorial uses the data from the following publication: High frequency of shared clonotypes in human B cell receptor repertoires. Soto C et al., Nature, 2019 Feb;566(7744):398-402 doi: 10.1038/s41586-019-0934-8
Peripheral blood samples were collected from three donors: HIP1 (female, 47 y.o.), HIP2 (male, 22 y.o.) and HIP3 (male, 29 y.o.). Samples were collected in multiple replicates. Total RNA was extracted from PBMCs and then RT-PCR was utilized to generate BCR amplicon libraries using BIOMED2 multiplex primers. Subsequent libraries were sequenced using next-generation sequencing.
All data is available from SRA (PRJNA511481) using e.g. SRA Explorer.
Use aria2c for efficient download of the full dataset with the proper filenames:
mkdir -p raw
aria2c -c -s 16 -x 16 -k 1M -j 8 -i download-list.txt
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365468/SRR8365468_1.fastq.gz
out=raw/SRR8365468_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365468/SRR8365468_2.fastq.gz
out=raw/SRR8365468_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365457/SRR8365457_1.fastq.gz
out=raw/SRR8365457_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365457/SRR8365457_2.fastq.gz
out=raw/SRR8365457_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365458/SRR8365458_1.fastq.gz
out=raw/SRR8365458_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365458/SRR8365458_2.fastq.gz
out=raw/SRR8365458_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365459/SRR8365459_1.fastq.gz
out=raw/SRR8365459_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365459/SRR8365459_2.fastq.gz
out=raw/SRR8365459_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365463/SRR8365463_1.fastq.gz
out=raw/SRR8365463_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365463/SRR8365463_2.fastq.gz
out=raw/SRR8365463_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365469/SRR8365469_1.fastq.gz
out=raw/SRR8365469_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365469/SRR8365469_2.fastq.gz
out=raw/SRR8365469_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365465/SRR8365465_1.fastq.gz
out=raw/SRR8365465_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365465/SRR8365465_2.fastq.gz
out=raw/SRR8365465_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365467/SRR8365467_1.fastq.gz
out=raw/SRR8365467_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365467/SRR8365467_2.fastq.gz
out=raw/SRR8365467_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365464/SRR8365464_1.fastq.gz
out=raw/SRR8365464_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365464/SRR8365464_2.fastq.gz
out=raw/SRR8365464_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365450/SRR8365450_1.fastq.gz
out=raw/SRR8365450_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365450/SRR8365450_2.fastq.gz
out=raw/SRR8365450_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365461/SRR8365461_1.fastq.gz
out=raw/SRR8365461_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365461/SRR8365461_2.fastq.gz
out=raw/SRR8365461_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365462/SRR8365462_1.fastq.gz
out=raw/SRR8365462_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365462/SRR8365462_2.fastq.gz
out=raw/SRR8365462_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365456/SRR8365456_1.fastq.gz
out=raw/SRR8365456_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365456/SRR8365456_2.fastq.gz
out=raw/SRR8365456_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365475/SRR8365475_1.fastq.gz
out=raw/SRR8365475_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365475/SRR8365475_2.fastq.gz
out=raw/SRR8365475_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365460/SRR8365460_1.fastq.gz
out=raw/SRR8365460_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365460/SRR8365460_2.fastq.gz
out=raw/SRR8365460_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365470/SRR8365470_1.fastq.gz
out=raw/SRR8365470_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365470/SRR8365470_2.fastq.gz
out=raw/SRR8365470_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365471/SRR8365471_1.fastq.gz
out=raw/SRR8365471_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365471/SRR8365471_2.fastq.gz
out=raw/SRR8365471_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365473/SRR8365473_1.fastq.gz
out=raw/SRR8365473_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365473/SRR8365473_2.fastq.gz
out=raw/SRR8365473_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365449/SRR8365449_1.fastq.gz
out=raw/SRR8365449_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365449/SRR8365449_2.fastq.gz
out=raw/SRR8365449_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365482/SRR8365482_1.fastq.gz
out=raw/SRR8365482_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365482/SRR8365482_2.fastq.gz
out=raw/SRR8365482_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365446/SRR8365446_1.fastq.gz
out=raw/SRR8365446_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365446/SRR8365446_2.fastq.gz
out=raw/SRR8365446_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365483/SRR8365483_1.fastq.gz
out=raw/SRR8365483_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365483/SRR8365483_2.fastq.gz
out=raw/SRR8365483_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365447/SRR8365447_1.fastq.gz
out=raw/SRR8365447_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365447/SRR8365447_2.fastq.gz
out=raw/SRR8365447_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365484/SRR8365484_1.fastq.gz
out=raw/SRR8365484_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365484/SRR8365484_2.fastq.gz
out=raw/SRR8365484_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365448/SRR8365448_1.fastq.gz
out=raw/SRR8365448_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365448/SRR8365448_2.fastq.gz
out=raw/SRR8365448_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365424/SRR8365424_1.fastq.gz
out=raw/SRR8365424_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365424/SRR8365424_2.fastq.gz
out=raw/SRR8365424_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365485/SRR8365485_1.fastq.gz
out=raw/SRR8365485_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365485/SRR8365485_2.fastq.gz
out=raw/SRR8365485_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365488/SRR8365488_1.fastq.gz
out=raw/SRR8365488_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365488/SRR8365488_2.fastq.gz
out=raw/SRR8365488_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365421/SRR8365421_1.fastq.gz
out=raw/SRR8365421_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365421/SRR8365421_2.fastq.gz
out=raw/SRR8365421_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365489/SRR8365489_1.fastq.gz
out=raw/SRR8365489_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365489/SRR8365489_2.fastq.gz
out=raw/SRR8365489_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365490/SRR8365490_1.fastq.gz
out=raw/SRR8365490_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365490/SRR8365490_2.fastq.gz
out=raw/SRR8365490_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365246/SRR8365246_1.fastq.gz
out=raw/SRR8365246_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365246/SRR8365246_2.fastq.gz
out=raw/SRR8365246_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365474/SRR8365474_1.fastq.gz
out=raw/SRR8365474_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365474/SRR8365474_2.fastq.gz
out=raw/SRR8365474_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365422/SRR8365422_1.fastq.gz
out=raw/SRR8365422_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365422/SRR8365422_2.fastq.gz
out=raw/SRR8365422_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365423/SRR8365423_1.fastq.gz
out=raw/SRR8365423_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365423/SRR8365423_2.fastq.gz
out=raw/SRR8365423_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365420/SRR8365420_1.fastq.gz
out=raw/SRR8365420_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365420/SRR8365420_2.fastq.gz
out=raw/SRR8365420_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365419/SRR8365419_1.fastq.gz
out=raw/SRR8365419_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365419/SRR8365419_2.fastq.gz
out=raw/SRR8365419_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365248/SRR8365248_1.fastq.gz
out=raw/SRR8365248_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365248/SRR8365248_2.fastq.gz
out=raw/SRR8365248_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365249/SRR8365249_1.fastq.gz
out=raw/SRR8365249_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365249/SRR8365249_2.fastq.gz
out=raw/SRR8365249_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365247/SRR8365247_1.fastq.gz
out=raw/SRR8365247_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365247/SRR8365247_2.fastq.gz
out=raw/SRR8365247_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365250/SRR8365250_1.fastq.gz
out=raw/SRR8365250_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365250/SRR8365250_2.fastq.gz
out=raw/SRR8365250_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365348/SRR8365348_1.fastq.gz
out=raw/SRR8365348_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365348/SRR8365348_2.fastq.gz
out=raw/SRR8365348_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365251/SRR8365251_1.fastq.gz
out=raw/SRR8365251_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365251/SRR8365251_2.fastq.gz
out=raw/SRR8365251_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365418/SRR8365418_1.fastq.gz
out=raw/SRR8365418_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365418/SRR8365418_2.fastq.gz
out=raw/SRR8365418_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365310/SRR8365310_1.fastq.gz
out=raw/SRR8365310_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365310/SRR8365310_2.fastq.gz
out=raw/SRR8365310_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365252/SRR8365252_1.fastq.gz
out=raw/SRR8365252_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365252/SRR8365252_2.fastq.gz
out=raw/SRR8365252_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365308/SRR8365308_1.fastq.gz
out=raw/SRR8365308_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365308/SRR8365308_2.fastq.gz
out=raw/SRR8365308_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365309/SRR8365309_1.fastq.gz
out=raw/SRR8365309_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365309/SRR8365309_2.fastq.gz
out=raw/SRR8365309_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365253/SRR8365253_1.fastq.gz
out=raw/SRR8365253_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365253/SRR8365253_2.fastq.gz
out=raw/SRR8365253_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365307/SRR8365307_1.fastq.gz
out=raw/SRR8365307_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365307/SRR8365307_2.fastq.gz
out=raw/SRR8365307_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365259/SRR8365259_1.fastq.gz
out=raw/SRR8365259_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365259/SRR8365259_2.fastq.gz
out=raw/SRR8365259_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365258/SRR8365258_1.fastq.gz
out=raw/SRR8365258_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365258/SRR8365258_2.fastq.gz
out=raw/SRR8365258_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365306/SRR8365306_1.fastq.gz
out=raw/SRR8365306_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365306/SRR8365306_2.fastq.gz
out=raw/SRR8365306_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365305/SRR8365305_1.fastq.gz
out=raw/SRR8365305_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365305/SRR8365305_2.fastq.gz
out=raw/SRR8365305_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365304/SRR8365304_1.fastq.gz
out=raw/SRR8365304_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365304/SRR8365304_2.fastq.gz
out=raw/SRR8365304_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365260/SRR8365260_1.fastq.gz
out=raw/SRR8365260_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365260/SRR8365260_2.fastq.gz
out=raw/SRR8365260_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365303/SRR8365303_1.fastq.gz
out=raw/SRR8365303_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365303/SRR8365303_2.fastq.gz
out=raw/SRR8365303_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365261/SRR8365261_1.fastq.gz
out=raw/SRR8365261_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365261/SRR8365261_2.fastq.gz
out=raw/SRR8365261_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365262/SRR8365262_1.fastq.gz
out=raw/SRR8365262_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365262/SRR8365262_2.fastq.gz
out=raw/SRR8365262_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365302/SRR8365302_1.fastq.gz
out=raw/SRR8365302_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365302/SRR8365302_2.fastq.gz
out=raw/SRR8365302_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365301/SRR8365301_1.fastq.gz
out=raw/SRR8365301_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365301/SRR8365301_2.fastq.gz
out=raw/SRR8365301_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365263/SRR8365263_1.fastq.gz
out=raw/SRR8365263_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365263/SRR8365263_2.fastq.gz
out=raw/SRR8365263_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365264/SRR8365264_1.fastq.gz
out=raw/SRR8365264_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365264/SRR8365264_2.fastq.gz
out=raw/SRR8365264_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365267/SRR8365267_1.fastq.gz
out=raw/SRR8365267_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365267/SRR8365267_2.fastq.gz
out=raw/SRR8365267_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365274/SRR8365274_1.fastq.gz
out=raw/SRR8365274_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365274/SRR8365274_2.fastq.gz
out=raw/SRR8365274_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365269/SRR8365269_1.fastq.gz
out=raw/SRR8365269_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365269/SRR8365269_2.fastq.gz
out=raw/SRR8365269_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365271/SRR8365271_1.fastq.gz
out=raw/SRR8365271_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365271/SRR8365271_2.fastq.gz
out=raw/SRR8365271_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365273/SRR8365273_1.fastq.gz
out=raw/SRR8365273_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365273/SRR8365273_2.fastq.gz
out=raw/SRR8365273_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365265/SRR8365265_1.fastq.gz
out=raw/SRR8365265_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365265/SRR8365265_2.fastq.gz
out=raw/SRR8365265_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365270/SRR8365270_1.fastq.gz
out=raw/SRR8365270_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365270/SRR8365270_2.fastq.gz
out=raw/SRR8365270_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365272/SRR8365272_1.fastq.gz
out=raw/SRR8365272_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365272/SRR8365272_2.fastq.gz
out=raw/SRR8365272_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365266/SRR8365266_1.fastq.gz
out=raw/SRR8365266_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365266/SRR8365266_2.fastq.gz
out=raw/SRR8365266_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365268/SRR8365268_1.fastq.gz
out=raw/SRR8365268_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365268/SRR8365268_2.fastq.gz
out=raw/SRR8365268_HIP1_female_R2.fastq.gz
The above script downloads all the data to the raw/ folder. Each file name encodes the donor and the donor's sex. For example, for the first file from the above list (SRR8365468_HIP2_male_R1.fastq.gz):
- SRR8365468 - sample id
- HIP2 - patient id
- male - patient sex
The project contains 147 FASTQ file pairs. The structure of sequences is shown in the picture below.
The structure of the library is shown in the picture below.
This data was obtained using a multiplex mix of V and J primers that can be found here. In the original paper where the BIOMED2 primer set was published, the authors state that some primers allow up to 2 mismatches. This means that these mismatches will end up in the final sequences and may introduce bias into the alignment process.
Briefly:
1. There are 16 primers (complementary to VH, VK and VL segments). These primers' sequences are located in the FR1 region and the longest primer consists of 32 nucleotides. R1 is 250 bp long and, depending on the particular V gene, it covers the whole V segment up to the beginning of the CDR3 region.
2. There are 5 primers (complementary to JH, JK and JL segments). These primers are located in FR4 and the longest consists of 26 nucleotides. R2 starts from one of these primers and ends in FR2.
Below you can see J primers aligned with IGHJ / IGKJ / IGLJ gene sequences.
# IGHJ
<---------FR4----------------->
NNNNCTTACCTGAGGAGACGGTGACC <- Jh-primer
CTGAGGAGACGGTGACCAGGGTGCCCTGGCCCCAGTGCTGGAAGTATTCAGC <- IGHJ1
CTGAGGAGACAGTGACCAGGGTGCCACGGCCCCAGAGATCGAAGTACCAGTAG <- IGHJ2
CTGAAGAGACGGTGACCATTGTCCCTTGGCCCCAGACATCAAAAGCATCA <- IGHJ3
CTGAGGAGACGGTGACCAGGGTTCCTTGGCCCCAGTAGTCAAAGTAGT <- IGHJ4
CTGAGGAGACGGTGACCAGGGTTCCTTGGCCCCAGGAGTCGAACCAGTTGT <- IGHJ5
AGGAGACGGTGACCGTGGTCCCTTGCCCCCAGACGTCCATACCGTAGTAGTAGTAGTAAT <- IGHJ6
# IGK
<------------ FR4---------->
NNNNTTTGATaTCCAccTTGGTCCC <- Jk1-primer
GTTTGATTTCCACCTTGGTCCCTTGGCCGAACGTCCAC <- IGKJ1
GTTTGATCTCCAGCTTGGTCCCCTGGCCAAAAGTGTACA <- IGKJ2
GTTTGATATCCACTTTGGTCCCAGGGCCGAAAGTGAAT <- IGKJ3
GTTTGATCTCCACCTTGGTCCCTCCGCCGAAAGTGAGC <- IGKJ4
NNNNTTTAATCTCCAGTCGTGTCCC <- Jk2-primer
GTTTAATCTCCAGTCGTGTCCCTTGGCCGAAGGTGATC <- IGKJ5
# IGL
<------------ FR4---------->
NNNNAGGACGGTGACCTTGGTCCC <- Jl1-primer
NNNNAGGACGGTCAGCTgGGTCCC <- Jl2-primer
CTAGGACGGTGACCTTGGTCCCAGTTCCGAAGACATAA <- IGLJ1
CTAGGACGGTCAGCTTGGTCCCTCCGCCGAATACCACA <- IGLJ2
CTAGGACGGTCAGCTTGGTCCCTCCGCCGAATACCACA <- IGLJ3*01
CTAGGACGGTCAGCTTGGTCCCTCCGCCGAACACCCAA <- IGLJ3*02
CTAGGACGGTCAGCTCGGTCCCCTCACCAAACACCCAG <- IGLJ5
CGAGGACGGTCACCTTGGTGCCACTGCCGAACACATTA <- IGLJ6
CTAAAATGATCAGCTGGGTTCCTCCACCAAATACAAAA <- IGLJ4
CGAGGACGGTCAGCTGGGTGCCTCCTCCGAACACAGCA <- IGLJ7
Upstream analysis
MiXCR has a dedicated preset for this protocol, thus analysing the data is as easy as:
mixcr analyze biomed2-human-rna-igh \
raw/SRR8365277_HIP1_female_IgG1_R1.fastq.gz \
raw/SRR8365277_HIP1_female_IgG1_R2.fastq.gz \
results/SRR8365277_HIP1_female_IgG1
One might also use GNU Parallel to process all samples at once:
#!/usr/bin/env bash
mkdir -p results
ls /raw/*R1* |
parallel -j 2 --line-buffer \
"mixcr analyze biomed2-human-rna-igh \
{} \
{=s:R1:R2:=} \
{=s:.*/:results/:;s:_R.*::=}"
Under the hood pipeline:
Under the hood mixcr analyze biomed2-human-bcr-cdr3
executes the following pipeline:
align
Alignment of raw sequencing reads against reference database of V-, D-, J- and C- gene segments.
mixcr align \
--species hsa \
-p generic-amplicon \
--tag-pattern "^N{32}(R1:*)\^N{26}(R2:*)" \
-OvParameters.geneFeatureToAlign="VTranscriptWithout5UTRWithP" \
-OvParameters.parameters.floatingLeftBound=false \
-OjParameters.parameters.floatingRightBound=false \
--report results/SRR8365468_HIP2_male.report \
--json-report results/SRR8365468_HIP2_male.json \
raw/SRR8365468_HIP2_male_R1.fastq.gz \
raw/SRR8365468_HIP2_male_R2.fastq.gz \
results/SRR8365468_HIP2_male.vdjca
Option --report
is specified here explicitly.
--species hsa
- determines the organism species.
-p generic-amplicon
- a preset of MiXCR parameters for amplicon data.
--tag-pattern "^N{32}(R1:*)\^N{26}(R2:*)"
- With this pattern we trim 32 nucleotides (the length of the longest forward primer) from the beginning of the R1 file and 26 nucleotides (the length of the longest reverse primer) from the beginning of the R2 file.
-OvParameters.geneFeatureToAlign="VTranscriptWithout5UTRWithP"
- Sets a V gene feature to align. Check gene features for more info.
-OvParameters.parameters.floatingLeftBound=false
- Results in a global alignment algorithm for the V gene left bound. We can use global alignment here because the primer sequences were trimmed with --tag-pattern.
-OjParameters.parameters.floatingRightBound=false
- Results in a global alignment algorithm for the J gene right bound. We can use global alignment here because the primer sequences were trimmed with --tag-pattern.
assemble
Assembles alignments into clonotypes and applies several layers of error correction (e.g. quality-aware correction for sequencing errors, clustering to correct for PCR errors). Check mixcr assemble for more information. By default clones will be assembled by the CDR3 gene feature.
mixcr assemble \
-OassemblingFeatures="CDR3" \
-OseparateByJ=true \
-OseparateByV=true \
--report results/SRR8365468_HIP2_male.report \
--json-report results/SRR8365468_HIP2_male.json \
results/SRR8365468_HIP2_male.vdjca \
results/SRR8365468_HIP2_male.clns
-OseparateByV=true
- Split clones with the same CDR3 sequence and different V-genes.
-OseparateByJ=true
- Split clones with the same CDR3 sequence and different J-genes.
export
Exports clonotypes from .clns file into human-readable tables.
mixcr exportClones \
-c IGH \
results/SRR8365468_HIP2_male.clns \
results/SRR8365468_HIP2_male.clonotypes.IGH.tsv
mixcr exportClones \
-c IGL \
results/SRR8365468_HIP2_male.clns \
results/SRR8365468_HIP2_male.clonotypes.IGL.tsv
mixcr exportClones \
-c IGK \
results/SRR8365468_HIP2_male.clns \
results/SRR8365468_HIP2_male.clonotypes.IGK.tsv
Here -p full
defines the full preset of common export columns. Check mixcr export
for more information.
-c <chain>
- defines a specific chain to be exported.
After execution is complete the following list of files is generated for every sample:
# human-readable reports
SRR8365468_HIP2_male.report
# raw alignments (highly compressed binary file)
SRR8365468_HIP2_male.vdjca
# IGH, IGK and IGL CDR3 clonotypes (highly compressed binary file)
SRR8365468_HIP2_male.clns
# IGH, IGK and IGL CDR3 clonotypes exported in tab-delimited txt
SRR8365468_HIP2_male.IGH.tsv
SRR8365468_HIP2_male.IGL.tsv
SRR8365468_HIP2_male.IGK.tsv
While the .clns file holds all data and is used for downstream analysis using mixcr postanalysis, the output .tsv clonotype table will contain exhaustive information about each clonotype as well:
See first 100 records from clonotype table SRR8365468_HIP2_male:
cloneId | cloneCount | cloneFraction | targetSequences | targetQualities | allVHitsWithScore | allDHitsWithScore | allJHitsWithScore | allCHitsWithScore | allVAlignments | allDAlignments | allJAlignments | allCAlignments | nSeqFR1 | minQualFR1 | nSeqCDR1 | minQualCDR1 | nSeqFR2 | minQualFR2 | nSeqCDR2 | minQualCDR2 | nSeqFR3 | minQualFR3 | nSeqCDR3 | minQualCDR3 | nSeqFR4 | minQualFR4 | aaSeqFR1 | aaSeqCDR1 | aaSeqFR2 | aaSeqCDR2 | aaSeqFR3 | aaSeqCDR3 | aaSeqFR4 | refPoints |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 0.0952381 | AGCAGTGACGTTGATACTTATAACTATGTCTCCTGGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTCATTATTACTGCTGCTCACATTCGACCGGCAGCACTCGTTATGTCTTCGGAACT | IIIIIIIIGIGGIGGIGGIGGIIGIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIIIIAGIGGGGGGGIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGGIIIAG | IGLV2-14*00(569) | nan | IGLJ1*00(257.5) | nan | 75 | 112 | 317 | 0 | 37 | SG88ASG90ASG91C | 283.0 | nan | 20 | 36 | 58 | 128 | 144 | 160.0 | nan | nan | nan | AGCAGTGACGTTGATACTTATAACTAT | 38 | nan | nan | nan | nan | nan | nan | |
1 | 2 | 0.0952381 | AGCAATGACGTTGGGGTTTCTCTTCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAACTCATATACACTCGGCAGCACCCCGGTTTGTGTCTTCGGAACT | IIIIIIIIGGIIGIIIIIIIIIIIIIIIIIIIIIGIIIIIIGIIIIIIGIIIIIIIIIIGIGGGIIIIIIIIGGGIGIIIIIGIIIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIGGIIIIIIIIIGIIIIIIIGIIGIIIIIIIIIIIAA | IGLV2-14*00(737.5) | nan | IGLJ1*00(206) | nan | 75 | 89 | 317 | 0 | 14 | SG79A | 111.0 | nan | 20 | 36 | 58 | 137 | 153 | SA22G | 131.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
2 | 1 | 0.047619 | GGTGGCTCCATCAGTAGTTACTACTGGAGCTGGATCCGGCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGGTATATCTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCTGCGGACACGGCCGTGTATTACTGTGCGAGAGGAGTAATACCGCACCCGCTCGACTACTGGGGCCAGGGA | IIIIIIIIIIIGIIIIIGIIIIIGIGGIIIIGIGIIIIGIIEIGIIIGEGGGGIIGIGIGGG<GGGGGG<GIGIIIIGIAGGGIIIIGGGGGGGGGIGGGGGGGGIGIGGGG<GGGIGGIIGGIGIGGGAGGGGGGGGGGGGGGIGIIGIGGGGAAGAAAGGIIIIGIGGIGIGGGGGGGGAGAGIIIGGGGGGGIGIIIIIIIIIGIIGIIIIIGGIIIIIIGGIIIGGGGGGGGIGGGGGGIIGGIGIIIIII | IGHV4-59*00(2230) | IGHD3-1000(40),IGHD3-2200(40),IGHD1-7*00(36) | IGHJ4*00(210) | nan | 75 | 292 | 313 | 0 | 217 | 2170.0 | 23 | 31 | 93 | 218 | 226 | 40.0;23 | 31 | 93 | 218 | 226 | 40.0;8 | 18 | 51 | 218 | 228 | ST11A | 36.0 | |||
3 | 1 | 0.047619 | GGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGACACCCGTATTACGATTTTTGGAGTGGACTCCAACGGTATGGACGTCTGGGGCCAAGGG | GGGGGGGIGGIIIGIIAGGIIIIGGIIGIIIIGIIIIIIIIIIIIIIGGIIIIIIIIIIGAGGEIIIGGGEIIIIGIIIIIIIIGGIIGIIIGGGIIGIIIIGGGIGIIIGGG:GGIIGGGIGIIIGGGIIGIGGGGGGGIIIIIII<GAGGIIIGGIIGGGGGGGGIGIIGIGGIGGGGGGGGIGGIGIGGGGGGGGIIGGAGGIAAAG<<<<<<GGAAAGGAGGIGIIGGGGIIGIIGGGIIGGIIIGIIGIIIGIGGGGIGGGAGGGGGIGGGGIIIGI | IGHV4-31*00(2300) | IGHD3-3*00(110) | IGHJ6*00(312) | nan | 75 | 299 | 319 | 0 | 224 | 2240.0 | 30 | 52 | 93 | 228 | 250 | 110.0 | 29 | 61 | 83 | 250 | 282 | SA32CST34A | 262.0 | nan | nan | nan | GGTGGCTCCATCAGCAGTGGTGGTTACTAC | 32 | ||
4 | 1 | 0.047619 | GGTGGCTCCATCAGCAGTAGTAACTGGTGGAGTTGGGTCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGGAAATCTATCATAGTGGGAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACAAGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGTTAGTGGGAGCTACTACTACGGTATGGACGTCTGGGGCCAAGGG | IIIIIIIIIIIIIIIGIIIIIIIIIIGGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGEGAAGA<<<<:<<GGIIGIGGGGIGIGIIG<<<GEGGGIIIIIGGGGGGGIIGGGGGGIIIGGGGGIGA<GGGIIIIGGGGIIIIIIIIGGIGGAGGGGGGIIIGIIGGGGGGGIIIIGGGIIGGGGGGGGAAAGG<AGA<<<<<<G<GGIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIIGIIIGGGGGIIII | IGHV4-4*00(2260) | IGHD1-26*00(45) | IGHJ6*00(351) | nan | 75 | 295 | 316 | 0 | 220 | 2200.0 | 24 | 33 | 60 | 221 | 230 | 45.0 | 27 | 61 | 83 | 230 | 264 | 340.0 | nan | nan | nan | GGTGGCTCCATCAGCAGTAGTAACTGG | 38 | |||
5 | 1 | 0.047619 | GGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGGAAATCAATCATAGTGGAAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTGTATTACTGTGCGAGAGGCCCCTATTACGATTTTTGGAGTGGTTATCGAAACCTCTACTACTACTACGGTATGGACGTCTGGGGCCAAGGG | GGIGIIIGGGGGGGGGIIIGIIIIIIIIIIIIIIIIIIIIIGIIIIIIGIIIIIIIIIGGI<<GGGGIIIIIIIGGIIIIIIIIIGGGGGGIIGGGGGGIIIGGGIGIIIIIGGGIGGGGGIGGGGIGGGIGIIIIIIGGIIIIIGIIGGIIIGG:GIIIIGIGIAAAAGGIGIGGGIGGGGGIIIIIGGIIGIGGGIIIIIIIIIIIIIIIIIIIGG<<<<<GGGG<<<GGGGIIIIIIIIIIIIIIIIIGIIIIIIIIGGGGGGIGIIIIIIIIGGGIGIIGGGGIIGI | IGHV4-34*00(2261) | IGHD3-3*00(115) | IGHJ6*00(410) | nan | 75 | 298 | 313 | 0 | 223 | ST295C | 2201.0 | 33 | 56 | 93 | 223 | 246 | 115.0 | 24 | 61 | 83 | 254 | 291 | 370.0 | nan | nan | nan | GGTGGGTCCTTCAGTGGTTACTAC | 38 | ||
6 | 1 | 0.047619 | GGTGGGTCCTTCAGTGGTTACATCTGGACCTGGATCCGCCAGACCCCAGGAAAGGGGCTGCAGTAGATTGGACAAATCAATCATAGTGGAAGCGCCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTACACACGTCCAATAGTCAGTTCTCCCTGGAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTGTATTACTGTGCGAGTCCCAAGATAGCATTCTACAACTGGTTCGACCCCTGGGGCCAGGGA | GIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIGGEIIIIIIIIIIGIIIIIIIGGGG<GGGGIGGIIIIGGIIIIIIIIIIIIGAGIIIIIGGGIIIGIIIIIIGIGIGIGGGGIGGGGGGGGGGGIGIIIIIGG:GGIIIIGGIIIGIGIIIIIIGGGIIIIIIIIIIIGGGGIIIIIIIIIG<GGGG<<GAG<GAIIIIGIIGIIIIIIIIIIIIIIIIIGGGIGGIIIIIGGGGGGGIGIIIIGGIIIGI | IGHV4-34*00(1775) | IGHD7-2700(32),IGHD6-1300(30),IGHD6-25*00(30) | IGHJ5*00(351) | nan | 75 | 290 | 313 | 0 | 215 | ST96ASA97TSG103CSC117ASG125ASG135CSG139ASG146ASG147CSA168GSG213CSG224TSA226GSC227TSA240G | 1715.0 | 0 | 12 | 33 | 215 | 227 | SC4AST7A | 32.0;25 | 31 | 63 | 222 | 228 | 30.0;22 | 28 | 54 | 222 | 228 | 30.0 | ||
7 | 1 | 0.047619 | GGTGACTTCCTCGACGATAGTAGGTGGTGGAGTTGGGTCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGAGAGATCCATCATACGGGGATCACCAACTACATCCCGTCCCTCAAGAGTCGGGTCACCATGTCAGTGGACAAGAACAAGAACCAGGTCTCCCTAGAGGTGTATTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGTGGGGCCCACTATATTTGGAATGGGTGGGGCCAGGGA | GGIIIIIIIGIIGIIGGIGGIGGGIIIIIIIIIIIGIGIIIIIIIIIIIIIIIIIIIIIGIIIIGGGGI<<<IIGIIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIIIIIGAGGGGIGIIGGGIGGIGIIIIIGGGIGIIGGIIGGIIIIIGGIIGGGGGIIIIIIIIGGIIIIIGIIIIIIGGIIGIIIIGIGIGGGIIGIIGGGGGGIIIIIGGIGIIIIIIIIIIIIIGIIIGIIIIIIIIIIGGIIIGG | IGHV4-4*00(1399) | IGHD5-1800(50),IGHD1-2600(45),IGHD5-12*00(45) | IGHJ400(170),IGHJ500(170) | nan | 75 | 293 | 316 | 0 | 218 | SG79ASC82TSA84CSA87GSG88ASA90GSG91ASA97GSC98GSG149ASA152GST156CSG163CST164GSG169TSA181TSA200GSA209GSA215GST222ASC223AST234GSG242ASA243GSC246GSA249TSG250ASC251T | 1368.0 | 10 | 20 | 69 | 224 | 234 | 50.0;9 | 18 | 60 | 223 | 232 | 45.0;10 | 19 | 69 | 224 | 233 | 45.0 | |||
8 | 1 | 0.047619 | GGATTCACCTTCAGTAGCTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTAATAGTGATGGGAGTATCACAAGCTACGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAGCACGCTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCCCCATACGGTAGTAACCACATTTTCGACTACTGGGGCCAGGGA | IIGIIIIIIIIIGIIIIIIIIIIIIIGIIIIIIIIIIIIIGIIIIIIIIIGIIIIII<:GIGGGGIGIIIIIIIIIIGGIIIIGGIIIIIIGGGGGGGIIGGIGGGIGIIIGGGGGGGGGIIIIIIIIIGIIIIGIIIIIIGIIIIGIIIGIIIGGGIGIGGGGGGIGGIGGGIIIIIGGGGIIGIGIIGGGGGGGGGAGGIIIIIIIIIIIIIGIIIGGIIIIIGIIIIIIIGGGIGIGGIIGIGIIIIGGAIIIII | IGHV3-74*00(2152) | IGHD4-2300(46),IGHD1-1400(41),IGHD3-22*00(41) | IGHJ4*00(221) | nan | 75 | 290 | 316 | 0 | 215 | SG169TSA229G | 2092.0 | 23 | 35 | 57 | 219 | 231 | SG29A | 46.0;12 | 26 | 51 | 218 | 232 | DC15I21G | 41.0;3 | 14 | 93 | 223 | 234 | SA6G | 41.0 |
9 | 1 | 0.047619 | AGTTCCAACATCGGAAGCAATACTGTAAACTGGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGTCTCCAGTCTGAGGATGAAGCTGATTATTACTGTGCAGTCTGGGATGACAAGCTGCGTGGTCGGATAATCGGCGGA | IIIIIIGIGG<GAG<G<<G<GGGIIGIIIIIIIIIIIGGGIIIIGGGIGIIIGIIIIIGG<<<<<GGIIIGGGGGGGGIIIIIIIIIIGIGIIIIIIIIIIIIGIIIIIGGIGGIGGGGGIIIIIIIIIIIGIIIGIIGGG | IGLV1-44*00(627) | nan | IGLJ200(223),IGLJ300(222) | nan | 75 | 109 | 316 | 0 | 34 | SC77TST92C | 282.0 | nan | 25 | 36 | 58 | 130 | 141 | ST27A | 81.0;28 | 36 | 58 | 133 | 141 | 80.0 | nan | nan | nan | AGTTCCAACATCGGAAGCAATACT | 27 | |
10 | 1 | 0.047619 | AGCAGTGATATTGGGGGTTATAACCATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATCATTTATGAGCACCCGTTATGTCGTCGGAACA | GIIIIGGIG<<GAGGGIIIGGIIGGGGGIIIIIIIIIIGIIIIIIGIIIIIIIIIIIIGGIIIIIIGIIIGIIGIIIIIGGGGGGGGGGGIGIIIIIIIIIGG | IGLV2-2300(1357),IGLV2-1400(1318) | nan | IGLJ1*00(264) | nan | 75 | 156 | 317 | 0 | 81 | SG84ASA90GST100ASG119ASG146C | 665.0;75 | 155 | 317 | 0 | 80 | SC83TSG84AST89GST99CSG119ASG146C | 626.0 | nan | 20 | 36 | 58 | 87 | 103 | ST27GST35A | 102.0 | nan | nan | nan | AGCAGTGATATTGGGGGTTATAACCAT | 27 |
11 | 1 | 0.047619 | AGCAGTGATATTGGGAGGTCTGGCAACACGTCCTCCCTGACAATGTTTGGGCTCCAGGGTGAGGACGAGGCTGATAATTACTGCTACTCATATGCGTTTAATTGGACGTCTTCTAAACT | IGGIIIGGIGIGIGGGIGIIGIIIIIGIIIIIIIIGGIGGGGGIIIIIIIIIIGIIIIIGGIGIIIIIIIIIIIGGGGIIIIIIGIIIIIIGIIGIIIIIIIIIIIIII7AAGGAG<<7 | IGLV2-23*00(167) | nan | IGLJ1*00(224) | nan | 75 | 92 | 317 | 0 | 17 | SG84A | 141.0 | nan | 24 | 36 | 58 | 107 | 119 | SG30TSG31A | 62.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
12 | 1 | 0.047619 | AGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCACCTCATATACAAGCAGTGCGAATTATGTCTTCGGAACT | GIIIIIIGGGGGIIIIIGIGIGIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIG<<GGGIIIIIGIIIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGGGGIIIIIIIIIIIIIIGIIIIIGIAG | IGLV2-14*00(808) | nan | IGLJ1*00(370) | nan | 75 | 112 | 317 | 0 | 37 | 370.0 | nan | 18 | 36 | 58 | 123 | 141 | 180.0 | nan | nan | nan | AGCAGTGACGTTGGTGGTTATAACTAT | 38 | nan | nan | nan | nan | nan | nan | ||
13 | 1 | 0.047619 | AGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCTCTATGTCTTCGGAACT | <<G<<<GGGAGGGGIGIIAGAGIEIIIIIIIIIIGGIIGGIIIIIIIGGIGGIIIGIGIGGGGGGGAAGGGGGIIGIGGGGGAAG:GIIGGIIIGIIIGIIGGGGGGGIIIIGGGIGIIGGGGGIIIIIGGGIIGGIIIIIIIIIIIIIIIIIIIIIIIIIIIGGGG<<<<GIIGIIIIGIIIGIIIIIIIIIGIIIGIIIIIIIIIIGIIIIIIIGGIGIIIIIIIIIIIIIIIIG | IGLV2-14*00(2850) | nan | IGLJ1*00(190) | nan | 75 | 297 | 317 | 0 | 222 | 2220.0 | nan | 21 | 36 | 58 | 222 | 237 | 150.0 | nan | nan | nan | AGCAGTGACGTTGGTGGTTATAACTAT | 27 | GTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTAT | 32 | GATGTCAGT | 25 | AATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTAC | 27 | ||
14 | 1 | 0.047619 | AGCAGTGACGTTGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCGACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAACTCATATACAAGCAGCAGCACTTATGTCTTCGGAACT | GGIIIIIIIIGGGGGIIGIIIIIIIIIIIIIIIIIIIIIGGGGGIIIIIIIIGG:IIIIIGGIGGGGGGGGIGGGGAIGGGGGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGIIGGGGGIGIIIIIIGAA | IGLV2-14*00(845) | nan | IGLJ1*00(281) | nan | 75 | 294 | 317 | 0 | 132 | DT89DG90DG91DT92DT93DA94DT95DA96DA97DC98DT99DA100DT101DG102DT103DC104DT105DC106DC107DT108DG109DG110DT111DA112DC113DC114DA115DA116DC117DA118DG119DC120DA121DC122DC123DC124DA125DG126DG127DC128DA129DA130DA131DG132DC133DC134DC135DC136DC137DA138DA139DA140DC141DT142DC143DA144DT145DG146DA147DT148DT149DT150DA151DT152DG153DA154DT155DG156DT157DC158DA159DG160DT161DA162DA163DT164DC165DG166DG167DC168DC169DC170DT171DC172DA173DG174DG175SA210GSG271A | 254.0 | nan | 20 | 36 | 58 | 131 | 147 | 160.0 | nan | nan | nan | AGCAGTGACGTTGG | 38 | nan | 0 | nan | 0 | GGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCGACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTAC | 25 | |
15 | 1 | 0.047619 | AGCAGTGACGTTGGGATCTCGGATCGCTTCTCTGGTTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCGTTTACAACCAGCGACACTTATGTCTTCGGAAGT | IGIIGGGIGIGIGGGGIGGGIIIIIIGGIIIIIIIIGGGGIGGGIIIIIIIIIIIIIIIIG<GGGGGGIIGGGGGIGIIGIIGGGIIGIIIGIIIIIGIGIIIIIIIIIIIIIIIGIIGIIIIIIIIIGIGIIIGG:IIIIIIIGIG | IGLV2-14*00(721) | nan | IGLJ1*00(271) | nan | 75 | 89 | 317 | 0 | 14 | 140.0 | nan | 20 | 36 | 58 | 131 | 147 | SC34G | 131.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | |
16 | 1 | 0.047619 | AGCAGTGACATTGGTCTGGCAACACGGCCTCCCTGACCATCTCTGGACTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTGATGTCTTCGGAACT | IIIIIIIIGGGGGGGGIIIIGIIIIIIIIIIIIIIGIIIIIIIGGGIIIIGGGGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGIIIIIIIGG<G | IGLV2-14*00(416) | nan | IGLJ1*00(302) | nan | 75 | 90 | 317 | 0 | 15 | SG84A | 121.0 | nan | 22 | 36 | 58 | 105 | 119 | 140.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | |
17 | 1 | 0.047619 | AGCAGTGACATTGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCACCTCATGGACAACCAGCACCACTATGATATTCGGCGGA | II<IIGIIGG<<<GGGGIIIIIIIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIGIGGGGGIGIGIGGGGGIIIIIIGIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIIGGGGIIGIIIIIIIIIIGGGG<IIGGGIA | IGLV2-14*00(859) | nan | IGLJ2*00(164) | nan | 75 | 294 | 317 | 0 | 132 | SG84ADT89DG90DG91DT92DT93DA94DT95DA96DA97DC98DT99DA100DT101DG102DT103DC104DT105DC106DC107DT108DG109DG110DT111DA112DC113DC114DA115DA116DC117DA118DG119DC120DA121DC122DC123DC124DA125DG126DG127DC128DA129DA130DA131DG132DC133DC134DC135DC136DC137DA138DA139DA140DC141DT142DC143DA144DT145DG146DA147DT148DT149DT150DA151DT152DG153DA154DT155DG156DT157DC158DA159DG160DT161DA162DA163DT164DC165DG166DG167DC168DC169DC170DT171DC172DA173DG174DG175SG271CSA277GST278GSG283CSG289C | 138.0 | nan | 22 | 36 | 58 | 133 | 147 | SG24A | 111.0 | nan | nan | nan | AGCAGTGACATTGG | 27 | nan | 0 | nan | 0 | GGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTAC | 38 |
18 | 1 | 0.047619 | AGCAGAGATATTGGGACTTCTAACCTTGTCTCCTGGTACCAACAGTATCCAGGTATTTCTACTTCCCTTTTTGTCTTCGGAACA | IIIIGIIIG<GGGGG<GGGIIIIGIIIIIIIIIIIIIGIIIGIIIIIIIIIIIIIIIIIIIGIIIIGIGGIIIIIIIIIIIGG< | IGLV2-23*00(1048) | nan | IGLJ1*00(264) | nan | 75 | 128 | 317 | 0 | 53 | ST80ASG84ASG91CSA94CSC120TSC122T | 356.0 | nan | 20 | 36 | 58 | 68 | 84 | SA22TST35A | 102.0 | nan | nan | nan | AGCAGAGATATTGGGACTTCTAACCTT | 27 | nan | nan | nan | nan | nan | nan |
Quality control
Now that all files have been processed, let's perform quality control. The first thing to check is the alignment rate. That can be easily done using the mixcr exportQc align
function.
mixcr exportQc align results/*.clns figs/alignQc.pdf
Here we can see the percentage of successfully aligned reads for every sample, as well as the percentage of reads that failed to align for various reasons. Some samples have a good alignment rate (more than 90%), but many samples have issues that lead to an alignment rate as low as 10-20%. According to the plot, the two major reasons for that outcome are:
- the lack of TCR/IG sequence
- absence of J hits
Let's take one of the low-quality samples (e.g. SRR8365459_HIP1_female) and examine it. To look at the read alignments for that sample, we will first run the mixcr align
command for that sample once again, but this time we will specify additional options (-OallowPartialAlignments=true -OallowNoCDR3PartAlignments=true
) that will preserve partially aligned reads (e.g. reads that lack a J gene hit) and reads that lack the CDR3
sequence.
mkdir -p debug
mixcr align -f \
--species hsa \
-p kAligner2_4.0 \
--tag-pattern "^N{32}(R1:*)\^N{26}(R2:*)" \
-OvParameters.geneFeatureToAlign="VTranscriptWithout5UTRWithP" \
-OvParameters.parameters.floatingLeftBound=false \
-OjParameters.parameters.floatingRightBound=false \
-OallowNoCDR3PartAlignments=true \
-OallowPartialAlignments=true \
--not-aligned-R1 debug/SRR8365459_HIP1_female_notAligned_R1.fastq \
--not-aligned-R2 debug/SRR8365459_HIP1_female_notAligned_R2.fastq \
--report debug/SRR8365459_HIP1_female_debug.report \
raw/SRR8365459_HIP1_female_R1.fastq.gz raw/SRR8365459_HIP1_female_R2.fastq.gz \
debug/SRR8365459_HIP1_female_debug.vdjca
Now, when we have a new .vdjca
file let's visualize how reads cover FRs and CDRs regions for that sample.
mixcr exportQc coverage \
debug/SRR8365459_HIP1_female_debug.vdjca \
figs/SRR8365459_HIP1_female_debug.vdjca.coverage.pdf
This will generate three .pdf
formatted plots: R1
alignment, R2
alignment and alignment of overlapped reads. These plots can tell us the percentage of reads that cover each region at a certain position. Briefly, for this sample, only those reads that overlap show a good coverage pattern.
Finally, we can look at raw alignments using mixcr exportAlignmentsPretty
.
The command below will generate a .txt
human-readable file with alignments. We use parameter --skip 1000
to skip the first 1000 reads, as the first reads usually have bad quality, and --limit 100
will export only 100 alignments as we usually don't need to examine every alignment to see the issue.
mixcr exportAlignmentsPretty \
--skip 1000 \
--limit 100 \
debug/SRR8365459_HIP1_female_debug.vdjca \
debug/SRR8365459_HIP1_female_debug.alignments.txt
Below you can see a few alignments from the generated file. The first one is an example of a well-aligned read.
>>> Read ids: 1840
FR1><CDR1 CDR1><FR2
_ L R L S C A A S G F T L S D Y Y M S W I R Q A P G K
Quality 77767826778888888888888887788888888888878888888888888887888778888888888888887878
Target0 0 CCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTGAGTGACTACTACATGAGCTGGATCCGCCAGGCTCCAGGGAAGG 79 Score
IGHV3-11*00 107 cctgagactctcctgtgcagcctctggattcaccttCagtgactactacatgagctggatccgccaggctccagggaagg 186 2102
FR2><CDR2 CDR2><FR3
G L E W V S H I S G S G N T I D Y A D S V K G R F T I
Quality 88788887778888888888887788888888888888888888888888888888887888888888888888888888
Target0 80 GGCTGGAGTGGGTTTCACACATTAGTGGCAGTGGTAATACCATAGACTACGCAGACTCTGTGAAGGGCCGATTCACCATC 159 Score
IGHV3-11*00 187 ggctggagtgggtttcaTacattagtAgTagtggtaGtaccataTactacgcagactctgtgaagggccgattcaccatc 266 2102
FR3><CDR3
S R D N A K N S L Y L Q M N S L R D D D T A V F Y C A
Quality 88888888888888888888888888888888888888888888888888888888888888888888888887888788
Target0 160 TCCAGGGACAACGCCAAGAACTCGCTCTATCTGCAAATGAACAGCCTGAGAGACGACGACACGGCCGTGTTTTATTGTGC 239 Score
IGHV3-11*00 267 tccagggacaacgccaagaactcActGtatctgcaaatgaacagcctgagagCcgaGgacacggccgtgtAttaCtgtgc 346 2102
V> <D D> <J CDR3><FR4 FR4>
R G R Y A L D Y W G Q G T R V T V S S _
Quality 88878788888888788888888888888888888888888888888888888888888888877776
Target0 240 GAGAGGCCGTTATGCCCTAGATTATTGGGGCCAGGGAACCCGGGTCACCGTCTCCTCAGGTAAGCCCC 307 Score
IGHV3-11*00 347 gagag 351 2102
IGHD3-16*00 65 cgttatAcc 73 31
IGHD2-2*00 56 tatgcc 61 30
IGHD3-10*00 61 cgttat 66 30
IGHJ4*00 28 gaCtaCtggggccagggaacccTggtcaccgtctcctcag 67 313
IGHJ5*00 37 tggggccagggaacccTggtcaccgtctcctcag 70 311
Now, the following alignment is a troubled one. And for this particular sample the majority of alignments look similar. We can see that one read does not align to any reference sequences and has a lot of low-quality nucleotides.
>>> Read ids: 1853
Quality 25762677888767572572527252276555757625555572625275777255225225525252725222252552
Target0 0 CGGCATTCCTGCTGAAACGAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTCACATTAAA 79 Score
Quality 57222255256566257772557225555565256777672262255775556255567762277262262677727762
Target0 80 AAAAAAAACCAGCAGTGATGTTGGCAGTTATGACTATGTCTCCTGGTACCAACAGCACCCAGGCACAGTCTCCAAACCCA 159 Score
Quality 42222547622525426677662772660000000000000000000000000000000000000000000000000000
Target0 160 TGACGTACAATGACAATACTCAGCCCTCAGGGGTCCCTGATCGATTCTCTGGCTTCAAGTCTGGCAATACGCCCTCCATG 239 Score
Quality 00000000000
Target0 240 ACCATCCTTAG 250 Score
CDR2><F
CDR1><FR2 FR2><CDR2
F G S Y V Y V S W Y Q Q H S S T V P K P M I D N V N T
Quality 00000000000000000000000000000000000000000006247675667452527762544226662675276772
Target1 0 TTTGGGAGTTATGTCTATGTCTCCTGGTACCAACAGCACTCAAGCACAGTCCCCAAACCCATGATCGACAATGTCAATAC 79 Score
IGLV2-5*00 141 ttgggagttatgActatgtctcctggtaccaacagcacCcaGgcacagtccccaaacccatgatcTacaatgtcaatac 219 1445
R3
Q P S G V P D R F S G S K S G N T A S M T I S G L *
Quality 22262275272725267265252267676726267252555266222725267626677675277777757552652222
Target1 80 TCAGCCCTCAGGGGTCCCTGATCGTTTCTCTGGCTCCAAGTCTGGCAATACGGCCTCCATGACCATCTCTGGACTCTAGG 159 Score
IGLV2-5*00 220 tcagccctcaggggtccctgatcgtttctctggctccaagtctggcaatacggcctccatgaccatctctggactcCagg 299 1445
V Y K Q K T A Y E C Q S R S R H S C * T A L P I S A F
Quality 55222252675222525222752252222225622525266672252275226775565552222577665575255625
Target1 160 TTTACAAGCAGAAGACGGCATACGAGTGCCAGTCCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCTCGGCATTC 239 Score
L L T _
Quality 72552252777
Target1 240 CTGCTGACCCG 250 Score
Another quality report we should investigate is the chain abundance plot.
mixcr exportQc chainUsage \
results/*.clns \
figs/chainUsage.pdf
From that plot we can see another issue. According to the publication, the data was generated using a multiplex protocol with V and J primers, in such a way that every sample should have sequences for both heavy and light IG chains. But we see that most samples have only one of the chains present, and those samples that have both still show an unexpected distribution: it should be about 50/50, as every cell has both chains.
Full-length clonotype assembly
The BIOMED2 BCR protocol allows recovering a broader BCR sequence than just the CDR3
region. According to the protocol, forward primers are located in FR1
region, thus we can safely use an assembling feature that starts from CDR1
and be sure that no primers will affect the original sequence. The reverse primers are located in FR4
region very close to CDR3
, thus there is not much of FR4 left to include in clone assembly.
Taking into account what is mentioned above, the longest possible assembling feature for this protocol is "{CDR1Begin:CDR3End}"
.
MiXCR has a specific preset to obtain full-length BCR clones with Biomed2 protocol:
mixcr analyze biomed2-human-rna-igh \
raw/SRR8365468_HIP2_male_R1.fastq.gz \
raw/SRR8365468_HIP2_male_R2.fastq.gz \
results/SRR8365468_HIP2_male
The mixcr assemble
step in this preset differs from the one above in the following manner:
mixcr assemble \
-OassemblingFeatures="{CDR1Begin:CDR3End}" \
-OseparateByJ=true \
--report results/SRR8365468_HIP2_male.report \
--json-report results/SRR8365468_HIP2_male.json \
results/SRR8365468_HIP2_male.vdjca \
results/SRR8365468_HIP2_male.clns
-OassemblingFeatures="{CDR1Begin:CDR3End}"
- sets the assembling feature to the region which starts from
CDR1Begin
and ends at the end of CDR3
.
Notice that we do not use -OseparateByV=true
in this case because the assembling feature already covers most of the V region; thus, even if clones that differ in their V region have an identical CDR3
they will still be separated.
Reports
Finally, MiXCR provides a very convenient way to look at the reports generated at each step. Every .vdjca
, .clns
and .clna
file holds all the reports for every MiXCR function that has been applied to this sample. E.g. in our case .clns
file contains reports for mixcr align
and mixcr assemble
. To output this report use mixcr exportReports
as shown below. Note that the --json
parameter will output a JSON-formatted report.
mixcr exportReports \
results/SRR8365468_HIP2_male.clns \
figs/SRR8365468_HIP2_male.report.txt
mixcr exportReports \
--json \
results/SRR8365468_HIP2_male.clns \
figs/SRR8365468_HIP2_male.report.json
Show report file
============== Align Report ==============
Input file(s): /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz,/raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz
Output file(s): results/SRR8365468_HIP2_male.vdjca
Version: ; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0
Command line arguments: --report results/SRR8365468_HIP2_male.align.report.txt --json-report results/SRR8365468_HIP2_male.align.report.json --preset local:biomed2-human-bcr-full-length +limitInput 100000 /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz results/SRR8365468_HIP2_male.vdjca
Analysis time: 0ns
Total sequencing reads: 100000
Successfully aligned reads: 760 (0.76%)
Paired-end alignment conflicts eliminated: 98 (0.1%)
Alignment failed, no hits (not TCR/IG?): 82357 (82.36%)
Alignment failed because of absence of V hits: 51 (0.05%)
Alignment failed because of absence of J hits: 16810 (16.81%)
No target with both V and J alignments: 22 (0.02%)
Overlapped: 25788 (25.79%)
Overlapped and aligned: 547 (0.55%)
Alignment-aided overlaps: 20 (3.66%)
Overlapped and not aligned: 25241 (25.24%)
No CDR3 parts alignments, percent of successfully aligned: 2 (0.26%)
Partial aligned reads, percent of successfully aligned: 9 (1.18%)
V gene chimeras: 9 (0.01%)
IGH chains: 44 (5.79%)
IGH non-functional: 1 (2.27%)
IGK chains: 496 (65.26%)
IGK non-functional: 15 (3.02%)
IGL chains: 220 (28.95%)
IGL non-functional: 11 (5%)
Realigned with forced non-floating bound: 148464 (148.46%)
Realigned with forced non-floating right bound in left read: 997 (1%)
Realigned with forced non-floating left bound in right read: 997 (1%)
============== Assemble Report ==============
Input file(s): results/SRR8365468_HIP2_male.vdjca
Output file(s): results/SRR8365468_HIP2_male.clns
Version: ; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0
Command line arguments: --report results/SRR8365468_HIP2_male.assemble.report.txt --json-report results/SRR8365468_HIP2_male.assemble.report.json results/SRR8365468_HIP2_male.vdjca results/SRR8365468_HIP2_male.clns
Analysis time: 0ns
Final clonotype count: 19
Average number of reads per clonotype: 1.11
Reads used in clonotypes, percent of total: 21 (0.02%)
Reads used in clonotypes before clustering, percent of total: 21 (0.02%)
Number of reads used as a core, percent of used: 21 (100%)
Mapped low quality reads, percent of used: 0 (0%)
Reads clustered in PCR error correction, percent of used: 0 (0%)
Reads pre-clustered due to the similar VJC-lists, percent of used: 0 (0%)
Reads dropped due to the lack of a clone sequence, percent of total: 610 (0.61%)
Reads dropped due to a too short clonal sequence, percent of total: 0 (0%)
Reads dropped due to low quality, percent of total: 0 (0%)
Reads dropped due to failed mapping, percent of total: 129 (0.13%)
Reads dropped with low quality clones, percent of total: 0 (0%)
Clonotypes eliminated by PCR error correction: 0
Clonotypes dropped as low quality: 0
Clonotypes pre-clustered due to the similar VJC-lists: 0
IGH chains: 7 (36.84%)
IGH non-functional: 0 (0%)
IGL chains: 12 (63.16%)
IGL non-functional: 0 (0%)
{
"type": "alignerReport",
"commandLine": "--report results/SRR8365468_HIP2_male.align.report.txt --json-report results/SRR8365468_HIP2_male.align.report.json --preset local:biomed2-human-bcr-full-length +limitInput 100000 /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz results/SRR8365468_HIP2_male.vdjca",
"inputFiles": [
"/raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz",
"/raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz"
],
"outputFiles": [
"results/SRR8365468_HIP2_male.vdjca"
],
"version": "; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0",
"trimmingReport": null,
"totalReadsProcessed": 100000,
"aligned": 760,
"notAligned": 99240,
"notAlignedReasons": {
"VAndJOnDifferentTargets": 22,
"NoVHits": 51,
"NoJHits": 16810,
"LowTotalScore": 0,
"NoBarcode": 0,
"NoHits": 82357,
"NoCDR3Parts": 0
},
"chimeras": 0,
"overlapped": 25788,
"alignmentAidedOverlaps": 20,
"overlappedAligned": 547,
"overlappedNotAligned": 25241,
"pairedEndAlignmentConflicts": 98,
"vChimeras": 9,
"jChimeras": 0,
"chainUsage": {
"type": "chainUsage",
"chimeras": 0,
"total": 760,
"chains": {
"IGH": {
"total": 44,
"nonFunctional": 1,
"isOOF": 0,
"hasStops": 1
},
"IGK": {
"total": 496,
"nonFunctional": 15,
"isOOF": 10,
"hasStops": 5
},
"IGL": {
"total": 220,
"nonFunctional": 11,
"isOOF": 7,
"hasStops": 4
}
}
},
"realignedWithForcedNonFloatingBound": 148464,
"realignedWithForcedNonFloatingRightBoundInLeftRead": 997,
"realignedWithForcedNonFloatingLeftBoundInRightRead": 997,
"noCDR3PartsAlignments": 2,
"partialAlignments": 9,
"tagReport": {
"type": "tagReport"
}
}
{
"type": "assemblerReport",
"commandLine": "--report results/SRR8365468_HIP2_male.assemble.report.txt --json-report results/SRR8365468_HIP2_male.assemble.report.json results/SRR8365468_HIP2_male.vdjca results/SRR8365468_HIP2_male.clns",
"inputFiles": [
"results/SRR8365468_HIP2_male.vdjca"
],
"outputFiles": [
"results/SRR8365468_HIP2_male.clns"
],
"version": "; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0",
"preCloneAssemblerReport": null,
"totalReadsProcessed": 100000,
"initialClonesCreated": 19,
"readsDroppedNoTargetSequence": 610,
"readsDroppedTooShortClonalSequence": 0,
"readsDroppedLowQuality": 0,
"coreReads": 21,
"readsDroppedFailedMapping": 129,
"lowQualityRescued": 0,
"clonesClustered": 0,
"readsClustered": 0,
"clones": 19,
"clonesDroppedAsLowQuality": 0,
"clonesPreClustered": 0,
"readsPreClustered": 0,
"readsInClones": 21,
"readsInClonesBeforeClustering": 21,
"readsDroppedWithLowQualityClones": 0,
"clonalChainUsage": {
"type": "chainUsage",
"chimeras": 0,
"total": 19,
"chains": {
"IGH": {
"total": 7,
"nonFunctional": 0,
"isOOF": 0,
"hasStops": 0
},
"IGL": {
"total": 12,
"nonFunctional": 0,
"isOOF": 0,
"hasStops": 0
}
}
}
}