Skip to content

BIOMED2

Here we will discuss how to process BCR cDNA libraries obtained with BIOMED2 kit.

Data libraries

This tutorial uses the data from the following publication: High frequency of shared clonotypes in human B cell receptor repertoires. Soto C et al., Nature, 2019 Feb;566(7744):398-402 doi: 10.1038/s41586-019-0934-8

Peripheral blood samples were collected from three donors: HIP1 (female, 47 y.o.), HIP2 (male, 22 y.o.) and HIP3 (male, 29 y.o.). Samples were collected in multiple replicas. Total RNA was extracted from PBMCs and then RT-PCR was utilized to generate BCR amplicon libraries using BIOMED2 multiplex primers. Subsequent libraries were sequenced using next-generation sequencing.

All data is available from SRA (PRJNA511481) using e.g. SRA Explorer.

Use aria2c for efficient download of the full dataset with the proper filenames:

download.sh
# Create the directory that the out=raw/... entries of download-list.txt point to.
mkdir -p raw

# Download every URL listed in download-list.txt:
#   --continue                    resume partially downloaded files
#   --split / --max-connection-per-server / --min-split-size
#                                 up to 16 connections per file, 1M minimum piece
#   --max-concurrent-downloads    up to 8 files at the same time
aria2c \
    --continue=true \
    --split=16 \
    --max-connection-per-server=16 \
    --min-split-size=1M \
    --max-concurrent-downloads=8 \
    --input-file=download-list.txt
download-list.txt
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365468/SRR8365468_1.fastq.gz
  out=raw/SRR8365468_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365468/SRR8365468_2.fastq.gz
  out=raw/SRR8365468_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365457/SRR8365457_1.fastq.gz
  out=raw/SRR8365457_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365457/SRR8365457_2.fastq.gz
  out=raw/SRR8365457_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365458/SRR8365458_1.fastq.gz
  out=raw/SRR8365458_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365458/SRR8365458_2.fastq.gz
  out=raw/SRR8365458_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365459/SRR8365459_1.fastq.gz
  out=raw/SRR8365459_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365459/SRR8365459_2.fastq.gz
  out=raw/SRR8365459_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365463/SRR8365463_1.fastq.gz
  out=raw/SRR8365463_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365463/SRR8365463_2.fastq.gz
  out=raw/SRR8365463_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365469/SRR8365469_1.fastq.gz
  out=raw/SRR8365469_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365469/SRR8365469_2.fastq.gz
  out=raw/SRR8365469_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365465/SRR8365465_1.fastq.gz
  out=raw/SRR8365465_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365465/SRR8365465_2.fastq.gz
  out=raw/SRR8365465_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365467/SRR8365467_1.fastq.gz
  out=raw/SRR8365467_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365467/SRR8365467_2.fastq.gz
  out=raw/SRR8365467_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365464/SRR8365464_1.fastq.gz
  out=raw/SRR8365464_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365464/SRR8365464_2.fastq.gz
  out=raw/SRR8365464_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365450/SRR8365450_1.fastq.gz
  out=raw/SRR8365450_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365450/SRR8365450_2.fastq.gz
  out=raw/SRR8365450_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365461/SRR8365461_1.fastq.gz
  out=raw/SRR8365461_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365461/SRR8365461_2.fastq.gz
  out=raw/SRR8365461_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365462/SRR8365462_1.fastq.gz
  out=raw/SRR8365462_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365462/SRR8365462_2.fastq.gz
  out=raw/SRR8365462_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365456/SRR8365456_1.fastq.gz
  out=raw/SRR8365456_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365456/SRR8365456_2.fastq.gz
  out=raw/SRR8365456_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365475/SRR8365475_1.fastq.gz
  out=raw/SRR8365475_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365475/SRR8365475_2.fastq.gz
  out=raw/SRR8365475_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365460/SRR8365460_1.fastq.gz
  out=raw/SRR8365460_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365460/SRR8365460_2.fastq.gz
  out=raw/SRR8365460_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365470/SRR8365470_1.fastq.gz
  out=raw/SRR8365470_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365470/SRR8365470_2.fastq.gz
  out=raw/SRR8365470_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365471/SRR8365471_1.fastq.gz
  out=raw/SRR8365471_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365471/SRR8365471_2.fastq.gz
  out=raw/SRR8365471_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365473/SRR8365473_1.fastq.gz
  out=raw/SRR8365473_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365473/SRR8365473_2.fastq.gz
  out=raw/SRR8365473_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365449/SRR8365449_1.fastq.gz
  out=raw/SRR8365449_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365449/SRR8365449_2.fastq.gz
  out=raw/SRR8365449_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365482/SRR8365482_1.fastq.gz
  out=raw/SRR8365482_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365482/SRR8365482_2.fastq.gz
  out=raw/SRR8365482_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365446/SRR8365446_1.fastq.gz
  out=raw/SRR8365446_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365446/SRR8365446_2.fastq.gz
  out=raw/SRR8365446_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365483/SRR8365483_1.fastq.gz
  out=raw/SRR8365483_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365483/SRR8365483_2.fastq.gz
  out=raw/SRR8365483_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365447/SRR8365447_1.fastq.gz
  out=raw/SRR8365447_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365447/SRR8365447_2.fastq.gz
  out=raw/SRR8365447_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365484/SRR8365484_1.fastq.gz
  out=raw/SRR8365484_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365484/SRR8365484_2.fastq.gz
  out=raw/SRR8365484_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365448/SRR8365448_1.fastq.gz
  out=raw/SRR8365448_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365448/SRR8365448_2.fastq.gz
  out=raw/SRR8365448_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365424/SRR8365424_1.fastq.gz
  out=raw/SRR8365424_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365424/SRR8365424_2.fastq.gz
  out=raw/SRR8365424_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365485/SRR8365485_1.fastq.gz
  out=raw/SRR8365485_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365485/SRR8365485_2.fastq.gz
  out=raw/SRR8365485_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365488/SRR8365488_1.fastq.gz
  out=raw/SRR8365488_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365488/SRR8365488_2.fastq.gz
  out=raw/SRR8365488_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365421/SRR8365421_1.fastq.gz
  out=raw/SRR8365421_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365421/SRR8365421_2.fastq.gz
  out=raw/SRR8365421_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365489/SRR8365489_1.fastq.gz
  out=raw/SRR8365489_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365489/SRR8365489_2.fastq.gz
  out=raw/SRR8365489_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365490/SRR8365490_1.fastq.gz
  out=raw/SRR8365490_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365490/SRR8365490_2.fastq.gz
  out=raw/SRR8365490_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365246/SRR8365246_1.fastq.gz
  out=raw/SRR8365246_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365246/SRR8365246_2.fastq.gz
  out=raw/SRR8365246_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365474/SRR8365474_1.fastq.gz
  out=raw/SRR8365474_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365474/SRR8365474_2.fastq.gz
  out=raw/SRR8365474_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365422/SRR8365422_1.fastq.gz
  out=raw/SRR8365422_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365422/SRR8365422_2.fastq.gz
  out=raw/SRR8365422_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365423/SRR8365423_1.fastq.gz
  out=raw/SRR8365423_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365423/SRR8365423_2.fastq.gz
  out=raw/SRR8365423_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365420/SRR8365420_1.fastq.gz
  out=raw/SRR8365420_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365420/SRR8365420_2.fastq.gz
  out=raw/SRR8365420_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365419/SRR8365419_1.fastq.gz
  out=raw/SRR8365419_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365419/SRR8365419_2.fastq.gz
  out=raw/SRR8365419_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365248/SRR8365248_1.fastq.gz
  out=raw/SRR8365248_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365248/SRR8365248_2.fastq.gz
  out=raw/SRR8365248_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365249/SRR8365249_1.fastq.gz
  out=raw/SRR8365249_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365249/SRR8365249_2.fastq.gz
  out=raw/SRR8365249_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365247/SRR8365247_1.fastq.gz
  out=raw/SRR8365247_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365247/SRR8365247_2.fastq.gz
  out=raw/SRR8365247_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365250/SRR8365250_1.fastq.gz
  out=raw/SRR8365250_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365250/SRR8365250_2.fastq.gz
  out=raw/SRR8365250_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365348/SRR8365348_1.fastq.gz
  out=raw/SRR8365348_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365348/SRR8365348_2.fastq.gz
  out=raw/SRR8365348_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365251/SRR8365251_1.fastq.gz
  out=raw/SRR8365251_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365251/SRR8365251_2.fastq.gz
  out=raw/SRR8365251_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365418/SRR8365418_1.fastq.gz
  out=raw/SRR8365418_HIP3_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365418/SRR8365418_2.fastq.gz
  out=raw/SRR8365418_HIP3_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365310/SRR8365310_1.fastq.gz
  out=raw/SRR8365310_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365310/SRR8365310_2.fastq.gz
  out=raw/SRR8365310_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365252/SRR8365252_1.fastq.gz
  out=raw/SRR8365252_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365252/SRR8365252_2.fastq.gz
  out=raw/SRR8365252_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365308/SRR8365308_1.fastq.gz
  out=raw/SRR8365308_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365308/SRR8365308_2.fastq.gz
  out=raw/SRR8365308_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365309/SRR8365309_1.fastq.gz
  out=raw/SRR8365309_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365309/SRR8365309_2.fastq.gz
  out=raw/SRR8365309_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365253/SRR8365253_1.fastq.gz
  out=raw/SRR8365253_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365253/SRR8365253_2.fastq.gz
  out=raw/SRR8365253_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365307/SRR8365307_1.fastq.gz
  out=raw/SRR8365307_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365307/SRR8365307_2.fastq.gz
  out=raw/SRR8365307_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365259/SRR8365259_1.fastq.gz
  out=raw/SRR8365259_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365259/SRR8365259_2.fastq.gz
  out=raw/SRR8365259_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365258/SRR8365258_1.fastq.gz
  out=raw/SRR8365258_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365258/SRR8365258_2.fastq.gz
  out=raw/SRR8365258_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365306/SRR8365306_1.fastq.gz
  out=raw/SRR8365306_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365306/SRR8365306_2.fastq.gz
  out=raw/SRR8365306_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365305/SRR8365305_1.fastq.gz
  out=raw/SRR8365305_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365305/SRR8365305_2.fastq.gz
  out=raw/SRR8365305_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365304/SRR8365304_1.fastq.gz
  out=raw/SRR8365304_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365304/SRR8365304_2.fastq.gz
  out=raw/SRR8365304_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365260/SRR8365260_1.fastq.gz
  out=raw/SRR8365260_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365260/SRR8365260_2.fastq.gz
  out=raw/SRR8365260_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365303/SRR8365303_1.fastq.gz
  out=raw/SRR8365303_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365303/SRR8365303_2.fastq.gz
  out=raw/SRR8365303_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365261/SRR8365261_1.fastq.gz
  out=raw/SRR8365261_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365261/SRR8365261_2.fastq.gz
  out=raw/SRR8365261_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365262/SRR8365262_1.fastq.gz
  out=raw/SRR8365262_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365262/SRR8365262_2.fastq.gz
  out=raw/SRR8365262_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365302/SRR8365302_1.fastq.gz
  out=raw/SRR8365302_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365302/SRR8365302_2.fastq.gz
  out=raw/SRR8365302_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365301/SRR8365301_1.fastq.gz
  out=raw/SRR8365301_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365301/SRR8365301_2.fastq.gz
  out=raw/SRR8365301_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365263/SRR8365263_1.fastq.gz
  out=raw/SRR8365263_HIP2_male_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365263/SRR8365263_2.fastq.gz
  out=raw/SRR8365263_HIP2_male_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365264/SRR8365264_1.fastq.gz
  out=raw/SRR8365264_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365264/SRR8365264_2.fastq.gz
  out=raw/SRR8365264_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365267/SRR8365267_1.fastq.gz
  out=raw/SRR8365267_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/007/SRR8365267/SRR8365267_2.fastq.gz
  out=raw/SRR8365267_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365274/SRR8365274_1.fastq.gz
  out=raw/SRR8365274_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/004/SRR8365274/SRR8365274_2.fastq.gz
  out=raw/SRR8365274_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365269/SRR8365269_1.fastq.gz
  out=raw/SRR8365269_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/009/SRR8365269/SRR8365269_2.fastq.gz
  out=raw/SRR8365269_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365271/SRR8365271_1.fastq.gz
  out=raw/SRR8365271_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/001/SRR8365271/SRR8365271_2.fastq.gz
  out=raw/SRR8365271_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365273/SRR8365273_1.fastq.gz
  out=raw/SRR8365273_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8365273/SRR8365273_2.fastq.gz
  out=raw/SRR8365273_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365265/SRR8365265_1.fastq.gz
  out=raw/SRR8365265_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/005/SRR8365265/SRR8365265_2.fastq.gz
  out=raw/SRR8365265_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365270/SRR8365270_1.fastq.gz
  out=raw/SRR8365270_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/000/SRR8365270/SRR8365270_2.fastq.gz
  out=raw/SRR8365270_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365272/SRR8365272_1.fastq.gz
  out=raw/SRR8365272_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/002/SRR8365272/SRR8365272_2.fastq.gz
  out=raw/SRR8365272_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365266/SRR8365266_1.fastq.gz
  out=raw/SRR8365266_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/006/SRR8365266/SRR8365266_2.fastq.gz
  out=raw/SRR8365266_HIP1_female_R2.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365268/SRR8365268_1.fastq.gz
  out=raw/SRR8365268_HIP1_female_R1.fastq.gz
ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/008/SRR8365268/SRR8365268_2.fastq.gz
  out=raw/SRR8365268_HIP1_female_R2.fastq.gz

The above script downloads all the data to the raw/ folder. Each file name encodes the donor ID and the donor's sex. For example, for the first file from the above list (SRR8365468_HIP2_male_R1.fastq.gz):

  • SRR8365468 - sample id
  • HIP2 - patient id
  • male - patient sex

The project contains 147 FASTQ file pairs. The structure of sequences is shown in the picture below.

The structure of the library is shown in the picture below.

This data is obtained using a multiplex mix of V and J primers that can be found here. In the original paper where the BIOMED2 primer set was published, the authors state that some primers tolerate up to 2 mismatches. This means that these mismatches will end up in the final sequences and may introduce bias into the alignment process.

Briefly: 1. There are 16 primers (complementary to VH, VK and VL segments). These primers' sequences are located in the FR1 region and the longest primer consists of 32 nucleotides. R1 is 250 bp long and, depending on the particular V gene, it covers the whole V segment up to the beginning of the CDR3 region. 2. There are 5 primers (complementary to JH, JK and JL segments). These primers are located in FR4 and the longest consists of 26 nucleotides. R2 starts from one of these primers and ends in FR2.

Below you can see J primers aligned with IGHJ / IGKJ / IGLJ gene sequences.

# IGHJ

         <---------FR4----------------->
NNNNCTTACCTGAGGAGACGGTGACC                                                <- Jh-primer
         CTGAGGAGACGGTGACCAGGGTGCCCTGGCCCCAGTGCTGGAAGTATTCAGC             <- IGHJ1
         CTGAGGAGACAGTGACCAGGGTGCCACGGCCCCAGAGATCGAAGTACCAGTAG            <- IGHJ2
         CTGAAGAGACGGTGACCATTGTCCCTTGGCCCCAGACATCAAAAGCATCA               <- IGHJ3
         CTGAGGAGACGGTGACCAGGGTTCCTTGGCCCCAGTAGTCAAAGTAGT                 <- IGHJ4
         CTGAGGAGACGGTGACCAGGGTTCCTTGGCCCCAGGAGTCGAACCAGTTGT              <- IGHJ5
            AGGAGACGGTGACCGTGGTCCCTTGCCCCCAGACGTCCATACCGTAGTAGTAGTAGTAAT  <- IGHJ6

# IGK
     <------------ FR4---------->
  NNNNTTTGATaTCCAccTTGGTCCC                   <- Jk1-primer 
     GTTTGATTTCCACCTTGGTCCCTTGGCCGAACGTCCAC   <- IGKJ1
     GTTTGATCTCCAGCTTGGTCCCCTGGCCAAAAGTGTACA  <- IGKJ2
     GTTTGATATCCACTTTGGTCCCAGGGCCGAAAGTGAAT   <- IGKJ3
     GTTTGATCTCCACCTTGGTCCCTCCGCCGAAAGTGAGC   <- IGKJ4

  NNNNTTTAATCTCCAGTCGTGTCCC                   <- Jk2-primer
     GTTTAATCTCCAGTCGTGTCCCTTGGCCGAAGGTGATC   <- IGKJ5

# IGL
  <------------ FR4---------->
NNNNAGGACGGTGACCTTGGTCCC                  <- Jl1-primer
NNNNAGGACGGTCAGCTgGGTCCC                  <- Jl2-primer
  CTAGGACGGTGACCTTGGTCCCAGTTCCGAAGACATAA  <- IGLJ1
  CTAGGACGGTCAGCTTGGTCCCTCCGCCGAATACCACA  <- IGLJ2
  CTAGGACGGTCAGCTTGGTCCCTCCGCCGAATACCACA  <- IGLJ3*01
  CTAGGACGGTCAGCTTGGTCCCTCCGCCGAACACCCAA  <- IGLJ3*02
  CTAGGACGGTCAGCTCGGTCCCCTCACCAAACACCCAG  <- IGLJ5
  CGAGGACGGTCACCTTGGTGCCACTGCCGAACACATTA  <- IGLJ6
  CTAAAATGATCAGCTGGGTTCCTCCACCAAATACAAAA  <- IGLJ4
  CGAGGACGGTCAGCTGGGTGCCTCCTCCGAACACAGCA  <- IGLJ7
Importantly, in all cases there are enough nucleotides not covered by the primer for correct J gene identification. Thus, we can use MiXCR to trim the primer sequences.

Upstream analysis

MiXCR has a dedicated preset for this protocol, thus analysing the data is as easy as:

# Run the whole BIOMED2 pipeline (align, assemble, export) for one sample from
# the download list. The last argument is the output prefix: every result file
# for this sample is written as results/SRR8365468_HIP2_male.*
mixcr analyze biomed2-human-bcr-cdr3 \
    raw/SRR8365468_HIP2_male_R1.fastq.gz \
    raw/SRR8365468_HIP2_male_R2.fastq.gz \
    results/SRR8365468_HIP2_male

One might also use GNU Parallel to process all samples at once:

#!/usr/bin/env bash

# Process every paired-end sample in raw/ with the BIOMED2 preset, two samples
# at a time. The R1 files are listed from the relative raw/ directory created
# by download.sh (not from /raw/ at the filesystem root).

mkdir -p results

# For each R1 file GNU Parallel builds three arguments:
#   {}                                 the R1 file itself
#   {=s:_R1:_R2:=}                     the matching R2 file
#   {=s:.*/:results/:;s:_R1.*::=}      output prefix: directory replaced by
#                                      results/, "_R1..." suffix removed
ls raw/*_R1* |
    parallel -j 2 --line-buffer \
    "mixcr analyze biomed2-human-bcr-cdr3 \
    {} \
    {=s:_R1:_R2:=} \
    {=s:.*/:results/:;s:_R1.*::=}"

Under the hood pipeline:

Under the hood mixcr analyze biomed2-human-bcr-cdr3 executes the following pipeline:

align

Alignment of raw sequencing reads against reference database of V-, D-, J- and C- gene segments.

# Align the raw paired-end reads of one sample against the human (hsa)
# V/D/J/C reference segments using the generic amplicon preset.
# --tag-pattern cuts the first 32 nt of R1 and the first 26 nt of R2 (the
# longest forward and reverse primer lengths), so both floating bounds are
# disabled and V (left) / J (right) ends are aligned globally.
# Text and JSON reports plus the binary .vdjca alignments go to results/.
mixcr align \
    --species hsa \
    -p generic-amplicon \
    --tag-pattern "^N{32}(R1:*)\^N{26}(R2:*)" \
    -OvParameters.geneFeatureToAlign="VTranscriptWithout5UTRWithP" \
    -OvParameters.parameters.floatingLeftBound=false \
    -OjParameters.parameters.floatingRightBound=false \
    --report results/SRR8365468_HIP2_male.report \
    --json-report results/SRR8365468_HIP2_male.json \
     raw/SRR8365468_HIP2_male_R1.fastq.gz \
     raw/SRR8365468_HIP2_male_R2.fastq.gz \
     results/SRR8365468_HIP2_male.vdjca

Option --report is specified here explicitly.

--species hsa
determines the organism species.
-p generic-amplicon
a preset of MiXCR parameters for amplicon data.
--tag-pattern "^N{32}(R1:*)\^N{26}(R2:*)"
With this pattern we trim 32 nucleotides (the length of the longest forward primer) from the beginning of the R1 file and 26 nucleotides (the length of the longest reverse primer) from the beginning of the R2 file.
-OvParameters.geneFeatureToAlign="VTranscriptWithout5UTRWithP"
Sets a V gene feature to align. Check gene features for more info.
-OvParameters.parameters.floatingLeftBound=false
Results in a global alignment algorithm for V gene left bound. We can use global alignment here, because primer sequences were trimmed with --tag-pattern.
-OjParameters.parameters.floatingRightBound=false
Results in a global alignment algorithm for J gene right bound. We can use global alignment here, because primer sequences were trimmed with --tag-pattern.

assemble

Assembles alignments into clonotypes and applies several layers of error correction (e.g. quality-aware correction of sequencing errors, clustering to correct for PCR errors). Check mixcr assemble for more information. By default clones will be assembled by the CDR3 gene feature.

# Assemble the alignments (.vdjca) into clonotypes (.clns) using CDR3 as the
# assembling feature. Clones that share a CDR3 but differ in V or J gene are
# kept separate (separateByV / separateByJ).
mixcr assemble \
    -OassemblingFeatures="CDR3" \
    -OseparateByJ=true \
    -OseparateByV=true \
    --report results/SRR8365468_HIP2_male.report \
    --json-report results/SRR8365468_HIP2_male.json \
    results/SRR8365468_HIP2_male.vdjca \
    results/SRR8365468_HIP2_male.clns
-OseparateByV=true
Split clones with the same CDR3 sequence and different V-genes
-OseparateByJ=true
Split clones with the same CDR3 sequence and different J-genes

export

Exports clonotypes from .clns file into human-readable tables.

# Export clonotypes from the .clns file into one tab-delimited table per chain
# (-c selects the chain).

# Heavy chain (IGH)
mixcr exportClones \
    -c IGH \
    results/SRR8365468_HIP2_male.clns \
    results/SRR8365468_HIP2_male.clonotypes.IGH.tsv

# Lambda light chain (IGL)
mixcr exportClones \
    -c IGL \
    results/SRR8365468_HIP2_male.clns \
    results/SRR8365468_HIP2_male.clonotypes.IGL.tsv

# Kappa light chain (IGK)
mixcr exportClones \
    -c IGK \
    results/SRR8365468_HIP2_male.clns \
    results/SRR8365468_HIP2_male.clonotypes.IGK.tsv

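No export preset is passed in the commands above, so MiXCR's default set of export columns is used; in MiXCR versions that support export presets, -p full selects the full preset of common export columns explicitly. Check mixcr export for more information.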

-c <chain>
defines a specific chain to be exported.

After execution is complete the following list of files is generated for every sample:

# human-readable reports 
SRR8365468_HIP2_male.report
# raw alignments (highly compressed binary file)
SRR8365468_HIP2_male.vdjca
# IGH, IGK and IGL CDR3 clonotypes (highly compressed binary file)
SRR8365468_HIP2_male.clns
# IGH, IGK and IGL CDR3 clonotypes exported in tab-delimited txt
SRR8365468_HIP2_male.clonotypes.IGH.tsv
SRR8365468_HIP2_male.clonotypes.IGL.tsv
SRR8365468_HIP2_male.clonotypes.IGK.tsv

While the .clns file holds all data and is used for downstream analysis with mixcr postanalysis, the output .tsv clonotype table will contain exhaustive information about each clonotype as well:

See first 100 records from clonotype table SRR8365468_HIP2_male:
cloneId cloneCount cloneFraction targetSequences targetQualities allVHitsWithScore allDHitsWithScore allJHitsWithScore allCHitsWithScore allVAlignments allDAlignments allJAlignments allCAlignments nSeqFR1 minQualFR1 nSeqCDR1 minQualCDR1 nSeqFR2 minQualFR2 nSeqCDR2 minQualCDR2 nSeqFR3 minQualFR3 nSeqCDR3 minQualCDR3 nSeqFR4 minQualFR4 aaSeqFR1 aaSeqCDR1 aaSeqFR2 aaSeqCDR2 aaSeqFR3 aaSeqCDR3 aaSeqFR4 refPoints
0 2 0.0952381 AGCAGTGACGTTGATACTTATAACTATGTCTCCTGGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTCATTATTACTGCTGCTCACATTCGACCGGCAGCACTCGTTATGTCTTCGGAACT IIIIIIIIGIGGIGGIGGIGGIIGIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIIIIAGIGGGGGGGIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGGIIIAG IGLV2-14*00(569) nan IGLJ1*00(257.5) nan 75 112 317 0 37 SG88ASG90ASG91C 283.0 nan 20 36 58 128 144 160.0 nan nan nan AGCAGTGACGTTGATACTTATAACTAT 38 nan nan nan nan nan nan
1 2 0.0952381 AGCAATGACGTTGGGGTTTCTCTTCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAACTCATATACACTCGGCAGCACCCCGGTTTGTGTCTTCGGAACT IIIIIIIIGGIIGIIIIIIIIIIIIIIIIIIIIIGIIIIIIGIIIIIIGIIIIIIIIIIGIGGGIIIIIIIIGGGIGIIIIIGIIIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIGGIIIIIIIIIGIIIIIIIGIIGIIIIIIIIIIIAA IGLV2-14*00(737.5) nan IGLJ1*00(206) nan 75 89 317 0 14 SG79A 111.0 nan 20 36 58 137 153 SA22G 131.0 nan nan nan nan nan nan nan nan nan nan nan
2 1 0.047619 GGTGGCTCCATCAGTAGTTACTACTGGAGCTGGATCCGGCAGCCCCCAGGGAAGGGACTGGAGTGGATTGGGTATATCTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCTGCGGACACGGCCGTGTATTACTGTGCGAGAGGAGTAATACCGCACCCGCTCGACTACTGGGGCCAGGGA IIIIIIIIIIIGIIIIIGIIIIIGIGGIIIIGIGIIIIGIIEIGIIIGEGGGGIIGIGIGGG<GGGGGG<GIGIIIIGIAGGGIIIIGGGGGGGGGIGGGGGGGGIGIGGGG<GGGIGGIIGGIGIGGGAGGGGGGGGGGGGGGIGIIGIGGGGAAGAAAGGIIIIGIGGIGIGGGGGGGGAGAGIIIGGGGGGGIGIIIIIIIIIGIIGIIIIIGGIIIIIIGGIIIGGGGGGGGIGGGGGGIIGGIGIIIIII IGHV4-59*00(2230) IGHD3-1000(40),IGHD3-2200(40),IGHD1-7*00(36) IGHJ4*00(210) nan 75 292 313 0 217 2170.0 23 31 93 218 226 40.0;23 31 93 218 226 40.0;8 18 51 218 228 ST11A 36.0
3 1 0.047619 GGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGACACCCGTATTACGATTTTTGGAGTGGACTCCAACGGTATGGACGTCTGGGGCCAAGGG GGGGGGGIGGIIIGIIAGGIIIIGGIIGIIIIGIIIIIIIIIIIIIIGGIIIIIIIIIIGAGGEIIIGGGEIIIIGIIIIIIIIGGIIGIIIGGGIIGIIIIGGGIGIIIGGG:GGIIGGGIGIIIGGGIIGIGGGGGGGIIIIIII<GAGGIIIGGIIGGGGGGGGIGIIGIGGIGGGGGGGGIGGIGIGGGGGGGGIIGGAGGIAAAG<<<<<<GGAAAGGAGGIGIIGGGGIIGIIGGGIIGGIIIGIIGIIIGIGGGGIGGGAGGGGGIGGGGIIIGI IGHV4-31*00(2300) IGHD3-3*00(110) IGHJ6*00(312) nan 75 299 319 0 224 2240.0 30 52 93 228 250 110.0 29 61 83 250 282 SA32CST34A 262.0 nan nan nan GGTGGCTCCATCAGCAGTGGTGGTTACTAC 32
4 1 0.047619 GGTGGCTCCATCAGCAGTAGTAACTGGTGGAGTTGGGTCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGGAAATCTATCATAGTGGGAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACAAGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGTTAGTGGGAGCTACTACTACGGTATGGACGTCTGGGGCCAAGGG IIIIIIIIIIIIIIIGIIIIIIIIIIGGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGEGAAGA<<<<:<<GGIIGIGGGGIGIGIIG<<<GEGGGIIIIIGGGGGGGIIGGGGGGIIIGGGGGIGA<GGGIIIIGGGGIIIIIIIIGGIGGAGGGGGGIIIGIIGGGGGGGIIIIGGGIIGGGGGGGGAAAGG<AGA<<<<<<G<GGIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIIGIIIGGGGGIIII IGHV4-4*00(2260) IGHD1-26*00(45) IGHJ6*00(351) nan 75 295 316 0 220 2200.0 24 33 60 221 230 45.0 27 61 83 230 264 340.0 nan nan nan GGTGGCTCCATCAGCAGTAGTAACTGG 38
5 1 0.047619 GGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGGAAATCAATCATAGTGGAAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTGTATTACTGTGCGAGAGGCCCCTATTACGATTTTTGGAGTGGTTATCGAAACCTCTACTACTACTACGGTATGGACGTCTGGGGCCAAGGG GGIGIIIGGGGGGGGGIIIGIIIIIIIIIIIIIIIIIIIIIGIIIIIIGIIIIIIIIIGGI<<GGGGIIIIIIIGGIIIIIIIIIGGGGGGIIGGGGGGIIIGGGIGIIIIIGGGIGGGGGIGGGGIGGGIGIIIIIIGGIIIIIGIIGGIIIGG:GIIIIGIGIAAAAGGIGIGGGIGGGGGIIIIIGGIIGIGGGIIIIIIIIIIIIIIIIIIIGG<<<<<GGGG<<<GGGGIIIIIIIIIIIIIIIIIGIIIIIIIIGGGGGGIGIIIIIIIIGGGIGIIGGGGIIGI IGHV4-34*00(2261) IGHD3-3*00(115) IGHJ6*00(410) nan 75 298 313 0 223 ST295C 2201.0 33 56 93 223 246 115.0 24 61 83 254 291 370.0 nan nan nan GGTGGGTCCTTCAGTGGTTACTAC 38
6 1 0.047619 GGTGGGTCCTTCAGTGGTTACATCTGGACCTGGATCCGCCAGACCCCAGGAAAGGGGCTGCAGTAGATTGGACAAATCAATCATAGTGGAAGCGCCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTACACACGTCCAATAGTCAGTTCTCCCTGGAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTGTATTACTGTGCGAGTCCCAAGATAGCATTCTACAACTGGTTCGACCCCTGGGGCCAGGGA GIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIGGEIIIIIIIIIIGIIIIIIIGGGG<GGGGIGGIIIIGGIIIIIIIIIIIIGAGIIIIIGGGIIIGIIIIIIGIGIGIGGGGIGGGGGGGGGGGIGIIIIIGG:GGIIIIGGIIIGIGIIIIIIGGGIIIIIIIIIIIGGGGIIIIIIIIIG<GGGG<<GAG<GAIIIIGIIGIIIIIIIIIIIIIIIIIGGGIGGIIIIIGGGGGGGIGIIIIGGIIIGI IGHV4-34*00(1775) IGHD7-2700(32),IGHD6-1300(30),IGHD6-25*00(30) IGHJ5*00(351) nan 75 290 313 0 215 ST96ASA97TSG103CSC117ASG125ASG135CSG139ASG146ASG147CSA168GSG213CSG224TSA226GSC227TSA240G 1715.0 0 12 33 215 227 SC4AST7A 32.0;25 31 63 222 228 30.0;22 28 54 222 228 30.0
7 1 0.047619 GGTGACTTCCTCGACGATAGTAGGTGGTGGAGTTGGGTCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGAGAGATCCATCATACGGGGATCACCAACTACATCCCGTCCCTCAAGAGTCGGGTCACCATGTCAGTGGACAAGAACAAGAACCAGGTCTCCCTAGAGGTGTATTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGTGGGGCCCACTATATTTGGAATGGGTGGGGCCAGGGA GGIIIIIIIGIIGIIGGIGGIGGGIIIIIIIIIIIGIGIIIIIIIIIIIIIIIIIIIIIGIIIIGGGGI<<<IIGIIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIIIIIGAGGGGIGIIGGGIGGIGIIIIIGGGIGIIGGIIGGIIIIIGGIIGGGGGIIIIIIIIGGIIIIIGIIIIIIGGIIGIIIIGIGIGGGIIGIIGGGGGGIIIIIGGIGIIIIIIIIIIIIIGIIIGIIIIIIIIIIGGIIIGG IGHV4-4*00(1399) IGHD5-1800(50),IGHD1-2600(45),IGHD5-12*00(45) IGHJ400(170),IGHJ500(170) nan 75 293 316 0 218 SG79ASC82TSA84CSA87GSG88ASA90GSG91ASA97GSC98GSG149ASA152GST156CSG163CST164GSG169TSA181TSA200GSA209GSA215GST222ASC223AST234GSG242ASA243GSC246GSA249TSG250ASC251T 1368.0 10 20 69 224 234 50.0;9 18 60 223 232 45.0;10 19 69 224 233 45.0
8 1 0.047619 GGATTCACCTTCAGTAGCTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTAATAGTGATGGGAGTATCACAAGCTACGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAGCACGCTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCCCCATACGGTAGTAACCACATTTTCGACTACTGGGGCCAGGGA IIGIIIIIIIIIGIIIIIIIIIIIIIGIIIIIIIIIIIIIGIIIIIIIIIGIIIIII<:GIGGGGIGIIIIIIIIIIGGIIIIGGIIIIIIGGGGGGGIIGGIGGGIGIIIGGGGGGGGGIIIIIIIIIGIIIIGIIIIIIGIIIIGIIIGIIIGGGIGIGGGGGGIGGIGGGIIIIIGGGGIIGIGIIGGGGGGGGGAGGIIIIIIIIIIIIIGIIIGGIIIIIGIIIIIIIGGGIGIGGIIGIGIIIIGGAIIIII IGHV3-74*00(2152) IGHD4-2300(46),IGHD1-1400(41),IGHD3-22*00(41) IGHJ4*00(221) nan 75 290 316 0 215 SG169TSA229G 2092.0 23 35 57 219 231 SG29A 46.0;12 26 51 218 232 DC15I21G 41.0;3 14 93 223 234 SA6G 41.0
9 1 0.047619 AGTTCCAACATCGGAAGCAATACTGTAAACTGGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGTCTCCAGTCTGAGGATGAAGCTGATTATTACTGTGCAGTCTGGGATGACAAGCTGCGTGGTCGGATAATCGGCGGA IIIIIIGIGG<GAG<G<<G<GGGIIGIIIIIIIIIIIGGGIIIIGGGIGIIIGIIIIIGG<<<<<GGIIIGGGGGGGGIIIIIIIIIIGIGIIIIIIIIIIIIGIIIIIGGIGGIGGGGGIIIIIIIIIIIGIIIGIIGGG IGLV1-44*00(627) nan IGLJ200(223),IGLJ300(222) nan 75 109 316 0 34 SC77TST92C 282.0 nan 25 36 58 130 141 ST27A 81.0;28 36 58 133 141 80.0 nan nan nan AGTTCCAACATCGGAAGCAATACT 27
10 1 0.047619 AGCAGTGATATTGGGGGTTATAACCATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATCATTTATGAGCACCCGTTATGTCGTCGGAACA GIIIIGGIG<<GAGGGIIIGGIIGGGGGIIIIIIIIIIGIIIIIIGIIIIIIIIIIIIGGIIIIIIGIIIGIIGIIIIIGGGGGGGGGGGIGIIIIIIIIIGG IGLV2-2300(1357),IGLV2-1400(1318) nan IGLJ1*00(264) nan 75 156 317 0 81 SG84ASA90GST100ASG119ASG146C 665.0;75 155 317 0 80 SC83TSG84AST89GST99CSG119ASG146C 626.0 nan 20 36 58 87 103 ST27GST35A 102.0 nan nan nan AGCAGTGATATTGGGGGTTATAACCAT 27
11 1 0.047619 AGCAGTGATATTGGGAGGTCTGGCAACACGTCCTCCCTGACAATGTTTGGGCTCCAGGGTGAGGACGAGGCTGATAATTACTGCTACTCATATGCGTTTAATTGGACGTCTTCTAAACT IGGIIIGGIGIGIGGGIGIIGIIIIIGIIIIIIIIGGIGGGGGIIIIIIIIIIGIIIIIGGIGIIIIIIIIIIIGGGGIIIIIIGIIIIIIGIIGIIIIIIIIIIIIII7AAGGAG<<7 IGLV2-23*00(167) nan IGLJ1*00(224) nan 75 92 317 0 17 SG84A 141.0 nan 24 36 58 107 119 SG30TSG31A 62.0 nan nan nan nan nan nan nan nan nan nan nan
12 1 0.047619 AGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCACCTCATATACAAGCAGTGCGAATTATGTCTTCGGAACT GIIIIIIGGGGGIIIIIGIGIGIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIIIIIIIIG<<GGGIIIIIGIIIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGGGGIIIIIIIIIIIIIIGIIIIIGIAG IGLV2-14*00(808) nan IGLJ1*00(370) nan 75 112 317 0 37 370.0 nan 18 36 58 123 141 180.0 nan nan nan AGCAGTGACGTTGGTGGTTATAACTAT 38 nan nan nan nan nan nan
13 1 0.047619 AGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCTCTATGTCTTCGGAACT <<G<<<GGGAGGGGIGIIAGAGIEIIIIIIIIIIGGIIGGIIIIIIIGGIGGIIIGIGIGGGGGGGAAGGGGGIIGIGGGGGAAG:GIIGGIIIGIIIGIIGGGGGGGIIIIGGGIGIIGGGGGIIIIIGGGIIGGIIIIIIIIIIIIIIIIIIIIIIIIIIIGGGG<<<<GIIGIIIIGIIIGIIIIIIIIIGIIIGIIIIIIIIIIGIIIIIIIGGIGIIIIIIIIIIIIIIIIG IGLV2-14*00(2850) nan IGLJ1*00(190) nan 75 297 317 0 222 2220.0 nan 21 36 58 222 237 150.0 nan nan nan AGCAGTGACGTTGGTGGTTATAACTAT 27 GTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTAT 32 GATGTCAGT 25 AATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTAC 27
14 1 0.047619 AGCAGTGACGTTGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCGACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAACTCATATACAAGCAGCAGCACTTATGTCTTCGGAACT GGIIIIIIIIGGGGGIIGIIIIIIIIIIIIIIIIIIIIIGGGGGIIIIIIIIGG:IIIIIGGIGGGGGGGGIGGGGAIGGGGGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGIIGGGGGIGIIIIIIGAA IGLV2-14*00(845) nan IGLJ1*00(281) nan 75 294 317 0 132 DT89DG90DG91DT92DT93DA94DT95DA96DA97DC98DT99DA100DT101DG102DT103DC104DT105DC106DC107DT108DG109DG110DT111DA112DC113DC114DA115DA116DC117DA118DG119DC120DA121DC122DC123DC124DA125DG126DG127DC128DA129DA130DA131DG132DC133DC134DC135DC136DC137DA138DA139DA140DC141DT142DC143DA144DT145DG146DA147DT148DT149DT150DA151DT152DG153DA154DT155DG156DT157DC158DA159DG160DT161DA162DA163DT164DC165DG166DG167DC168DC169DC170DT171DC172DA173DG174DG175SA210GSG271A 254.0 nan 20 36 58 131 147 160.0 nan nan nan AGCAGTGACGTTGG 38 nan 0 nan 0 GGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCGACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTAC 25
15 1 0.047619 AGCAGTGACGTTGGGATCTCGGATCGCTTCTCTGGTTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCGTTTACAACCAGCGACACTTATGTCTTCGGAAGT IGIIGGGIGIGIGGGGIGGGIIIIIIGGIIIIIIIIGGGGIGGGIIIIIIIIIIIIIIIIG<GGGGGGIIGGGGGIGIIGIIGGGIIGIIIGIIIIIGIGIIIIIIIIIIIIIIIGIIGIIIIIIIIIGIGIIIGG:IIIIIIIGIG IGLV2-14*00(721) nan IGLJ1*00(271) nan 75 89 317 0 14 140.0 nan 20 36 58 131 147 SC34G 131.0 nan nan nan nan nan nan nan nan nan nan nan
16 1 0.047619 AGCAGTGACATTGGTCTGGCAACACGGCCTCCCTGACCATCTCTGGACTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTGATGTCTTCGGAACT IIIIIIIIGGGGGGGGIIIIGIIIIIIIIIIIIIIGIIIIIIIGGGIIIIGGGGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGGIIIIIIIGG<G IGLV2-14*00(416) nan IGLJ1*00(302) nan 75 90 317 0 15 SG84A 121.0 nan 22 36 58 105 119 140.0 nan nan nan nan nan nan nan nan nan nan nan
17 1 0.047619 AGCAGTGACATTGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCACCTCATGGACAACCAGCACCACTATGATATTCGGCGGA II<IIGIIGG<<<GGGGIIIIIIIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIGIGGGGGIGIGIGGGGGIIIIIIGIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIIIIGGGGIIGIIIIIIIIIIGGGG<IIGGGIA IGLV2-14*00(859) nan IGLJ2*00(164) nan 75 294 317 0 132 SG84ADT89DG90DG91DT92DT93DA94DT95DA96DA97DC98DT99DA100DT101DG102DT103DC104DT105DC106DC107DT108DG109DG110DT111DA112DC113DC114DA115DA116DC117DA118DG119DC120DA121DC122DC123DC124DA125DG126DG127DC128DA129DA130DA131DG132DC133DC134DC135DC136DC137DA138DA139DA140DC141DT142DC143DA144DT145DG146DA147DT148DT149DT150DA151DT152DG153DA154DT155DG156DT157DC158DA159DG160DT161DA162DA163DT164DC165DG166DG167DC168DC169DC170DT171DC172DA173DG174DG175SG271CSA277GST278GSG283CSG289C 138.0 nan 22 36 58 133 147 SG24A 111.0 nan nan nan AGCAGTGACATTGG 27 nan 0 nan 0 GGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTAC 38
18 1 0.047619 AGCAGAGATATTGGGACTTCTAACCTTGTCTCCTGGTACCAACAGTATCCAGGTATTTCTACTTCCCTTTTTGTCTTCGGAACA IIIIGIIIG<GGGGG<GGGIIIIGIIIIIIIIIIIIIGIIIGIIIIIIIIIIIIIIIIIIIGIIIIGIGGIIIIIIIIIIIGG< IGLV2-23*00(1048) nan IGLJ1*00(264) nan 75 128 317 0 53 ST80ASG84ASG91CSA94CSC120TSC122T 356.0 nan 20 36 58 68 84 SA22TST35A 102.0 nan nan nan AGCAGAGATATTGGGACTTCTAACCTT 27 nan nan nan nan nan nan

Quality control

Now that all files are processed, let's perform quality control. The first thing to check is the alignment rate. That can be easily done using the mixcr exportQc align function.

mixcr exportQc align results/*.clns figs/alignQc.pdf

alignQc.svg

Here we can see the percentage of successfully aligned reads for every sample, as well as the percentage of reads that failed to align for various reasons. Some samples have a good alignment rate (more than 90%), but many samples had issues that led to an alignment rate as low as 10-20%. According to the plot, the two major reasons for that outcome are:

  • the lack of TCR/IG sequence
  • absence of J hits

Let's take one of the low-quality samples (e.g. SRR8365459_HIP1_female) and examine it. To look at the reads' alignments for that sample, we will first run the mixcr align command on it once again, but this time we will specify additional options (-OallowPartialAlignments=true -OallowNoCDR3PartAlignments=true) that preserve partially aligned reads (e.g. reads that lack a J gene) and reads that lack a CDR3 sequence.

mkdir -p debug
mixcr align -f \
    --species hsa \
    -p kAligner2_4.0 \
    --tag-pattern "^N{32}(R1:*)\^N{26}(R2:*)" \
    -OvParameters.geneFeatureToAlign="VTranscriptWithout5UTRWithP" \
    -OvParameters.parameters.floatingLeftBound=false \
    -OjParameters.parameters.floatingRightBound=false \
    -OallowNoCDR3PartAlignments=true \
    -OallowPartialAlignments=true \
    --not-aligned-R1 debug/SRR8365459_HIP1_female_notAligned_R1.fastq \
    --not-aligned-R2 debug/SRR8365459_HIP1_female_notAligned_R2.fastq \
    --report debug/SRR8365459_HIP1_female_debug.report \
     raw/SRR8365459_HIP1_female_R1.fastq.gz raw/SRR8365459_HIP1_female_R2.fastq.gz \
     debug/SRR8365459_HIP1_female_debug.vdjca

Now that we have a new .vdjca file, let's visualize how reads cover the FR and CDR regions for that sample.

mixcr exportQc coverage \
      debug/SRR8365459_HIP1_female_debug.vdjca \
      figs/SRR8365459_HIP1_female_debug.vdjca.coverage.pdf

This will generate three .pdf formatted plots: R1 alignment, R2 alignment and alignment of overlapped reads. These plots can tell us the percentage of reads that cover each region at a certain position. Briefly, for this sample, only those reads that overlap show a good coverage pattern.

SRR8365459_HIP_female_debug_coverageQc_R1.svg

SRR8365459_HIP_female_debug_coverageQc_R2.svg

SRR8365459_HIP_female_debug_coverageQc_Overlap.svg

Finally, we can look at raw alignments using mixcr exportAlignmentsPretty.

The command below will generate a human-readable .txt file with alignments. We use the parameter --skip 1000 to skip the first 1000 reads, as the first reads usually have bad quality, and --limit 100 to export only 100 alignments, as we usually don't need to examine every alignment to see the issue.

mixcr exportAlignmentsPretty \
    --skip 1000 \
    --limit 100 \
    debug/SRR8365459_HIP1_female_debug.vdjca \
    debug/SRR8365459_HIP1_female_debug.alignments.txt

Below you can see a few alignments from the generated file. The first one is an example of a well-aligned read.

>>> Read ids: 1840


                                     FR1><CDR1              CDR1><FR2                               
                _ L  R  L  S  C  A  A  S  G  F  T  L  S  D  Y  Y  M  S  W  I  R  Q  A  P  G  K      
    Quality     77767826778888888888888887788888888888878888888888888887888778888888888888887878    
    Target0   0 CCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTGAGTGACTACTACATGAGCTGGATCCGCCAGGCTCCAGGGAAGG 79   Score
IGHV3-11*00 107 cctgagactctcctgtgcagcctctggattcaccttCagtgactactacatgagctggatccgccaggctccagggaagg 186  2102

                                FR2><CDR2              CDR2><FR3                                    
                G  L  E  W  V  S  H  I  S  G  S  G  N  T  I  D  Y  A  D  S  V  K  G  R  F  T  I     
    Quality     88788887778888888888887788888888888888888888888888888888887888888888888888888888    
    Target0  80 GGCTGGAGTGGGTTTCACACATTAGTGGCAGTGGTAATACCATAGACTACGCAGACTCTGTGAAGGGCCGATTCACCATC 159  Score
IGHV3-11*00 187 ggctggagtgggtttcaTacattagtAgTagtggtaGtaccataTactacgcagactctgtgaagggccgattcaccatc 266  2102

                                                                                       FR3><CDR3    
                 S  R  D  N  A  K  N  S  L  Y  L  Q  M  N  S  L  R  D  D  D  T  A  V  F  Y  C  A    
    Quality     88888888888888888888888888888888888888888888888888888888888888888888888887888788    
    Target0 160 TCCAGGGACAACGCCAAGAACTCGCTCTATCTGCAAATGAACAGCCTGAGAGACGACGACACGGCCGTGTTTTATTGTGC 239  Score
IGHV3-11*00 267 tccagggacaacgccaagaactcActGtatctgcaaatgaacagcctgagagCcgaGgacacggccgtgtAttaCtgtgc 346  2102

                   V>  <D     D>   <J  CDR3><FR4                       FR4>             
                  R  G  R  Y  A  L  D  Y  W  G  Q  G  T  R  V  T  V  S  S _             
    Quality     88878788888888788888888888888888888888888888888888888888888888877776    
    Target0 240 GAGAGGCCGTTATGCCCTAGATTATTGGGGCCAGGGAACCCGGGTCACCGTCTCCTCAGGTAAGCCCC 307  Score
IGHV3-11*00 347 gagag                                                                351  2102
IGHD3-16*00  65        cgttatAcc                                                     73   31
 IGHD2-2*00  56           tatgcc                                                     61   30
IGHD3-10*00  61        cgttat                                                        66   30
   IGHJ4*00  28                    gaCtaCtggggccagggaacccTggtcaccgtctcctcag          67   313
   IGHJ5*00  37                          tggggccagggaacccTggtcaccgtctcctcag          70   311

Now, the following alignment is a problematic one, and for this particular sample the majority of alignments look similar. We can see that one of the two reads (Target0) does not align to any reference sequence and has a lot of low-quality nucleotides.

>>> Read ids: 1853



Quality   25762677888767572572527252276555757625555572625275777255225225525252725222252552   
Target0 0 CGGCATTCCTGCTGAAACGAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTCACATTAAA 79  Score


Quality    57222255256566257772557225555565256777672262255775556255567762277262262677727762    
Target0 80 AAAAAAAACCAGCAGTGATGTTGGCAGTTATGACTATGTCTCCTGGTACCAACAGCACCCAGGCACAGTCTCCAAACCCA 159  Score


Quality     42222547622525426677662772660000000000000000000000000000000000000000000000000000    
Target0 160 TGACGTACAATGACAATACTCAGCCCTCAGGGGTCCCTGATCGATTCTCTGGCTTCAAGTCTGGCAATACGCCCTCCATG 239  Score


Quality     00000000000    
Target0 240 ACCATCCTTAG 250  Score

                                                                                        CDR2><F    
                            CDR1><FR2                                           FR2><CDR2          
                F  G  S  Y  V  Y  V  S  W  Y  Q  Q  H  S  S  T  V  P  K  P  M  I  D  N  V  N  T    
   Quality     00000000000000000000000000000000000000000006247675667452527762544226662675276772    
   Target1   0 TTTGGGAGTTATGTCTATGTCTCCTGGTACCAACAGCACTCAAGCACAGTCCCCAAACCCATGATCGACAATGTCAATAC 79   Score
IGLV2-5*00 141  ttgggagttatgActatgtctcctggtaccaacagcacCcaGgcacagtccccaaacccatgatcTacaatgtcaatac 219  1445

               R3                                                                                  

                 Q  P  S  G  V  P  D  R  F  S  G  S  K  S  G  N  T  A  S  M  T  I  S  G  L  *      
   Quality     22262275272725267265252267676726267252555266222725267626677675277777757552652222    
   Target1  80 TCAGCCCTCAGGGGTCCCTGATCGTTTCTCTGGCTCCAAGTCTGGCAATACGGCCTCCATGACCATCTCTGGACTCTAGG 159  Score
IGLV2-5*00 220 tcagccctcaggggtccctgatcgtttctctggctccaagtctggcaatacggcctccatgaccatctctggactcCagg 299  1445



            V  Y  K  Q  K  T  A  Y  E  C  Q  S  R  S  R  H  S  C  *  T  A  L  P  I  S  A  F     
Quality     55222252675222525222752252222225622525266672252275226775565552222577665575255625    
Target1 160 TTTACAAGCAGAAGACGGCATACGAGTGCCAGTCCCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCTCGGCATTC 239  Score



             L  L  T _     
Quality     72552252777    
Target1 240 CTGCTGACCCG 250  Score

Another quality report we should investigate is the chain abundance plot.

mixcr exportQc chainUsage \
    results/*.clns \
    figs/chainUsage.pdf

chainUsage.svg

From that plot we can see another issue. According to the publication, the data was generated using a multiplex V and J primer protocol in such a way that every sample should have sequences for both heavy and light IG chains. But we see that most samples have only one of the chains present, and those samples that have both still show an unexpected distribution, whereas it should be about 50/50, as every cell has both chains.

Full-length clonotype assembly

The Biomed2 BCR protocol allows recovering a broader BCR sequence than just the CDR3 region. According to the protocol, forward primers are located in the FR1 region, thus we can safely use an assembling feature that starts from CDR1 and be sure that no primers will affect the original sequence. The reverse primers are located in the FR4 region very close to CDR3, thus there is not much left of FR4 to include in clone assembly.

Taking into account what is mentioned above, the longest possible assembling feature for this protocol is "{CDR1Begin:CDR3End}".

MiXCR has a specific preset to obtain full-length BCR clones with Biomed2 protocol:

mixcr analyze biomed2-human-rna-igh \
    raw/SRR8365468_HIP2_male_R1.fastq.gz \
    raw/SRR8365468_HIP2_male_R2.fastq.gz \
    results/SRR8365468_HIP2_male

The mixcr assemble step in this preset differs from the one above in the following manner:

mixcr assemble \
    -OassemblingFeatures="{CDR1Begin:CDR3End}" \
    -OseparateByJ=true \
    --report results/SRR8365468_HIP2_male.report \
    --json-report results/SRR8365468_HIP2_male.json \
    results/SRR8365468_HIP2_male.vdjca \
    results/SRR8365468_HIP2_male.clns
-OassemblingFeatures="{CDR1Begin:CDR3End}"
sets the assembling feature to the region which starts from CDR1Begin and ends at the end of CDR3.

Notice that we do not use -OseparateByV=true in this case, because the assembling feature already covers most of the V region; thus, even if clones have identical CDR3 sequences, they will still be separated.

Reports

Finally, MiXCR provides a very convenient way to look at the reports generated at each step. Every .vdjca, .clns and .clna file holds all the reports for every MiXCR function that has been applied to this sample. E.g. in our case the .clns file contains reports for mixcr align and mixcr assemble. To output this report use mixcr exportReports as shown below. Note that the --json parameter will output a JSON-formatted report.

mixcr exportReports \
    results/SRR8365468_HIP2_male.clns \
    figs/SRR8365468_HIP2_male.report.txt
mixcr exportReports \
    --json \
    results/SRR8365468_HIP2_male.clns \
    figs/SRR8365468_HIP2_male.report.json
Show report file
============== Align Report ==============
Input file(s): /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz,/raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz
Output file(s): results/SRR8365468_HIP2_male.vdjca
Version: ; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0
Command line arguments: --report results/SRR8365468_HIP2_male.align.report.txt --json-report results/SRR8365468_HIP2_male.align.report.json --preset local:biomed2-human-bcr-full-length +limitInput 100000 /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz results/SRR8365468_HIP2_male.vdjca
Analysis time: 0ns
Total sequencing reads: 100000
Successfully aligned reads: 760 (0.76%)
Paired-end alignment conflicts eliminated: 98 (0.1%)
Alignment failed, no hits (not TCR/IG?): 82357 (82.36%)
Alignment failed because of absence of V hits: 51 (0.05%)
Alignment failed because of absence of J hits: 16810 (16.81%)
No target with both V and J alignments: 22 (0.02%)
Overlapped: 25788 (25.79%)
Overlapped and aligned: 547 (0.55%)
Alignment-aided overlaps: 20 (3.66%)
Overlapped and not aligned: 25241 (25.24%)
No CDR3 parts alignments, percent of successfully aligned: 2 (0.26%)
Partial aligned reads, percent of successfully aligned: 9 (1.18%)
V gene chimeras: 9 (0.01%)
IGH chains: 44 (5.79%)
IGH non-functional: 1 (2.27%)
IGK chains: 496 (65.26%)
IGK non-functional: 15 (3.02%)
IGL chains: 220 (28.95%)
IGL non-functional: 11 (5%)
Realigned with forced non-floating bound: 148464 (148.46%)
Realigned with forced non-floating right bound in left read: 997 (1%)
Realigned with forced non-floating left bound in right read: 997 (1%)
============== Assemble Report ==============
Input file(s): results/SRR8365468_HIP2_male.vdjca
Output file(s): results/SRR8365468_HIP2_male.clns
Version: ; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0
Command line arguments: --report results/SRR8365468_HIP2_male.assemble.report.txt --json-report results/SRR8365468_HIP2_male.assemble.report.json results/SRR8365468_HIP2_male.vdjca results/SRR8365468_HIP2_male.clns
Analysis time: 0ns
Final clonotype count: 19
Average number of reads per clonotype: 1.11
Reads used in clonotypes, percent of total: 21 (0.02%)
Reads used in clonotypes before clustering, percent of total: 21 (0.02%)
Number of reads used as a core, percent of used: 21 (100%)
Mapped low quality reads, percent of used: 0 (0%)
Reads clustered in PCR error correction, percent of used: 0 (0%)
Reads pre-clustered due to the similar VJC-lists, percent of used: 0 (0%)
Reads dropped due to the lack of a clone sequence, percent of total: 610 (0.61%)
Reads dropped due to a too short clonal sequence, percent of total: 0 (0%)
Reads dropped due to low quality, percent of total: 0 (0%)
Reads dropped due to failed mapping, percent of total: 129 (0.13%)
Reads dropped with low quality clones, percent of total: 0 (0%)
Clonotypes eliminated by PCR error correction: 0
Clonotypes dropped as low quality: 0
Clonotypes pre-clustered due to the similar VJC-lists: 0
IGH chains: 7 (36.84%)
IGH non-functional: 0 (0%)
IGL chains: 12 (63.16%)
IGL non-functional: 0 (0%)
{
  "type": "alignerReport",
  "commandLine": "--report results/SRR8365468_HIP2_male.align.report.txt --json-report results/SRR8365468_HIP2_male.align.report.json --preset local:biomed2-human-bcr-full-length +limitInput 100000 /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz /raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz results/SRR8365468_HIP2_male.vdjca",
  "inputFiles": [
    "/raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R1.fastq.gz",
    "/raw/PRJNA511481-HIP/BIOMED2/SRR8365468_HIP2_male_R2.fastq.gz"
  ],
  "outputFiles": [
    "results/SRR8365468_HIP2_male.vdjca"
  ],
  "version": "; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0",
  "trimmingReport": null,
  "totalReadsProcessed": 100000,
  "aligned": 760,
  "notAligned": 99240,
  "notAlignedReasons": {
    "VAndJOnDifferentTargets": 22,
    "NoVHits": 51,
    "NoJHits": 16810,
    "LowTotalScore": 0,
    "NoBarcode": 0,
    "NoHits": 82357,
    "NoCDR3Parts": 0
  },
  "chimeras": 0,
  "overlapped": 25788,
  "alignmentAidedOverlaps": 20,
  "overlappedAligned": 547,
  "overlappedNotAligned": 25241,
  "pairedEndAlignmentConflicts": 98,
  "vChimeras": 9,
  "jChimeras": 0,
  "chainUsage": {
    "type": "chainUsage",
    "chimeras": 0,
    "total": 760,
    "chains": {
      "IGH": {
        "total": 44,
        "nonFunctional": 1,
        "isOOF": 0,
        "hasStops": 1
      },
      "IGK": {
        "total": 496,
        "nonFunctional": 15,
        "isOOF": 10,
        "hasStops": 5
      },
      "IGL": {
        "total": 220,
        "nonFunctional": 11,
        "isOOF": 7,
        "hasStops": 4
      }
    }
  },
  "realignedWithForcedNonFloatingBound": 148464,
  "realignedWithForcedNonFloatingRightBoundInLeftRead": 997,
  "realignedWithForcedNonFloatingLeftBoundInRightRead": 997,
  "noCDR3PartsAlignments": 2,
  "partialAlignments": 9,
  "tagReport": {
    "type": "tagReport"
  }
}
{
  "type": "assemblerReport",
  "commandLine": "--report results/SRR8365468_HIP2_male.assemble.report.txt --json-report results/SRR8365468_HIP2_male.assemble.report.json results/SRR8365468_HIP2_male.vdjca results/SRR8365468_HIP2_male.clns",
  "inputFiles": [
    "results/SRR8365468_HIP2_male.vdjca"
  ],
  "outputFiles": [
    "results/SRR8365468_HIP2_male.clns"
  ],
  "version": "; built=Mon Sep 26 10:55:18 CEST 2022; rev=8c998df1ab; lib=repseqio.v2.0",
  "preCloneAssemblerReport": null,
  "totalReadsProcessed": 100000,
  "initialClonesCreated": 19,
  "readsDroppedNoTargetSequence": 610,
  "readsDroppedTooShortClonalSequence": 0,
  "readsDroppedLowQuality": 0,
  "coreReads": 21,
  "readsDroppedFailedMapping": 129,
  "lowQualityRescued": 0,
  "clonesClustered": 0,
  "readsClustered": 0,
  "clones": 19,
  "clonesDroppedAsLowQuality": 0,
  "clonesPreClustered": 0,
  "readsPreClustered": 0,
  "readsInClones": 21,
  "readsInClonesBeforeClustering": 21,
  "readsDroppedWithLowQualityClones": 0,
  "clonalChainUsage": {
    "type": "chainUsage",
    "chimeras": 0,
    "total": 19,
    "chains": {
      "IGH": {
        "total": 7,
        "nonFunctional": 0,
        "isOOF": 0,
        "hasStops": 0
      },
      "IGL": {
        "total": 12,
        "nonFunctional": 0,
        "isOOF": 0,
        "hasStops": 0
      }
    }
  }
}