Merge pull request #63 from jonathanBieler/sam_to_bam

Adds conversion capability for SAM to BAM.
BioJulia · Mar 23, 2024 · 2b15ada · 2b15ada
2 parents 629b1f3 + b14b7c2
commit 2b15ada
Show file tree

Hide file tree

Showing 4 changed files with 338 additions and 0 deletions.
diff --git a/src/XAM.jl b/src/XAM.jl
@@ -15,6 +15,7 @@ include("flags.jl")
 
 include("sam/sam.jl")
 include("bam/bam.jl")
+include("convert.jl")
 
 using .SAM
 using .BAM

diff --git a/src/convert.jl b/src/convert.jl
@@ -0,0 +1,230 @@
+module Conversion
+
+using ..SAM
+using ..BAM
+
+using BioSequences: DNAAlphabet, DNA, EncodeError
+
+const OP_TO_CODE = Dict(
+    'M' => 0x00,
+    'I' => 0x01,
+    'D' => 0x02,
+    'N' => 0x03,
+    'S' => 0x04,
+    'H' => 0x05,
+    'P' => 0x06,
+    '=' => 0x07,
+    'X' => 0x08,
+    'B' => 0x09,
+    '0' => 0x0a
+)
+
+function parse_cigar(record::SAM.Record)
+
+    cigar_ops = ('M','I','D','N','S','H','P','=','X')
+    out = NamedTuple{(:L, :op), Tuple{Int64, Char}}[]
+
+    cigar = SAM.cigar(record)
+    start = 1
+
+    for (i,c) in enumerate(cigar)
+        if c ∈ cigar_ops
+            L = parse(Int, view(cigar, start:i-1))
+            op = c
+            push!(out, (L = L, op = op))
+            start = i+1
+        end
+    end
+    out
+end
+
+function BAM.Record(record::SAM.Record; header = nothing)
+
+    bam_record = BAM.Record()
+    cigar, n_cigar_op = encode_cigar(record)
+
+    set_fixed_length_fields!(bam_record, record, header, n_cigar_op)
+    set_variable_length_fields!(bam_record, record, cigar)
+
+    bam_record
+end
+
+function refname_to_refid(refname::String, header::SAM.Header)
+    refname == "*" && return -1
+
+    refs = findall(header, "SQ")
+    for (i,ref) in enumerate(refs)
+        ref["SN"] == refname && return i-1
+    end
+    error("Couldn't find reference $(refname) in header.")
+end
+
+refname_to_refid(refname::String, header::Nothing) = -1
+
+
+"""
+    This field is set as '*' when the information is unavailable, and set as '=' if RNEXT is identical RNAME
+"""
+function get_nextrefname(record::SAM.Record)
+    !SAM.hasnextrefname(record) && return "*"
+    SAM.nextrefname(record) == "=" ? SAM.refname(record) : SAM.nextrefname(record)
+end
+
+function get_refname(record::SAM.Record)
+    !SAM.hasrefname(record) && return "*"
+    SAM.refname(record)
+end
+
+function set_fixed_length_fields!(bam_record::BAM.Record, record::SAM.Record, header, n_cigar_op)
+
+
+    bin = reg2bin(SAM.position(record)-1, SAM.alignlength(record))
+
+    bam_record.refid = refname_to_refid(get_refname(record), header)
+    bam_record.pos = SAM.position(record)-1
+    bam_record.l_read_name = SAM.tempname(record) |> x -> length(x)+1 # +1 for NULL
+    bam_record.mapq = SAM.mappingquality(record)
+    bam_record.bin = bin
+    bam_record.n_cigar_op = n_cigar_op
+    bam_record.flags = SAM.flags(record)
+    bam_record.l_seq = SAM.hassequence(record) ? SAM.seqlength(record) : 0
+    bam_record.next_refid = refname_to_refid(get_nextrefname(record), header)
+    bam_record.next_pos = SAM.nextposition(record)-1
+    bam_record.tlen = SAM.templength(record)
+    bam_record
+end
+
+"""
+While all single (i.e., non-array) integer types are stored in SAM as 'i', in BAM any of 'cCsSiI' 
+may be used together with the correspondingly-sized binary integer value, chosen according to the field value's magnitude
+"""
+const TYPE_TO_AUX_INT_FIELDS = Dict(
+    Int8 => UInt8('c'),
+    UInt8 => UInt8('C'),
+    Int16 => UInt8('s'),
+    UInt16 => UInt8('S'),
+    Int32 => UInt8('i'),
+    UInt32 => UInt8('I'),
+)
+
+function find_smallest_int_type(x)
+    candidates = x < 0 ? (Int8, Int16, Int32) : (UInt8, UInt16, UInt32)
+    idx = findfirst(T -> typemin(T) <= x <= typemax(T) , candidates)
+    isnothing(idx) && error("Coulnd't find Int type for value $(x)")
+    T = candidates[idx]
+
+    T, TYPE_TO_AUX_INT_FIELDS[T]
+end
+
+function encode_aux_field(record::SAM.Record, field::UnitRange{Int64})
+    typ = record.data[first(field)+3]
+    key = record.data[first(field):first(field)+1]
+
+    if typ == UInt8('i')
+        val = record.data[first(field)+5:last(field)] |> String
+        val = parse(Int32, val)
+        T, typ = find_smallest_int_type(val)
+        val = reinterpret(UInt8, [T(val)])
+
+        return vcat(key, typ, val)
+    elseif typ == UInt8('Z')
+        val = record.data[first(field)+5:last(field)]
+        return vcat(key, typ, val, 0x00)
+    elseif typ == UInt8('A')
+        val = record.data[first(field)+5:last(field)]
+        return vcat(key, typ, val)
+    else
+        error("Unsupported tag type $(Char(typ))")
+    end
+end
+
+function encode_cigar(record::SAM.Record)
+
+    ops = parse_cigar(record)
+
+    n_cigar_op = length(ops)
+    if n_cigar_op > typemax(UInt16) 
+        error("Cigar contains more than $(typemax(UInt16)) operations. This is currently unsupported. See section 4.2.2 of SAM/BAM specifications.")
+    end
+
+    cigar = Vector{UInt8}(undef, length(ops)*4)
+    k = 1
+    for c in ops
+        cigar_32 = [UInt32(0) + OP_TO_CODE[c.op] + (UInt32(c.L) << 4)]
+        for u in reinterpret(UInt8, cigar_32)
+            cigar[k] = u
+            k+=1
+        end
+    end
+    cigar, n_cigar_op
+end
+
+@inline function encode_nucleotide(nt::DNA)
+    if !isvalid(nt)
+        throw(EncodeError(DNAAlphabet{4}(), nt))
+    end
+    return reinterpret(UInt8, nt)
+end
+
+function encode_sequence(record::SAM.Record)
+    !SAM.hassequence(record) && return UInt8[]
+
+    Nsam = SAM.seqlength(record)
+    Nbam = cld(Nsam, 2)
+
+    # Specs : "When l seq is odd the bottom 4 bits of the last byte are undefined, but we recommend writing these as zero."
+    bamseq = fill(0x00, Nbam)
+    samseq = SAM.sequence(record)
+
+    # SAM
+    # 1 2 3 4 5
+    # A T G G T
+
+    # BAM
+    # 1  2  3
+    # AT GG T0
+
+    for (i, sym) in enumerate(samseq)
+        val = encode_nucleotide(sym)
+        bamseq[div(i + 1, 2)] |= val << (4 * isodd(i))
+    end
+
+    bamseq
+end
+
+function encode_tempname(record::SAM.Record)
+    tempname = SAM.tempname(record)
+    vcat(unsafe_wrap(Vector{UInt8}, tempname), 0x00) #add NULL terminator
+end
+
+""" See section 5.3 of specifications."""
+function reg2bin(start::Int, stop::Int)
+    stop -= 1
+    (start >> 14) == (stop >> 14) && return ((1 << 15) - 1) ÷ 7 + (start >> 14)
+    (start >> 17) == (stop >> 17) && return ((1 << 12) - 1) ÷ 7 + (start >> 17)
+    (start >> 20) == (stop >> 20) && return ((1 << 9)  - 1) ÷ 7 + (start >> 20)
+    (start >> 23) == (stop >> 23) && return ((1 << 6)  - 1) ÷ 7 + (start >> 23)
+    (start >> 26) == (stop >> 26) && return ((1 << 3)  - 1) ÷ 7 + (start >> 26)
+    return 0
+end
+
+function set_variable_length_fields!(bam_record::BAM.Record, record::SAM.Record, cigar)
+
+    tempname = encode_tempname(record)
+    seq = encode_sequence(record)
+    aux_data = reduce(vcat, encode_aux_field(record, field) for field in record.fields; init = UInt8[])
+    # if quality is missing, fill with good quality
+    quality = SAM.hasquality(record) ? SAM.quality(record) : fill(0xff, SAM.hassequence(record) ? SAM.seqlength(record) : 0) 
+
+    bam_record.data = vcat(
+        tempname,
+        cigar,
+        seq,
+        quality,
+        aux_data
+    )
+    bam_record.block_size = length(bam_record.data) + BAM.FIXED_FIELDS_BYTES - sizeof(bam_record.block_size)
+    bam_record
+end
+
+end# module Conversion
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -29,6 +29,7 @@ end
     include("test_bam.jl")
     include("test_issues.jl")
     include("test_crosscheck.jl")
+    include("test_convert.jl")
 
     # Include doctests.
     DocMeta.setdocmeta!(XAM, :DocTestSetup, :(using XAM); recursive=true)

diff --git a/test/test_convert.jl b/test/test_convert.jl
@@ -0,0 +1,106 @@
+using Test, XAM
+
+@testset "convert SAM to BAM" begin
+
+    samdir = path_of_format("SAM")
+
+    function test_auxdata(record, bam_record)
+        aux_sam = SAM.auxdata(record)
+        aux_bam = BAM.auxdata(bam_record)
+
+        @test Set(keys(aux_sam)) == Set(keys(aux_bam))
+        @test all(aux_sam[k] == aux_bam[k] for k in keys(aux_sam))
+    end
+
+    function test_record(record, bam_record)
+        @test SAM.position(record) == BAM.position(bam_record)
+        @test SAM.flags(record) == BAM.flags(bam_record)
+        @test SAM.cigar(record) == BAM.cigar(bam_record)
+        @test SAM.sequence(record) == BAM.sequence(bam_record)
+        if SAM.hasquality(record)
+            @test SAM.quality(record) == BAM.quality(bam_record)
+        end
+        test_auxdata(record, bam_record)
+    end
+
+    # internal methods
+    @test XAM.Conversion.reg2bin(-1,0) == 4680
+
+    # good alignment with missing qualities
+    sam = "seq1\t81\tPhiX\t1051\t60\t70M\t=\t1821\t702\tTCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC\t*\tNM:i:0\tMD:Z:70\tMC:Z:70M\tAS:i:70\tXS:i:0"
+    record = XAM.SAM.Record(sam)
+
+    bam_record = BAM.Record(record)
+    test_record(record, bam_record)
+    @test BAM.quality(bam_record) == fill(0xff, BAM.seqlength(bam_record))
+
+    # good alignment with qualities
+    sam = "seq1\t81\tPhiX\t1051\t60\t70M\t=\t1821\t702\tTCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC\tADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tNM:i:0\tMD:Z:70\tMC:Z:70M\tAS:i:70\tXS:i:0"
+    record = XAM.SAM.Record(sam)
+    bam_record = BAM.Record(record)
+    test_record(record, bam_record)    
+    @test BAM.quality(bam_record) == BAM.quality(bam_record)
+
+    # with an insertion
+    sam = "seq1\t16\tPhiX\t1051\t60\t38M6I32M\t*\t0\t0\tTCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCCCCCCCATTTCAACTACTCCGGTTATCGCTGGCGAC\t*\tNM:i:6\tMD:Z:70\tAS:i:58\tXS:i:0"
+
+    record = XAM.SAM.Record(sam)
+    bam_record = BAM.Record(record)
+    test_record(record, bam_record)
+
+    #################
+    ## test all SAM files in FormatSpecimens
+    function test_sam_file(samdir, file)
+
+        records = collect(open(SAM.Reader, joinpath(samdir, file)))
+
+        for record in records
+            # record with more than 65535 operations
+            if SAM.tempname(record) == "03d98240-9a1d-4a9a-ae95-2fa8e2f1c945_Basecall_1D_template"
+                @test_throws ErrorException BAM.Record(record)
+                continue
+            end
+
+            bam_record = BAM.Record(record)
+            test_record(record, bam_record)
+        end
+    end
+
+    for file in ["ce#1.sam","ce#2.sam","ce#5.sam","ce#5b.sam","ce#unmap.sam","ce#unmap1.sam","ce#unmap2.sam","cigar-64k.sam","ex1.sam","ex1_header.sam","sam1.sam","sam2.sam","xx#blank.sam","xx#minimal.sam"]
+        test_sam_file(samdir, file)
+    end
+
+    #records = collect(open(SAM.Reader, joinpath(samdir, "ex1.sam")))
+
+    #################
+    # test with a BAM
+
+    function test_bam(bam_file, sam_file)
+        bamdir = path_of_format("BAM")
+        reader = open(BAM.Reader, joinpath(bamdir, bam_file))
+        h = BAM.header(reader)
+        bam_records = [record for record in reader] 
+
+        reader = open(SAM.Reader, joinpath(samdir, sam_file))
+        sam_records = [record for record in reader] 
+
+        for (sam_record, bam_record) in zip(sam_records, bam_records)
+            if SAM.tempname(sam_record) == "03d98240-9a1d-4a9a-ae95-2fa8e2f1c945_Basecall_1D_template"
+                continue
+            end
+            #@info SAM.tempname(sam_record)
+            bam_record2 = BAM.Record(sam_record; header=h)
+            @test bam_record == bam_record2
+        end
+
+    end
+
+    test_bam("ce#1.bam", "ce#1.sam")
+    test_bam("ce#2.bam", "ce#2.sam")
+    test_bam("ce#5.bam", "ce#5.sam")
+    test_bam("ce#unmap.bam", "ce#unmap.sam")
+    test_bam("ce#unmap1.bam", "ce#unmap1.sam")
+    test_bam("ce#unmap2.bam", "ce#unmap2.sam")
+    #test_bam("cigar-64k.bam", "cigar-64k.sam") # this one as some issues
+
+end