Skip to content

Commit

Permalink
Merge pull request #63 from jonathanBieler/sam_to_bam
Browse files Browse the repository at this point in the history
Adds conversion capability for SAM to BAM.
  • Loading branch information
CiaranOMara authored Mar 23, 2024
2 parents 629b1f3 + b14b7c2 commit 2b15ada
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/XAM.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ include("flags.jl")

include("sam/sam.jl")
include("bam/bam.jl")
include("convert.jl")

using .SAM
using .BAM
Expand Down
230 changes: 230 additions & 0 deletions src/convert.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
module Conversion

using ..SAM
using ..BAM

using BioSequences: DNAAlphabet, DNA, EncodeError

const OP_TO_CODE = Dict(
'M' => 0x00,
'I' => 0x01,
'D' => 0x02,
'N' => 0x03,
'S' => 0x04,
'H' => 0x05,
'P' => 0x06,
'=' => 0x07,
'X' => 0x08,
'B' => 0x09,
'0' => 0x0a
)

function parse_cigar(record::SAM.Record)

cigar_ops = ('M','I','D','N','S','H','P','=','X')
out = NamedTuple{(:L, :op), Tuple{Int64, Char}}[]

cigar = SAM.cigar(record)
start = 1

for (i,c) in enumerate(cigar)
if c cigar_ops
L = parse(Int, view(cigar, start:i-1))
op = c
push!(out, (L = L, op = op))
start = i+1
end
end
out
end

function BAM.Record(record::SAM.Record; header = nothing)

bam_record = BAM.Record()
cigar, n_cigar_op = encode_cigar(record)

set_fixed_length_fields!(bam_record, record, header, n_cigar_op)
set_variable_length_fields!(bam_record, record, cigar)

bam_record
end

function refname_to_refid(refname::String, header::SAM.Header)
refname == "*" && return -1

refs = findall(header, "SQ")
for (i,ref) in enumerate(refs)
ref["SN"] == refname && return i-1
end
error("Couldn't find reference $(refname) in header.")
end

refname_to_refid(refname::String, header::Nothing) = -1


"""
This field is set as '*' when the information is unavailable, and set as '=' if RNEXT is identical RNAME
"""
function get_nextrefname(record::SAM.Record)
!SAM.hasnextrefname(record) && return "*"
SAM.nextrefname(record) == "=" ? SAM.refname(record) : SAM.nextrefname(record)
end

function get_refname(record::SAM.Record)
!SAM.hasrefname(record) && return "*"
SAM.refname(record)
end

function set_fixed_length_fields!(bam_record::BAM.Record, record::SAM.Record, header, n_cigar_op)


bin = reg2bin(SAM.position(record)-1, SAM.alignlength(record))

bam_record.refid = refname_to_refid(get_refname(record), header)
bam_record.pos = SAM.position(record)-1
bam_record.l_read_name = SAM.tempname(record) |> x -> length(x)+1 # +1 for NULL
bam_record.mapq = SAM.mappingquality(record)
bam_record.bin = bin
bam_record.n_cigar_op = n_cigar_op
bam_record.flags = SAM.flags(record)
bam_record.l_seq = SAM.hassequence(record) ? SAM.seqlength(record) : 0
bam_record.next_refid = refname_to_refid(get_nextrefname(record), header)
bam_record.next_pos = SAM.nextposition(record)-1
bam_record.tlen = SAM.templength(record)
bam_record
end

"""
While all single (i.e., non-array) integer types are stored in SAM as 'i', in BAM any of 'cCsSiI'
may be used together with the correspondingly-sized binary integer value, chosen according to the field value's magnitude
"""
const TYPE_TO_AUX_INT_FIELDS = Dict(
Int8 => UInt8('c'),
UInt8 => UInt8('C'),
Int16 => UInt8('s'),
UInt16 => UInt8('S'),
Int32 => UInt8('i'),
UInt32 => UInt8('I'),
)

function find_smallest_int_type(x)
candidates = x < 0 ? (Int8, Int16, Int32) : (UInt8, UInt16, UInt32)
idx = findfirst(T -> typemin(T) <= x <= typemax(T) , candidates)
isnothing(idx) && error("Coulnd't find Int type for value $(x)")
T = candidates[idx]

T, TYPE_TO_AUX_INT_FIELDS[T]
end

function encode_aux_field(record::SAM.Record, field::UnitRange{Int64})
typ = record.data[first(field)+3]
key = record.data[first(field):first(field)+1]

if typ == UInt8('i')
val = record.data[first(field)+5:last(field)] |> String
val = parse(Int32, val)
T, typ = find_smallest_int_type(val)
val = reinterpret(UInt8, [T(val)])

return vcat(key, typ, val)
elseif typ == UInt8('Z')
val = record.data[first(field)+5:last(field)]
return vcat(key, typ, val, 0x00)
elseif typ == UInt8('A')
val = record.data[first(field)+5:last(field)]
return vcat(key, typ, val)
else
error("Unsupported tag type $(Char(typ))")
end
end

function encode_cigar(record::SAM.Record)

ops = parse_cigar(record)

n_cigar_op = length(ops)
if n_cigar_op > typemax(UInt16)
error("Cigar contains more than $(typemax(UInt16)) operations. This is currently unsupported. See section 4.2.2 of SAM/BAM specifications.")
end

cigar = Vector{UInt8}(undef, length(ops)*4)
k = 1
for c in ops
cigar_32 = [UInt32(0) + OP_TO_CODE[c.op] + (UInt32(c.L) << 4)]
for u in reinterpret(UInt8, cigar_32)
cigar[k] = u
k+=1
end
end
cigar, n_cigar_op
end

@inline function encode_nucleotide(nt::DNA)
if !isvalid(nt)
throw(EncodeError(DNAAlphabet{4}(), nt))
end
return reinterpret(UInt8, nt)
end

function encode_sequence(record::SAM.Record)
!SAM.hassequence(record) && return UInt8[]

Nsam = SAM.seqlength(record)
Nbam = cld(Nsam, 2)

# Specs : "When l seq is odd the bottom 4 bits of the last byte are undefined, but we recommend writing these as zero."
bamseq = fill(0x00, Nbam)
samseq = SAM.sequence(record)

# SAM
# 1 2 3 4 5
# A T G G T

# BAM
# 1 2 3
# AT GG T0

for (i, sym) in enumerate(samseq)
val = encode_nucleotide(sym)
bamseq[div(i + 1, 2)] |= val << (4 * isodd(i))
end

bamseq
end

function encode_tempname(record::SAM.Record)
tempname = SAM.tempname(record)
vcat(unsafe_wrap(Vector{UInt8}, tempname), 0x00) #add NULL terminator
end

""" See section 5.3 of specifications."""
function reg2bin(start::Int, stop::Int)
stop -= 1
(start >> 14) == (stop >> 14) && return ((1 << 15) - 1) ÷ 7 + (start >> 14)
(start >> 17) == (stop >> 17) && return ((1 << 12) - 1) ÷ 7 + (start >> 17)
(start >> 20) == (stop >> 20) && return ((1 << 9) - 1) ÷ 7 + (start >> 20)
(start >> 23) == (stop >> 23) && return ((1 << 6) - 1) ÷ 7 + (start >> 23)
(start >> 26) == (stop >> 26) && return ((1 << 3) - 1) ÷ 7 + (start >> 26)
return 0
end

function set_variable_length_fields!(bam_record::BAM.Record, record::SAM.Record, cigar)

tempname = encode_tempname(record)
seq = encode_sequence(record)
aux_data = reduce(vcat, encode_aux_field(record, field) for field in record.fields; init = UInt8[])
# if quality is missing, fill with good quality
quality = SAM.hasquality(record) ? SAM.quality(record) : fill(0xff, SAM.hassequence(record) ? SAM.seqlength(record) : 0)

bam_record.data = vcat(
tempname,
cigar,
seq,
quality,
aux_data
)
bam_record.block_size = length(bam_record.data) + BAM.FIXED_FIELDS_BYTES - sizeof(bam_record.block_size)
bam_record
end

end# module Conversion
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ end
include("test_bam.jl")
include("test_issues.jl")
include("test_crosscheck.jl")
include("test_convert.jl")

# Include doctests.
DocMeta.setdocmeta!(XAM, :DocTestSetup, :(using XAM); recursive=true)
Expand Down
106 changes: 106 additions & 0 deletions test/test_convert.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
using Test, XAM

@testset "convert SAM to BAM" begin

samdir = path_of_format("SAM")

function test_auxdata(record, bam_record)
aux_sam = SAM.auxdata(record)
aux_bam = BAM.auxdata(bam_record)

@test Set(keys(aux_sam)) == Set(keys(aux_bam))
@test all(aux_sam[k] == aux_bam[k] for k in keys(aux_sam))
end

function test_record(record, bam_record)
@test SAM.position(record) == BAM.position(bam_record)
@test SAM.flags(record) == BAM.flags(bam_record)
@test SAM.cigar(record) == BAM.cigar(bam_record)
@test SAM.sequence(record) == BAM.sequence(bam_record)
if SAM.hasquality(record)
@test SAM.quality(record) == BAM.quality(bam_record)
end
test_auxdata(record, bam_record)
end

# internal methods
@test XAM.Conversion.reg2bin(-1,0) == 4680

# good alignment with missing qualities
sam = "seq1\t81\tPhiX\t1051\t60\t70M\t=\t1821\t702\tTCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC\t*\tNM:i:0\tMD:Z:70\tMC:Z:70M\tAS:i:70\tXS:i:0"
record = XAM.SAM.Record(sam)

bam_record = BAM.Record(record)
test_record(record, bam_record)
@test BAM.quality(bam_record) == fill(0xff, BAM.seqlength(bam_record))

# good alignment with qualities
sam = "seq1\t81\tPhiX\t1051\t60\t70M\t=\t1821\t702\tTCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC\tADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tNM:i:0\tMD:Z:70\tMC:Z:70M\tAS:i:70\tXS:i:0"
record = XAM.SAM.Record(sam)
bam_record = BAM.Record(record)
test_record(record, bam_record)
@test BAM.quality(bam_record) == BAM.quality(bam_record)

# with an insertion
sam = "seq1\t16\tPhiX\t1051\t60\t38M6I32M\t*\t0\t0\tTCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCCCCCCCATTTCAACTACTCCGGTTATCGCTGGCGAC\t*\tNM:i:6\tMD:Z:70\tAS:i:58\tXS:i:0"

record = XAM.SAM.Record(sam)
bam_record = BAM.Record(record)
test_record(record, bam_record)

#################
## test all SAM files in FormatSpecimens
function test_sam_file(samdir, file)

records = collect(open(SAM.Reader, joinpath(samdir, file)))

for record in records
# record with more than 65535 operations
if SAM.tempname(record) == "03d98240-9a1d-4a9a-ae95-2fa8e2f1c945_Basecall_1D_template"
@test_throws ErrorException BAM.Record(record)
continue
end

bam_record = BAM.Record(record)
test_record(record, bam_record)
end
end

for file in ["ce#1.sam","ce#2.sam","ce#5.sam","ce#5b.sam","ce#unmap.sam","ce#unmap1.sam","ce#unmap2.sam","cigar-64k.sam","ex1.sam","ex1_header.sam","sam1.sam","sam2.sam","xx#blank.sam","xx#minimal.sam"]
test_sam_file(samdir, file)
end

#records = collect(open(SAM.Reader, joinpath(samdir, "ex1.sam")))

#################
# test with a BAM

function test_bam(bam_file, sam_file)
bamdir = path_of_format("BAM")
reader = open(BAM.Reader, joinpath(bamdir, bam_file))
h = BAM.header(reader)
bam_records = [record for record in reader]

reader = open(SAM.Reader, joinpath(samdir, sam_file))
sam_records = [record for record in reader]

for (sam_record, bam_record) in zip(sam_records, bam_records)
if SAM.tempname(sam_record) == "03d98240-9a1d-4a9a-ae95-2fa8e2f1c945_Basecall_1D_template"
continue
end
#@info SAM.tempname(sam_record)
bam_record2 = BAM.Record(sam_record; header=h)
@test bam_record == bam_record2
end

end

test_bam("ce#1.bam", "ce#1.sam")
test_bam("ce#2.bam", "ce#2.sam")
test_bam("ce#5.bam", "ce#5.sam")
test_bam("ce#unmap.bam", "ce#unmap.sam")
test_bam("ce#unmap1.bam", "ce#unmap1.sam")
test_bam("ce#unmap2.bam", "ce#unmap2.sam")
#test_bam("cigar-64k.bam", "cigar-64k.sam") # this one as some issues

end

0 comments on commit 2b15ada

Please sign in to comment.