Skip to content

Commit

Permalink
fixed issues from unordered gtf accommodations
Browse files Browse the repository at this point in the history
  • Loading branch information
fairliereese committed Jun 25, 2020
1 parent 21a6042 commit 9537734
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 25 deletions.
18 changes: 10 additions & 8 deletions swan_vis/swangraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,16 +541,16 @@ def create_dfs_gtf(self, gtf_file):
locs[key] = vertex_id
vertex_id += 1

# create inverse loc dict to sort paths by
locs_inv = {v: k for k, v in locs.items()}

# add locs-indexed path to transcripts, and populate edges
edges = {}
for _,t in transcripts.items():
t['path'] = []
strand = t['strand']
t_exons = t['exons']

# reorder exons that are in weird orders from the GTF
t_exons = reorder_exons(t_exons)

for i, exon_id in enumerate(t_exons):

# pull some information from exon dict
Expand Down Expand Up @@ -583,10 +583,6 @@ def create_dfs_gtf(self, gtf_file):
if key not in edges:
edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'}

# sort the path based on chromosomal coordinates and strand
# in case there's some weird ordering in the gtf
t['path'] = reorder_locs(t['path'], strand, locs_inv)

# turn transcripts, edges, and locs into dataframes
locs = [{'chrom': key[0],
'coord': key[1],
Expand Down Expand Up @@ -1018,7 +1014,13 @@ def find_ir_genes(self):
sub_nodes = [i for i in range(eid[0]+1,eid[1])]
sub_G = self.G.subgraph(sub_nodes)
sub_edges = list(sub_G.edges())
sub_edges = self.edge_df.loc[sub_edges]
try:
sub_edges = self.edge_df.loc[sub_edges]
except:
for blop in sub_edges:
if blop not in self.edge_df.edge_id.tolist():
print(blop)
continue
sub_edges = sub_edges.loc[sub_edges.edge_type == 'intron']

if len(sub_edges.index) > 0:
Expand Down
31 changes: 20 additions & 11 deletions swan_vis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,18 +204,27 @@ def find_edge_start_stop(v1, v2, strand):
stop = max([v1, v2])
return start, stop

# reorder the locations in a transcript's path based on
# chromosomal coordinate
# TODO
def reorder_locs(path, strand, locs):
    """Return the vertex ids of ``path`` sorted by their entry in ``locs``.

    Args:
        path: Iterable of vertex ids making up a transcript's path.
        strand: Strand of the transcript; ``'-'`` reverses the sorted order.
        locs: Mapping of vertex id -> sortable location key.
            NOTE(review): the caller appears to pass an inverted loc dict
            whose values are tuples starting with (chrom, coord) -- confirm.

    Returns:
        A new list of vertex ids, ascending by ``locs[vertex]`` for any
        strand other than ``'-'`` and in the reverse of that order for ``'-'``.

    Raises:
        KeyError: if a vertex id in ``path`` is missing from ``locs``.
    """
    # sorted() is stable, so ties keep their input order before the
    # optional reversal below.
    ordered = sorted(path, key=lambda vertex: locs[vertex])
    if strand == '-':
        ordered.reverse()
    return ordered


# reorder exon ids from create_dfs_gtf
def reorder_exons(exon_ids):
    """Return exon ids sorted by coordinate, reversed for the minus strand.

    The strand is taken from the second-to-last ``_``-separated field of the
    FIRST exon id, and each exon's integer coordinate from the
    fourth-from-last field. Fields are counted from the end, so extra
    underscores in leading fields do not shift them.
    NOTE(review): presumably ids look like ``chrom_start_stop_strand_exon``
    -- confirm against create_dfs_gtf.

    Args:
        exon_ids: Sequence of exon id strings belonging to one transcript.

    Returns:
        A new list of the exon ids, ascending by coordinate, or the reverse
        of that order when the strand is ``'-'``. An empty input yields an
        empty list.

    Raises:
        ValueError: if a coordinate field is not an integer.
        IndexError: if an id has fewer than four ``_``-separated fields.
    """
    # A transcript with no exons has nothing to reorder; without this guard
    # the strand lookup on exon_ids[0] would raise IndexError.
    if not exon_ids:
        return []

    strand = exon_ids[0].split('_')[-2]
    exons = sorted(exon_ids, key=lambda exon_id: int(exon_id.split('_')[-4]))
    if strand == '-':
        exons.reverse()
    return exons

# # reorder the locations in a transcript's path based on
# # chromosomal coordinate
# def reorder_locs(path, strand, locs):
# coords = [locs[i] for i in path]
# path_coords = sorted(zip(path, coords), key=lambda x: x[1])
# path = [i[0] for i in path_coords]
# coords = [i[1][1] for i in path_coords]
# if strand == '-':
# path.reverse()
# return path

# get novelty types associated with each transcript
def get_transcript_novelties(fields):
if fields['transcript_status'] == 'KNOWN':
Expand Down
Loading

0 comments on commit 9537734

Please sign in to comment.