fixed issues from unordered gtf accommodations

mortazavilab · Jun 25, 2020 · 9537734 · 9537734
1 parent 21a6042
commit 9537734
Show file tree

Hide file tree

Showing 4 changed files with 130 additions and 25 deletions.
diff --git a/swan_vis/swangraph.py b/swan_vis/swangraph.py
@@ -541,16 +541,16 @@ def create_dfs_gtf(self, gtf_file):
 				locs[key] = vertex_id
 				vertex_id += 1
 
-		# create inverse loc dict to sort paths by
-		locs_inv = {v: k for k, v in locs.items()}
-
 		# add locs-indexed path to transcripts, and populate edges
 		edges = {}
 		for _,t in transcripts.items():
 			t['path'] = []
 			strand = t['strand']
 			t_exons = t['exons']
 
+			# reorder exons that are in weird orders from the GTF
+			t_exons = reorder_exons(t_exons)
+
 			for i, exon_id in enumerate(t_exons):
 
 				# pull some information from exon dict
@@ -583,10 +583,6 @@ def create_dfs_gtf(self, gtf_file):
 					if key not in edges:
 						edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'}
 
-			# sort the path based on chromosomal coordinates and strand
-			# in case there's some weird ordering in the gtf 
-			t['path'] = reorder_locs(t['path'], strand, locs_inv)
-
 		# turn transcripts, edges, and locs into dataframes
 		locs = [{'chrom': key[0],
 				 'coord': key[1],
@@ -1018,7 +1014,13 @@ def find_ir_genes(self):
 			sub_nodes = [i for i in range(eid[0]+1,eid[1])]
 			sub_G = self.G.subgraph(sub_nodes)
 			sub_edges = list(sub_G.edges())
-			sub_edges = self.edge_df.loc[sub_edges]
+			try:
+				sub_edges = self.edge_df.loc[sub_edges]
+			except:
+				for blop in sub_edges:
+					if blop not in self.edge_df.edge_id.tolist():
+						print(blop)
+						continue
 			sub_edges = sub_edges.loc[sub_edges.edge_type == 'intron']
 
 			if len(sub_edges.index) > 0:

diff --git a/swan_vis/utils.py b/swan_vis/utils.py
@@ -204,18 +204,27 @@ def find_edge_start_stop(v1, v2, strand):
 		stop = max([v1, v2])
 	return start, stop
 
-# reorder the locations in a transcript's path based on
-# chromosomal coordinate
-# TODO
-def reorder_locs(path, strand, locs):
-	coords = [locs[i] for i in path]
-	path_coords = sorted(zip(path, coords), key=lambda x: x[1])
-	path = [i[0] for i in path_coords]
-	coords = [i[1][1] for i in path_coords]
+# reorder exon ids from create_dfs_gtf
+def reorder_exons(exon_ids):
+	strand = exon_ids[0].split('_')[-2]
+	coords = [int(i.split('_')[-4]) for i in exon_ids]
+	exons = sorted(zip(exon_ids, coords), key=lambda x: x[1])
+	exons = [i[0] for i in exons]
 	if strand == '-':
-		path.reverse()
-	return path 
-
+		exons.reverse()
+	return exons
+
+# # reorder the locations in a transcript's path based on
+# # chromosomal coordinate
+# def reorder_locs(path, strand, locs):
+# 	coords = [locs[i] for i in path]
+# 	path_coords = sorted(zip(path, coords), key=lambda x: x[1])
+# 	path = [i[0] for i in path_coords]
+# 	coords = [i[1][1] for i in path_coords]
+# 	if strand == '-':
+# 		path.reverse()
+# 	return path 
+
 # get novelty types associated with each transcript
 def get_transcript_novelties(fields):
 	if fields['transcript_status'] == 'KNOWN':