From 953773476ec50f32f9a0c761234c30eae4cdb9da Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 25 Jun 2020 15:03:27 -0700 Subject: [PATCH] fixed issues from unordered gtf accommodations --- swan_vis/swangraph.py | 18 ++++--- swan_vis/utils.py | 31 +++++++---- testing/input_files/weird_gtf_entries.gtf | 40 ++++++++++++++ testing/test_adding_datasets.py | 66 ++++++++++++++++++++--- 4 files changed, 130 insertions(+), 25 deletions(-) diff --git a/swan_vis/swangraph.py b/swan_vis/swangraph.py index 62118f9..001fafb 100644 --- a/swan_vis/swangraph.py +++ b/swan_vis/swangraph.py @@ -541,9 +541,6 @@ def create_dfs_gtf(self, gtf_file): locs[key] = vertex_id vertex_id += 1 - # create inverse loc dict to sort paths by - locs_inv = {v: k for k, v in locs.items()} - # add locs-indexed path to transcripts, and populate edges edges = {} for _,t in transcripts.items(): @@ -551,6 +548,9 @@ def create_dfs_gtf(self, gtf_file): strand = t['strand'] t_exons = t['exons'] + # reorder exons that are in weird orders from the GTF + t_exons = reorder_exons(t_exons) + for i, exon_id in enumerate(t_exons): # pull some information from exon dict @@ -583,10 +583,6 @@ def create_dfs_gtf(self, gtf_file): if key not in edges: edges[key] = {'edge_id': edge_id, 'edge_type': 'intron'} - # sort the path based on chromosomal coordinates and strand - # in case there's some weird ordering in the gtf - t['path'] = reorder_locs(t['path'], strand, locs_inv) - # turn transcripts, edges, and locs into dataframes locs = [{'chrom': key[0], 'coord': key[1], @@ -1018,7 +1014,13 @@ def find_ir_genes(self): sub_nodes = [i for i in range(eid[0]+1,eid[1])] sub_G = self.G.subgraph(sub_nodes) sub_edges = list(sub_G.edges()) - sub_edges = self.edge_df.loc[sub_edges] + try: + sub_edges = self.edge_df.loc[sub_edges] + except: + for blop in sub_edges: + if blop not in self.edge_df.edge_id.tolist(): + print(blop) + continue sub_edges = sub_edges.loc[sub_edges.edge_type == 'intron'] if len(sub_edges.index) > 0: diff 
--git a/swan_vis/utils.py b/swan_vis/utils.py index f5a90af..9be02ad 100644 --- a/swan_vis/utils.py +++ b/swan_vis/utils.py @@ -204,18 +204,27 @@ def find_edge_start_stop(v1, v2, strand): stop = max([v1, v2]) return start, stop -# reorder the locations in a transcript's path based on -# chromosomal coordinate -# TODO -def reorder_locs(path, strand, locs): - coords = [locs[i] for i in path] - path_coords = sorted(zip(path, coords), key=lambda x: x[1]) - path = [i[0] for i in path_coords] - coords = [i[1][1] for i in path_coords] +# reorder exon ids from create_dfs_gtf +def reorder_exons(exon_ids): + strand = exon_ids[0].split('_')[-2] + coords = [int(i.split('_')[-4]) for i in exon_ids] + exons = sorted(zip(exon_ids, coords), key=lambda x: x[1]) + exons = [i[0] for i in exons] if strand == '-': - path.reverse() - return path - + exons.reverse() + return exons + +# # reorder the locations in a transcript's path based on +# # chromosomal coordinate +# def reorder_locs(path, strand, locs): +# coords = [locs[i] for i in path] +# path_coords = sorted(zip(path, coords), key=lambda x: x[1]) +# path = [i[0] for i in path_coords] +# coords = [i[1][1] for i in path_coords] +# if strand == '-': +# path.reverse() +# return path + # get novelty types associated with each transcript def get_transcript_novelties(fields): if fields['transcript_status'] == 'KNOWN': diff --git a/testing/input_files/weird_gtf_entries.gtf b/testing/input_files/weird_gtf_entries.gtf index e97ddfd..21ad118 100644 --- a/testing/input_files/weird_gtf_entries.gtf +++ b/testing/input_files/weird_gtf_entries.gtf @@ -9,3 +9,43 @@ chr6 HAVANA exon 143832548 143832857 . - . gene_id "ENSG00000001036.14_4"; trans chr1 HAVANA transcript 326096 328112 . + . 
gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19"; chr1 HAVANA exon 326096 326569 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 1; exon_id "ENSE00002058739.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19"; chr1 HAVANA exon 327348 328112 . + . gene_id "ENSG00000250575.1"; transcript_id "ENST00000514436.1"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "RP4-669L17.8"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "RP4-669L17.8-001"; exon_number 2; exon_id "ENSE00002064640.1"; level 1; ont "PGO:0000005"; tag "pseudo_consens"; havana_gene "OTTHUMG00000002861.2"; havana_transcript "OTTHUMT00000008000.2"; remap_substituted_missing_target "V19"; +chr16 ENSEMBL transcript 84274 138860 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 138642 138860 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 1; exon_id "ENSE00003730697.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 138150 138334 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 2; exon_id "ENSE00003663199.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 138150 138267 . - 0 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 2; exon_id "ENSE00003663199.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL start_codon 138265 138267 . - 0 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 2; exon_id "ENSE00003663199.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 130522 130591 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 3; exon_id "ENSE00003589280.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 130522 130591 . - 2 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 3; exon_id "ENSE00003589280.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 119126 119255 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 4; exon_id "ENSE00003538360.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 119126 119255 . - 1 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 4; exon_id "ENSE00003538360.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 117301 117375 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 5; exon_id "ENSE00003622190.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 117301 117375 . - 0 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 5; exon_id "ENSE00003622190.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 112622 112775 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 6; exon_id "ENSE00003466209.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 112622 112775 . - 0 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 6; exon_id "ENSE00003466209.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 110525 110606 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 7; exon_id "ENSE00003672562.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 110525 110606 . - 2 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 7; exon_id "ENSE00003672562.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 100372 100509 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 8; exon_id "ENSE00003472148.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 100372 100509 . - 1 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 8; exon_id "ENSE00003472148.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 98145 98301 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 9; exon_id "ENSE00003624942.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 98145 98301 . - 1 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 9; exon_id "ENSE00003624942.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 93219 93325 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 10; exon_id "ENSE00003459258.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 93219 93325 . - 0 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 10; exon_id "ENSE00003459258.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 92596 92725 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 11; exon_id "ENSE00003674129.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 92596 92725 . - 1 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 11; exon_id "ENSE00003674129.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 89713 89902 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 12; exon_id "ENSE00003683492.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 89713 89902 . - 0 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 12; exon_id "ENSE00003683492.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 88698 88890 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 13; exon_id "ENSE00001314268.3"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 88698 88890 . - 2 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 13; exon_id "ENSE00001314268.3"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 85820 86870 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 14; exon_id "ENSE00003726115.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL CDS 86708 86870 . - 1 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 14; exon_id "ENSE00003726115.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL stop_codon 86705 86707 . 
- 0 gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 14; exon_id "ENSE00003726115.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL exon 84274 84595 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 15; exon_id "ENSE00003744974.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL UTR 138642 138860 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 1; exon_id "ENSE00003730697.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL UTR 138268 138334 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 2; exon_id "ENSE00003663199.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL UTR 85820 86707 . - . 
gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 14; exon_id "ENSE00003726115.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr16 ENSEMBL UTR 84274 84595 . - . gene_id "ENSG00000103148.15"; transcript_id "ENST00000620134.4"; gene_type "protein_coding"; gene_name "NPRL3"; transcript_type "protein_coding"; transcript_name "NPRL3-212"; exon_number 15; exon_id "ENSE00003744974.1"; level 3; protein_id "ENSP00000483814.1"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS73795.1"; havana_gene "OTTHUMG00000047792.19"; +chr14_GL000194v1_random TALON gene 61468 115043 . - . gene_id "TALONG000058722"; gene_name "TALONG000058722"; gene_status "NOVEL"; source "TALON"; talon_gene "58722"; intergenic_novel "TRUE"; +chr14_GL000194v1_random TALON transcript 61468 115043 . - . gene_id "TALONG000058722"; transcript_id "TALONT000206784"; gene_name "TALONG000058722"; gene_status "NOVEL"; transcript_status "NOVEL"; transcript_name "TALONT000206784"; talon_gene "58722"; talon_transcript "206784"; NNC_transcript "TRUE"; +chr14_GL000194v1_random TALON exon 114986 115043 . - . gene_id "TALONG000058722"; transcript_id "TALONT000206784"; gene_status "NOVEL"; gene_name "TALONG000058722"; transcript_status "NOVEL"; transcript_name "TALONT000206784"; exon_number "1"; exon_id "952613"; talon_gene "58722"; talon_transcript "206784"; talon_exon "952613"; exon_status "NOVEL"; +chr14_GL000194v1_random TALON exon 112792 112850 . - . 
gene_id "TALONG000058722"; transcript_id "TALONT000206784"; gene_status "NOVEL"; gene_name "TALONG000058722"; transcript_status "NOVEL"; transcript_name "TALONT000206784"; exon_number "2"; exon_id "952611"; talon_gene "58722"; talon_transcript "206784"; talon_exon "952611"; exon_status "NOVEL"; +chr14_GL000194v1_random TALON exon 61468 62949 . - . gene_id "TALONG000058722"; transcript_id "TALONT000206784"; gene_status "NOVEL"; gene_name "TALONG000058722"; transcript_status "NOVEL"; transcript_name "TALONT000206784"; exon_number "3"; exon_id "952779"; talon_gene "58722"; talon_transcript "206784"; talon_exon "952779"; exon_status "NOVEL"; diff --git a/testing/test_adding_datasets.py b/testing/test_adding_datasets.py index c82ccfc..2f34927 100644 --- a/testing/test_adding_datasets.py +++ b/testing/test_adding_datasets.py @@ -39,12 +39,11 @@ def test_add_annotation(self): test = sg.t_df.apply(lambda x: (x.tid, x.novelty), axis=1) check_pairs(control, test) - def test_weird_gtf(self): - sg = swan.SwanGraph() - sg.add_dataset('test', 'input_files/weird_gtf_entries.gtf') + def test_minus_strand_unordered_gtf(self): + sg = process_gtf() print(sg.t_df) - # check each transcript + # check transcript tid = 'ENST00000002165.11_3' path = sg.t_df.loc[tid, 'path'] print(path) @@ -54,13 +53,68 @@ def test_weird_gtf(self): 143823702, 143823492, 143823259, 143823069, 143818634, 143818526, 143816984, 143815949] - check_pairs(ctrl_coords, coords) + assert ctrl_coords == coords + edge_ids = [(path[i],path[i+1]) for i in range(len(path)-1)] + for eid in edge_ids: + print(eid) + assert eid in sg.edge_df.index.tolist() + + def test_plus_strand_unordered_gtf(self): + sg = process_gtf() + print(sg.t_df) + + # check transcript tid = 'ENST00000514436.1' path = sg.t_df.loc[tid, 'path'] print(path) coords = sg.loc_df.loc[path, 'coord'].tolist() ctrl_coords = [326096, 326569, 327348, 328112] - check_pairs(ctrl_coords, coords) + assert ctrl_coords == coords + edge_ids = [(path[i],path[i+1]) for 
i in range(len(path)-1)] + for eid in edge_ids: + print(eid) + assert eid in sg.edge_df.index.tolist() + + def test_unordered_gtf_numeric_coord_sort(self): + sg = process_gtf() + print(sg.t_df) + + # check transcript + tid = 'ENST00000620134.4' + path = sg.t_df.loc[tid, 'path'] + print(path) + coords = sg.loc_df.loc[path, 'coord'].tolist() + ctrl_coords = [138860, 138642, 138334, 138150, + 130591, 130522, 119255, 119126, 117375, 117301, + 112775, 112622, 110606, 110525, 100509, 100372, + 98301, 98145, 93325, 93219, 92725, 92596, 89902, + 89713, 88890, 88698, 86870, 85820, 84595, 84274,] + assert ctrl_coords == coords + edge_ids = [(path[i],path[i+1]) for i in range(len(path)-1)] + for eid in edge_ids: + print(eid) + assert eid in sg.edge_df.index.tolist() + + def test_weird_gtf_weird_chrom(self): + sg = process_gtf() + print(sg.t_df) + + # check transcript + tid = 'TALONT000206784' + path = sg.t_df.loc[tid, 'path'] + print(path) + coords = sg.loc_df.loc[path, 'coord'].tolist() + ctrl_coords = [115043, 114986, 112850, 112792, 62949, 61468] + assert ctrl_coords == coords + edge_ids = [(path[i],path[i+1]) for i in range(len(path)-1)] + for eid in edge_ids: + print(eid) + assert eid in sg.edge_df.index.tolist() + +def process_gtf(): + sg = swan.SwanGraph() + sg.add_dataset('test', 'input_files/weird_gtf_entries.gtf') + return sg def check_pairs(control, test): print('control')