kipoi / kipoiseq

Currently, UTR region inference works only for non-spliced UTR regions:

Lines 348 to 368 in e67fab6

    
           # get start and end of cds for each transcript 
        
           cds = CDSFetcher.get_cds_from_gtf(df=df, on_error_warn=on_error_warn) \ 
        
               .groupby('transcript_id') \ 
        
               .agg({'Start': min, 'End': max}) 
        
           # join cds start and end to utr df 
        
           utr_df = df.query("Feature == 'transcript'") \ 
        
               .set_index('transcript_id') \ 
        
               .join(cds, rsuffix="_cds") \ 
        
               .dropna(subset=['Start_cds', 'End_cds'], axis=0) 
        
           if feature_type.upper() == "5UTR": 
        
               utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start']), int(utr_df['End_cds'])) 
        
               utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start_cds']), int(utr_df['End'])) 
        
               utr_df['Feature'] = pd.Categorical("5UTR", categories = utr_df['Feature']) 
        
           if feature_type.upper() == "3UTR": 
        
               utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['End_cds']), int(utr_df['Start'])) 
        
               utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['End']), int(utr_df['Start_cds'])) 
        
               utr_df['Feature'] = pd.Categorical("3UTR", categories = utr_df['Feature']) 
        
           utr_df.drop(['Start_cds', 'End_cds'], axis=1, inplace=True)

TODO:

Generate test data with transcript that contains a spliced UTR, e.g.:
tabix /s/genomes/GenBank/hg38/annotation/hg38.ensGene.gtf.gz chr22 | grep -i ENST00000263207 > kipoiseq/tests/data/chr22_ENST00000263207.gtf
Simulate some variants:

chr22_ENST00000263207_3UTR.vcf.gz
chr22_ENST00000263207_5UTR.vcf.gz

Generate with infer_from_cds=False:

chr22_ENST00000263207_3UTR.alt_seqs.txt
chr22_ENST00000263207_3UTR.ref_seq.txt
chr22_ENST00000263207_5UTR.alt_seqs.txt
chr22_ENST00000263207_5UTR.ref_seq.txt

Update tests:

kipoiseq/tests/extractors/test_protein.py

Lines 323 to 358 in 1d72daf

    
           # chr22_fasta_file = 'tests/data/chr22.fa.gz' 
        
           chr22_gtf_file = 'tests/data/chr22_ENST00000319363.gtf' 
        
           # chr22_5UTR_vcf_file = 'tests/data/chr22_ENST00000319363_5UTR.vcf.gz' 
        
           def test_5UTRFetcher__read_utr(): 
        
               utr5 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR") 
        
               assert utr5.shape == (1, 12) 
        
               assert utr5.iloc[0].Chromosome == 'chr22' 
        
               assert utr5.iloc[0].Start == 17565848 
        
               assert utr5.iloc[0].End == 17565981 
        
               assert utr5.iloc[0].Strand == "+" 
        
               utr5_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR", infer_from_cds=True) 
        
               pd.testing.assert_frame_equal(left = utr5.drop(['exon_number', 'exon_id'], axis=1), right = utr5_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False) 
        
           def test_3UTRFetcher__read_utr(): 
        
               utr3 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR") 
        
               assert utr3.shape == (1, 12) 
        
               assert utr3.iloc[0].Chromosome == 'chr22' 
        
               assert utr3.iloc[0].Start == 17590710 
        
               assert utr3.iloc[0].End == 17596583 
        
               assert utr3.iloc[0].Strand == "+" 
        
               utr3_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR", infer_from_cds=True) 
        
               pd.testing.assert_frame_equal(left=utr3.drop(['exon_number', 'exon_id'], axis=1), 
        
                                             right=utr3_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False)

The correct solution would be to pyranges.intersect() the exons with transcript_start - CDS_start for 5'UTR and CDS_end - transcript_end for 3'UTR.
Also, make sure that the last / first exon of the UTRs is shrinked to CDS_start / CDS_end.

	# get start and end of cds for each transcript
	cds = CDSFetcher.get_cds_from_gtf(df=df, on_error_warn=on_error_warn) \
	.groupby('transcript_id') \
	.agg({'Start': min, 'End': max})

	# join cds start and end to utr df
	utr_df = df.query("Feature == 'transcript'") \
	.set_index('transcript_id') \
	.join(cds, rsuffix="_cds") \
	.dropna(subset=['Start_cds', 'End_cds'], axis=0)

	if feature_type.upper() == "5UTR":
	utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start']), int(utr_df['End_cds']))
	utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start_cds']), int(utr_df['End']))
	utr_df['Feature'] = pd.Categorical("5UTR", categories = utr_df['Feature'])
	if feature_type.upper() == "3UTR":
	utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['End_cds']), int(utr_df['Start']))
	utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['End']), int(utr_df['Start_cds']))
	utr_df['Feature'] = pd.Categorical("3UTR", categories = utr_df['Feature'])

	utr_df.drop(['Start_cds', 'End_cds'], axis=1, inplace=True)

	# chr22_fasta_file = 'tests/data/chr22.fa.gz'
	chr22_gtf_file = 'tests/data/chr22_ENST00000319363.gtf'


	# chr22_5UTR_vcf_file = 'tests/data/chr22_ENST00000319363_5UTR.vcf.gz'


	def test_5UTRFetcher__read_utr():
	utr5 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR")

	assert utr5.shape == (1, 12)

	assert utr5.iloc[0].Chromosome == 'chr22'
	assert utr5.iloc[0].Start == 17565848
	assert utr5.iloc[0].End == 17565981
	assert utr5.iloc[0].Strand == "+"

	utr5_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR", infer_from_cds=True)

	pd.testing.assert_frame_equal(left = utr5.drop(['exon_number', 'exon_id'], axis=1), right = utr5_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False)


	def test_3UTRFetcher__read_utr():
	utr3 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR")

	assert utr3.shape == (1, 12)

	assert utr3.iloc[0].Chromosome == 'chr22'
	assert utr3.iloc[0].Start == 17590710
	assert utr3.iloc[0].End == 17596583
	assert utr3.iloc[0].Strand == "+"

	utr3_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR", infer_from_cds=True)

	pd.testing.assert_frame_equal(left=utr3.drop(['exon_number', 'exon_id'], axis=1),
	right=utr3_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False)

infer_from_cds in UTRFetcher does not work correctly