|
#include "bcftools.pysam.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
#include <assert.h> |
|
#include <getopt.h> |
|
#include <math.h> |
|
#include <inttypes.h> |
|
#include <htslib/hts.h> |
|
#include <htslib/vcf.h> |
|
#include <htslib/synced_bcf_reader.h> |
|
#include <htslib/khash.h> |
|
#include <htslib/khash_str2int.h> |
|
#include <htslib/kseq.h> |
|
#include <htslib/faidx.h> |
|
#include <htslib/bgzf.h> |
|
#include <errno.h> |
|
#include <unistd.h> |
|
#include <ctype.h> |
|
#include "bcftools.h" |
|
#include "filter.h" |
|
#include "regidx.h" |
|
#include "kheap.h" |
|
#include "smpl_ilist.h" |
|
#include "rbuf.h" |
|
#include "gff.h" |
|
|
|
/* Fall back to the C99 spelling when the compiler does not provide __FUNCTION__ as a macro. */
#ifndef __FUNCTION__
#   define __FUNCTION__ __func__
#endif

/* Filter logic selectors (presumably stored in args_t.filter_logic; the assignment is not in this section). */
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2

/*
 * Offset of a transcript's first base inside TSCRIPT_AUX(tr)->ref: the code
 * below always addresses that buffer as `ref + N_REF_PAD + pos - tr->beg`.
 */
#define N_REF_PAD 10

/*
 * Values of args_t.phase. init_data() switches to PHASE_DROP_GT when the
 * header has no samples or the sample list is "-"; the remaining modes are
 * selected outside this section.
 */
#define PHASE_REQUIRE 0
#define PHASE_MERGE   1
#define PHASE_AS_IS   2
#define PHASE_SKIP    3
#define PHASE_NON_REF 4
#define PHASE_DROP_GT 5
|
|
|
|
|
/* Node kinds; presumably the values of the 2-bit hap_node_t.type field. */
#define HAP_CDS  0
#define HAP_ROOT 1
#define HAP_SSS  2

/*
 * Consequence flags, one bit each. The bit number of a flag is also its
 * index into csq_strings[]. Bit 17 has no macro in this section.
 */
#define CSQ_PRINTED_UPSTREAM    (1<<0)
#define CSQ_SYNONYMOUS_VARIANT  (1<<1)
#define CSQ_MISSENSE_VARIANT    (1<<2)
#define CSQ_STOP_LOST           (1<<3)
#define CSQ_STOP_GAINED         (1<<4)
#define CSQ_INFRAME_DELETION    (1<<5)
#define CSQ_INFRAME_INSERTION   (1<<6)
#define CSQ_FRAMESHIFT_VARIANT  (1<<7)
#define CSQ_SPLICE_ACCEPTOR     (1<<8)
#define CSQ_SPLICE_DONOR        (1<<9)
#define CSQ_START_LOST          (1<<10)
#define CSQ_SPLICE_REGION       (1<<11)
#define CSQ_STOP_RETAINED       (1<<12)
#define CSQ_UTR5                (1<<13)
#define CSQ_UTR3                (1<<14)
#define CSQ_NON_CODING          (1<<15)
#define CSQ_INTRON              (1<<16)

#define CSQ_INFRAME_ALTERING    (1<<18)
#define CSQ_UPSTREAM_STOP       (1<<19)
#define CSQ_INCOMPLETE_CDS      (1<<20)
#define CSQ_CODING_SEQUENCE     (1<<21)
#define CSQ_ELONGATION          (1<<22)
#define CSQ_START_RETAINED      (1<<23)

/* Union of the coding-type flags listed below. */
#define CSQ_COMPOUND ( CSQ_SYNONYMOUS_VARIANT | CSQ_MISSENSE_VARIANT   | \
                       CSQ_STOP_LOST          | CSQ_STOP_GAINED        | \
                       CSQ_INFRAME_DELETION   | CSQ_INFRAME_INSERTION  | \
                       CSQ_FRAMESHIFT_VARIANT | CSQ_START_LOST         | \
                       CSQ_STOP_RETAINED      | CSQ_INFRAME_ALTERING   | \
                       CSQ_INCOMPLETE_CDS     | CSQ_UPSTREAM_STOP      | \
                       CSQ_START_RETAINED )

/* Every flag that concerns a start or a stop codon. */
#define CSQ_START_STOP ( CSQ_STOP_LOST | CSQ_STOP_GAINED | CSQ_STOP_RETAINED | \
                         CSQ_START_LOST | CSQ_START_RETAINED )

/* True when csq carries a CSQ_COMPOUND flag and none of the splice flags. */
#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))

/* Output-selection masks (their use is outside this section). */
#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
#define CSQ_PRN_NMD     (~(CSQ_INTRON|CSQ_NON_CODING))
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
|
|
|
|
|
/*
 * Consequence names indexed by CSQ_* bit number. Entries that are not
 * listed (0, 19 and 20) are NULL; the table has 24 slots in total.
 */
const char *csq_strings[] =
{
    [1]  = "synonymous",
    [2]  = "missense",
    [3]  = "stop_lost",
    [4]  = "stop_gained",
    [5]  = "inframe_deletion",
    [6]  = "inframe_insertion",
    [7]  = "frameshift",
    [8]  = "splice_acceptor",
    [9]  = "splice_donor",
    [10] = "start_lost",
    [11] = "splice_region",
    [12] = "stop_retained",
    [13] = "5_prime_utr",
    [14] = "3_prime_utr",
    [15] = "non_coding",
    [16] = "intron",
    [17] = "intergenic",
    [18] = "inframe_altering",
    [21] = "coding_sequence",
    [22] = "feature_elongation",
    [23] = "start_retained"
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct _vbuf_t vbuf_t; |
|
typedef struct _vcsq_t vcsq_t; |
|
struct _vcsq_t |
|
{ |
|
uint32_t strand:1, |
|
type:31; |
|
uint32_t trid; |
|
uint32_t vcf_ial; |
|
uint32_t biotype; |
|
char *gene; |
|
bcf1_t *ref; |
|
kstring_t vstr; |
|
}; |
|
typedef struct |
|
{ |
|
bcf1_t *line; |
|
uint32_t *fmt_bm; |
|
uint32_t nfmt:4, |
|
nvcsq:28, mvcsq; |
|
vcsq_t *vcsq; |
|
} |
|
vrec_t; |
|
typedef struct |
|
{ |
|
uint32_t pos; |
|
vrec_t *vrec; |
|
int idx; |
|
vcsq_t type; |
|
} |
|
csq_t; |
|
struct _vbuf_t |
|
{ |
|
vrec_t **vrec; |
|
int n, m; |
|
uint32_t keep_until; |
|
}; |
|
KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct _hap_node_t hap_node_t; |
|
struct _hap_node_t |
|
{ |
|
char *seq; |
|
char *var; |
|
uint32_t type:2, |
|
csq:30; |
|
int dlen; |
|
uint32_t rbeg; |
|
int32_t rlen; |
|
uint32_t sbeg; |
|
uint32_t icds; |
|
hap_node_t **child, *prev; |
|
int nchild, mchild; |
|
bcf1_t *cur_rec, *rec; |
|
int vcf_ial; |
|
uint32_t nend; |
|
int *cur_child, mcur_child; |
|
csq_t *csq_list; |
|
int ncsq_list, mcsq_list; |
|
}; |
|
#define TSCRIPT_AUX(x) ((tscript_t*)(x)->aux) |
|
typedef struct |
|
{ |
|
char *ref; |
|
char *sref; |
|
hap_node_t *root; |
|
hap_node_t **hap; |
|
int nhap, nsref; |
|
} |
|
tscript_t; |
|
static inline int cmp_tscript(gf_tscript_t **a, gf_tscript_t **b) |
|
{ |
|
return ( (*a)->end < (*b)->end ) ? 1 : 0; |
|
} |
|
KHEAP_INIT(trhp, gf_tscript_t*, cmp_tscript) |
|
typedef khp_trhp_t tr_heap_t; |
|
typedef struct |
|
{ |
|
hap_node_t *node; |
|
int ichild; |
|
int dlen; |
|
size_t slen; |
|
} |
|
hstack_t; |
|
typedef struct |
|
{ |
|
int mstack; |
|
hstack_t *stack; |
|
gf_tscript_t *tr; |
|
kstring_t sseq; |
|
kstring_t tseq; |
|
kstring_t tref; |
|
uint32_t sbeg; |
|
int upstream_stop; |
|
} |
|
hap_t; |
|
|
|
typedef struct _args_t |
|
{ |
|
|
|
|
|
gff_t *gff; |
|
regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript; |
|
regitr_t *itr; |
|
|
|
|
|
FILE *out; |
|
htsFile *out_fh; |
|
char *index_fn; |
|
int write_index; |
|
char *dump_gff; |
|
|
|
|
|
bcf_srs_t *sr; |
|
bcf_hdr_t *hdr; |
|
int hdr_nsmpl; |
|
|
|
|
|
filter_t *filter; |
|
char *filter_str; |
|
int filter_logic; |
|
|
|
|
|
int sample_is_file; |
|
char *sample_list; |
|
smpl_ilist_t *smpl; |
|
|
|
char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; |
|
char *bcsq_tag; |
|
int argc, output_type, clevel; |
|
int phase, verbosity, local_csq, record_cmd_line; |
|
int ncsq2_max, nfmt_bcsq; |
|
int ncsq2_small_warned; |
|
int brief_predictions; |
|
int unify_chr_names; |
|
char *chr_name; |
|
int mchr_name; |
|
struct { |
|
int unknown_chr,unknown_tscript_biotype,unknown_strand,unknown_phase,duplicate_id; |
|
int unknown_cds_phase,incomplete_cds,wrong_phase,overlapping_cds; |
|
} warned; |
|
|
|
int rid; |
|
tr_heap_t *active_tr; |
|
hap_t *hap; |
|
vbuf_t **vcf_buf; |
|
rbuf_t vcf_rbuf; |
|
kh_pos2vbuf_t *pos2vbuf; |
|
gf_tscript_t **rm_tr; |
|
int nrm_tr, mrm_tr; |
|
csq_t *csq_buf; |
|
int ncsq_buf, mcsq_buf; |
|
int force; |
|
int n_threads; |
|
|
|
faidx_t *fai; |
|
kstring_t str, str2; |
|
int32_t *gt_arr, mgt_arr; |
|
} |
|
args_t; |
|
|
|
|
|
/*
 * Amino acid for each codon; the index is built from three 2-bit base codes
 * (A=0, C=1, G=2, T=3), first base in the highest bits.
 */
const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";

/*
 * Base -> 2-bit code, upper and lower case; every other byte maps to 4.
 * The table covers all 256 byte values because dna2aa() indexes it with an
 * arbitrary (uint8_t) character; a shorter table would be read out of bounds
 * for bytes above 't' (for example lower-case IUPAC codes such as 'y').
 */
const uint8_t nt4[256] =
{
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
    4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
};

/* Same as nt4 but returns the code of the complementary base (A=3, C=2, G=1, T=0). */
const uint8_t cnt4[256] =
{
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
    4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
    4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
};

/*
 * Translate the codon at x[0..2]; cdna2aa() translates the reverse
 * complement of the same three bases.
 * NOTE(review): a non-ACGT base yields code 4, which does not fit in two
 * bits, so the resulting gencode index is not a valid codon index (and can
 * exceed 63). Callers presumably pass ACGT only - confirm.
 */
#define dna2aa(x)  gencode[  nt4[(uint8_t)(x)[0]]<<4 |  nt4[(uint8_t)(x)[1]]<<2 |  nt4[(uint8_t)(x)[2]] ]
#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
|
|
|
/*
 * Number of FORMAT integers needed for ncsq2 consequence bits, with 30 bits
 * used per integer. Never returns less than 1 for ncsq2 >= 0.
 */
static inline int ncsq2_to_nfmt(int ncsq2)
{
    enum { BITS_PER_VALUE = 30 };
    int nfull = (ncsq2 - 1) / BITS_PER_VALUE;
    return nfull + 1;
}
|
/*
 * Split a consequence index into the FORMAT integer that holds it (*ival)
 * and the bit inside that integer (*ibit), using 30 bits per integer.
 */
static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
{
    enum { BITS_PER_VALUE = 30 };
    int which = icsq2 / BITS_PER_VALUE;
    int bit   = icsq2 % BITS_PER_VALUE;
    *ival = which;
    *ibit = bit;
}
|
|
|
/*
 * Prepare everything needed before records are processed: load the fasta
 * index and the GFF, set up the optional filter, the buffers and the sample
 * list, then open the output and write its header.
 * Failures are reported through error()/error_errno().
 */
void init_data(args_t *args)
{
    args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);

    /* reference sequence */
    args->fai = fai_load(args->fa_fname);
    if ( args->fai == NULL )
        error("Failed to load the fai index: %s\n", args->fa_fname);

    /* GFF: configure, parse, then fetch the region indexes */
    args->gff = gff_init(args->gff_fname);
    gff_set(args->gff,verbosity,args->verbosity);
    gff_set(args->gff,strip_chr_names,args->unify_chr_names);
    gff_set(args->gff,force_out_of_phase,args->force);
    gff_set(args->gff,dump_fname,args->dump_gff);
    gff_parse(args->gff);
    args->idx_cds     = gff_get(args->gff,idx_cds);
    args->idx_utr     = gff_get(args->gff,idx_utr);
    args->idx_exon    = gff_get(args->gff,idx_exon);
    args->idx_tscript = gff_get(args->gff,idx_tscript);
    args->itr = regitr_init(NULL);

    args->rid = -1;

    if ( args->filter_str != NULL )
        args->filter = filter_init(args->hdr, args->filter_str);

    args->pos2vbuf  = kh_init(pos2vbuf);
    args->active_tr = khp_init(trhp);
    args->hap = (hap_t*) calloc(1, sizeof(*args->hap));

    /* Genotypes are dropped when there are no samples or the sample list is "-" */
    if ( bcf_hdr_nsamples(args->hdr) == 0 ) args->phase = PHASE_DROP_GT;
    if ( args->sample_list && !strcmp("-",args->sample_list) )
    {
        if ( args->output_type==FT_TAB_TEXT && bcf_hdr_set_samples(args->hdr,NULL,0) < 0 )
            error_errno("[%s] Couldn't build sample filter", __func__);
        args->phase = PHASE_DROP_GT;
    }
    else
        args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT);

    if ( args->phase==PHASE_DROP_GT )
        args->hdr_nsmpl = 0;
    else
        args->hdr_nsmpl = bcf_hdr_nsamples(args->hdr);

    if ( args->output_type==FT_TAB_TEXT )
    {
        /* plain tab-delimited text output */
        if ( args->output_fname )
            args->out = fopen(args->output_fname,"w");
        else
            args->out = bcftools_stdout;
        if ( !args->out ) error("Failed to write to %s: %s\n", !strcmp("-",args->output_fname)?"standard output":args->output_fname,strerror(errno));

        FILE *out = args->out;
        fprintf(out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version());
        fprintf(out,"# The command line was:\tbcftools +%s", args->argv[0]);
        int iarg;
        for (iarg=1; iarg<args->argc; iarg++)
            fprintf(out," %s",args->argv[iarg]);
        fprintf(out,"\n");
        fprintf(out,"# LOG\t[2]Message\n");

        /* column 1 is the "CSQ" row label, so numbering continues from 2 */
        int icol = 1;
        fprintf(out,"# CSQ");
        fprintf(out,"\t[%d]Sample", ++icol);
        fprintf(out,"\t[%d]Haplotype", ++icol);
        fprintf(out,"\t[%d]Chromosome", ++icol);
        fprintf(out,"\t[%d]Position", ++icol);
        fprintf(out,"\t[%d]Consequence", ++icol);
        fprintf(out,"\n");
    }
    else
    {
        /* VCF/BCF output */
        char wmode[8];
        set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
        args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
        if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
        if ( args->n_threads > 0)
            hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
        if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");

        /* the INFO tag is always declared, the FORMAT tag only when samples are kept */
        bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq, see http://samtools.github.io/bcftools/howtos/csq-calling.html for details. Format: Consequence|gene|transcript|biotype|strand|amino_acid_change|dna_change\">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
        if ( args->hdr_nsmpl )
            bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
        if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
        if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
    }

    if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Calling...\n");
}
|
|
|
/*
 * Release everything set up by init_data() and accumulated while running:
 * print the --ncsq note if needed, destroy the GFF, filter, heap, hash and
 * sample list, save the index (when requested) and close the output, then
 * free the buffered records and the scratch buffers.
 * Output close and index failures are reported through error().
 */
void destroy_data(args_t *args)
{
    if ( args->ncsq2_small_warned )
    {
        /* The bitmask is stored as ncsq2_to_nfmt() 32-bit integers, so its
           size in bytes is the count times sizeof(int32_t). Dividing the
           count by 8 reported 0 bytes for typical settings. */
        int nbytes = ncsq2_to_nfmt(args->ncsq2_max) * (int)sizeof(int32_t);
        fprintf(bcftools_stderr,
            "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
            " the limit can be increased by running with `--ncsq %d`.\n",nbytes,1+args->ncsq2_small_warned/2);
    }

    regitr_destroy(args->itr);
    gff_destroy(args->gff);

    if ( args->filter )
        filter_destroy(args->filter);

    khp_destroy(trhp,args->active_tr);
    kh_destroy(pos2vbuf,args->pos2vbuf);
    if ( args->smpl ) smpl_ilist_destroy(args->smpl);

    int i,j,ret;
    if ( args->out_fh )
    {
        if ( args->write_index )
        {
            if ( bcf_idx_save(args->out_fh)<0 )
            {
                /* still try to close the file before reporting the index failure */
                if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");
                error("Error: cannot write to index %s\n", args->index_fn);
            }
            free(args->index_fn);
        }
        ret = hts_close(args->out_fh);
    }
    else
        ret = fclose(args->out);
    if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"bcftools_stdout");

    /* buffered VCF records */
    for (i=0; i<args->vcf_rbuf.m; i++)
    {
        vbuf_t *vbuf = args->vcf_buf[i];
        if ( !vbuf ) continue;
        for (j=0; j<vbuf->m; j++)
        {
            vrec_t *vrec = vbuf->vrec[j];
            if ( !vrec ) continue;
            if ( vrec->line ) bcf_destroy(vrec->line);
            free(vrec->fmt_bm);
            free(vrec->vcsq);
            free(vrec);
        }
        free(vbuf->vrec);
        free(vbuf);
    }
    free(args->vcf_buf);
    free(args->rm_tr);
    free(args->csq_buf);

    /* haplotype scratch space */
    free(args->hap->stack);
    free(args->hap->sseq.s);
    free(args->hap->tseq.s);
    free(args->hap->tref.s);
    free(args->hap);

    fai_destroy(args->fai);
    free(args->gt_arr);
    free(args->str.s);
    free(args->str2.s);
    free(args->chr_name);
}
|
|
|
|
|
|
|
|
|
#define SPLICE_VAR_REF 0 |
|
#define SPLICE_OUTSIDE 1 |
|
#define SPLICE_INSIDE 2 |
|
#define SPLICE_OVERLAP 3 |
|
typedef struct |
|
{ |
|
gf_tscript_t *tr; |
|
struct { |
|
int32_t pos, rlen, alen, ial; |
|
char *ref, *alt; |
|
bcf1_t *rec; |
|
} vcf; |
|
uint16_t check_acceptor:1, |
|
check_start:1, |
|
check_stop:1, |
|
check_donor:1, |
|
check_region_beg:1, |
|
check_region_end:1, |
|
check_utr:1, |
|
set_refalt:1; |
|
uint32_t csq; |
|
int tbeg, tend; |
|
uint32_t ref_beg, |
|
ref_end; |
|
kstring_t kref, kalt; |
|
} |
|
splice_t; |
|
/*
 * Reset *splice to all zeros and copy the record, its position, reference
 * length and REF allele pointer into splice->vcf. All other fields,
 * including csq, are left at zero by the memset.
 */
void splice_init(splice_t *splice, bcf1_t *rec)
{
    memset(splice, 0, sizeof(*splice));

    splice->vcf.ref  = rec->d.allele[0];
    splice->vcf.rlen = rec->rlen;
    splice->vcf.pos  = rec->pos;
    splice->vcf.rec  = rec;
}
|
/*
 * Fill splice->kref and splice->kalt with the reference and the alternate
 * sequence of a window around the variant.
 *
 *  beg  window start when len >= 0; window end when len < 0
 *  len  window length; a negative value requests -len bases ending at beg
 *
 * Bases before the variant and after it are taken from the transcript's
 * padded reference (TSCRIPT_AUX(tr)->ref), bases inside it from the VCF
 * REF/ALT alleles. Requests reaching past tr->end are shortened.
 */
static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
{
    int nref, nalt, ref_from, alt_from;
    if ( len >= 0 )
    {
        ref_from = alt_from = beg;
        nref = nalt = len;
    }
    else
    {
        /* window ends at beg; the alt window start is shifted by the indel length */
        nref = nalt = -len;
        ref_from = beg - nref + 1;
        int shift = splice->vcf.alen - splice->vcf.rlen;
        if ( shift<0 && beg < splice->ref_end )
            shift += splice->ref_end - beg;
        alt_from = ref_from + shift;
    }

#define XDBG 0
#if XDBG
    fprintf(bcftools_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",ref_from,nref,alt_from);
#endif
    splice->kref.l = 0;
    splice->kalt.l = 0;

    /* --- reference: part preceding the variant --- */
    int ref_off = 0;
    if ( ref_from < splice->vcf.pos )
    {
        assert( splice->tr->beg <= ref_from );
        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + ref_from - splice->tr->beg, splice->vcf.pos - ref_from, &splice->kref);
    }
    else
        ref_off = ref_from - splice->vcf.pos;
#if XDBG
    fprintf(bcftools_stderr,"r1: %s roff=%d\n",splice->kref.s,ref_off);
#endif

    /* --- reference: part inside the VCF REF allele --- */
    if ( ref_off < splice->vcf.rlen && splice->kref.l < nref )
    {
        int ncopy = splice->vcf.rlen - ref_off;
        if ( ncopy > nref - splice->kref.l ) ncopy = nref - splice->kref.l;
        kputsn(splice->vcf.ref + ref_off, ncopy, &splice->kref);
    }
#if XDBG
    fprintf(bcftools_stderr,"r2: %s\n",splice->kref.s);
#endif

    /* --- reference: part following the variant, clipped at the transcript end --- */
    uint32_t var_end = splice->vcf.pos + splice->vcf.rlen;
    if ( splice->kref.l < nref )
    {
        if ( var_end + nref - splice->kref.l - 1 > splice->tr->end )
            nref -= var_end + nref - splice->kref.l - 1 - splice->tr->end;
        if ( splice->kref.l < nref )
            kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + var_end - splice->tr->beg, nref - splice->kref.l, &splice->kref);
    }
#if XDBG
    fprintf(bcftools_stderr,"r3: %s\n",splice->kref.s);
#endif

    /* --- alternate: part preceding the variant --- */
    int alt_off = 0;
    if ( alt_from < splice->vcf.pos )
    {
        assert( splice->tr->beg <= alt_from );
        kputsn(TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + alt_from - splice->tr->beg, splice->vcf.pos - alt_from, &splice->kalt);
    }
    else
        alt_off = alt_from - splice->vcf.pos;
#if XDBG
    fprintf(bcftools_stderr,"a1: %s aoff=%d\n",splice->kalt.s,alt_off);
#endif

    /* --- alternate: part inside the VCF ALT allele --- */
    if ( alt_off < splice->vcf.alen && splice->kalt.l < nalt )
    {
        int ncopy = splice->vcf.alen - alt_off;
        if ( ncopy > nalt - splice->kalt.l ) ncopy = nalt - splice->kalt.l;
        kputsn(splice->vcf.alt + alt_off, ncopy, &splice->kalt);
        alt_off -= ncopy;
    }
    if ( alt_off < 0 ) alt_off = 0;
    else alt_off--;
#if XDBG
    fprintf(bcftools_stderr,"a2: %s aoff=%d\n",splice->kalt.s,alt_off);
#endif

    /* --- alternate: part following the variant, clipped at the transcript end --- */
    if ( splice->kalt.l < nalt )
    {
        if ( var_end + nalt + alt_off - splice->kalt.l - 1 > splice->tr->end )
            nalt -= var_end + nalt + alt_off - splice->kalt.l - 1 - splice->tr->end;
        if ( nalt > 0 && nalt > splice->kalt.l )
            kputsn(TSCRIPT_AUX(splice->tr)->ref + alt_off + N_REF_PAD + var_end - splice->tr->beg, nalt - splice->kalt.l, &splice->kalt);
    }
#if XDBG
    fprintf(bcftools_stderr,"a3: %s\n",splice->kalt.s);
    fprintf(bcftools_stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s);
#endif
}
|
void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); |
|
static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type, int ial) |
|
{ |
|
while ( regitr_overlap(itr) ) |
|
{ |
|
gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); |
|
gf_tscript_t *tr = utr->tr; |
|
if ( tr->id != trid ) continue; |
|
csq_t csq; |
|
memset(&csq, 0, sizeof(csq_t)); |
|
csq.pos = rec->pos; |
|
csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | type; |
|
csq.type.biotype = tr->type; |
|
csq.type.strand = tr->strand; |
|
csq.type.trid = tr->id; |
|
csq.type.vcf_ial = ial; |
|
csq.type.gene = tr->gene->name; |
|
csq_stage(args, &csq, rec); |
|
return csq.type.type; |
|
} |
|
return 0; |
|
} |
|
/*
 * Stage a consequence of the given type for transcript tr at the position
 * of rec. Does nothing when type is 0.
 */
static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
{
#if XDBG
    fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
#endif
    if ( type )
    {
        csq_t staged;
        memset(&staged, 0, sizeof(csq_t));
        staged.pos          = rec->pos;
        staged.type.type    = type;
        staged.type.biotype = tr->type;
        staged.type.strand  = tr->strand;
        staged.type.trid    = tr->id;
        staged.type.vcf_ial = ial;
        staged.type.gene    = tr->gene->name;
        csq_stage(args, &staged, rec);
    }
}
|
/*
 * When args->unify_chr_names is set and chr begins with "chr" (in any
 * letter case), return a pointer just past that prefix. Otherwise return
 * chr unchanged.
 */
static inline const char *drop_chr_prefix(args_t *args, const char *chr)
{
    int has_prefix = args->unify_chr_names && strncasecmp("chr",chr,3) == 0;
    return has_prefix ? chr + 3 : chr;
}
|
/*
 * When args->unify_chr_names is set, return "chr" followed by chr. The
 * result lives in args->chr_name, which is grown as needed and overwritten
 * by the next call. Otherwise return chr unchanged.
 */
static inline const char *add_chr_prefix(args_t *args, const char *chr)
{
    if ( args->unify_chr_names )
    {
        int nchr = strlen(chr);
        /* room for "chr", the name and the terminating NUL */
        hts_expand(char,nchr+4,args->mchr_name,args->chr_name);
        memcpy(args->chr_name,"chr",3);
        memcpy(args->chr_name+3,chr,nchr+1);    /* the +1 copies the NUL as well */
        return args->chr_name;
    }
    return chr;
}
|
/*
 * Classify an insertion relative to the exon [ex_beg,ex_end] of splice->tr.
 *
 * Sets splice->ref_beg/ref_end, accumulates CSQ_* flags in splice->csq and
 * stages the resulting consequence via csq_stage_utr()/csq_stage_splice().
 * With splice->set_refalt set, splice->kref/kalt are rebuilt with
 * splice_build_hap() and, for a variant inside the exon, tbeg/tend and
 * vcf.rlen are adjusted.
 *
 * Returns SPLICE_OUTSIDE when the insertion lies beyond either exon
 * boundary, SPLICE_INSIDE otherwise.
 *
 * splice_build_hap() may leave kalt.s NULL when nothing is appended to it,
 * so every strncmp() below is guarded by both `ref` and `alt`, in line with
 * splice_csq_del().
 */
static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
    if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] )
    {
        splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;
        splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
    }
    else
    {
        if ( splice->tend ) splice->tend--;
        splice->ref_beg = splice->vcf.pos;
        splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
    }
#if XDBG
    fprintf(bcftools_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
#endif

    int ret;

    /* The insertion lies at or beyond the last base of the exon */
    if ( splice->ref_beg >= ex_end )
    {
        if ( splice->check_utr )
        {
            regitr_t *itr = regitr_init(NULL);
            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
            if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) )
            {
                ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                if ( ret!=0 )
                {
                    /* a UTR consequence was staged, nothing more to do */
                    regitr_destroy(itr);
                    return SPLICE_OUTSIDE;
                }
            }
            regitr_destroy(itr);
        }
        if ( !splice->check_region_end ) return SPLICE_OUTSIDE;
        char *ref = NULL, *alt = NULL;
        if ( splice->set_refalt )
        {
            splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);
            ref = splice->kref.s, alt = splice->kalt.s;
        }
        if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
        {
            splice->csq |= CSQ_SPLICE_REGION;
            if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
        }
        if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
        {
            if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
            if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
            if ( ref && alt && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
        }
        csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
        return SPLICE_OUTSIDE;
    }

    /* The insertion lies before the first base of the exon */
    if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) )
    {
        if ( splice->check_utr )
        {
            regitr_t *itr = regitr_init(NULL);
            const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
            if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) )
            {
                ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
                if ( ret!=0 )
                {
                    regitr_destroy(itr);
                    return SPLICE_OUTSIDE;
                }
            }
            regitr_destroy(itr);
        }
        if ( !splice->check_region_beg ) return SPLICE_OUTSIDE;
        char *ref = NULL, *alt = NULL;
        if ( splice->set_refalt )
        {
            splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
            ref = splice->kref.s, alt = splice->kalt.s;
        }
        if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
        {
            splice->csq |= CSQ_SPLICE_REGION;
            if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
        }
        if ( splice->ref_end > ex_beg - N_SPLICE_DONOR )
        {
            if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
            if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
            /* compare only the last N_SPLICE_DONOR bases of the window, those adjacent to the exon */
            if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
        }
        csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
        return SPLICE_OUTSIDE;
    }

    /* The insertion is inside the exon: flag proximity to either exon edge */
    if ( splice->ref_beg <= ex_beg + 2 )
    {
        if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
        if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
        else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
    }
    if ( splice->ref_end > ex_end - 2 )
    {
        if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
        if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
        else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
    }
    if ( splice->set_refalt )
    {
        if ( splice->ref_beg < splice->vcf.pos )
        {
            /* ref_beg can precede the record by at most one base here */
            int dlen = splice->vcf.pos - splice->ref_beg;
            assert( dlen==1 );
            splice->tbeg += dlen;
            if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen;
            splice->ref_beg = splice->vcf.pos;
        }
        if ( splice->ref_end==ex_beg ) splice->tend--;
        splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
        splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
        /* truncate kref to the adjusted reference length */
        if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
    }
    csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
    return SPLICE_INSIDE;
}
|
|
|
/*
 * Test whether the bases removed by a deletion are identical to the
 * reference bases next to it (after the deletion on the reverse strand,
 * before its last deleted block otherwise).
 *
 * Returns 1 when they are identical. Returns 0 when they differ, when the
 * deletion is far enough from the relevant exon edge that the check does
 * not apply, or when the padded transcript reference is too short to
 * verify; the latter prints a warning once per run.
 */
int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
    static int small_ref_padding_warned = 0;
    gf_tscript_t *tr = splice->tr;

    /* only deletions close to the relevant exon edge are examined */
    if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0;
    if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0;

#if XDBG
    fprintf(bcftools_stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev");
    fprintf(bcftools_stderr," %d .. %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt);
#endif

    int ref_len = strlen(splice->vcf.ref);
    int alt_len = strlen(splice->vcf.alt);
    assert( ref_len > alt_len );
    int ndel = ref_len - alt_len;

    /* the deleted bases are the tail of REF */
    char *ptr_vcf = splice->vcf.ref + alt_len;
    char *ptr_ref = NULL;
    int pad_too_small = 0;

    if ( tr->strand==STRAND_REV )
    {
        /* compare against the reference bases that follow the deletion */
        int32_t vcf_ref_end = splice->vcf.pos + ref_len - 1;
        int32_t tr_ref_end  = splice->tr->end + N_REF_PAD;
        if ( vcf_ref_end + ndel > tr_ref_end )
            pad_too_small = 1;
        else
            ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg);
    }
    else
    {
        /* compare against the reference bases that precede the deleted block */
        int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel;
        if ( vcf_block_beg < 0 ) return 0;

#if XDBG
        fprintf(bcftools_stderr,"vcf_block_beg: %d\n",vcf_block_beg+1);
#endif

        if ( N_REF_PAD + vcf_block_beg < ex_beg )
            pad_too_small = 1;
        else
            ptr_ref = TSCRIPT_AUX(splice->tr)->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg;
    }

    if ( pad_too_small )
    {
        if ( !small_ref_padding_warned )
        {
            fprintf(bcftools_stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1);
            small_ref_padding_warned = 1;
        }
        return 0;
    }

#if XDBG
    fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref);
#endif

    /* every deleted base must match the reference at the shifted position */
    int i;
    for (i=0; ptr_vcf[i]; i++)
        if ( ptr_vcf[i]!=ptr_ref[i] ) return 0;

    return 1;
}
|
|
|
static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) |
|
{ |
|
if ( splice->check_start ) |
|
{ |
|
|
|
|
|
|
|
|
|
int is_synonymous = shifted_del_synonymous(args, splice, ex_beg, ex_end); |
|
if ( is_synonymous ) |
|
{ |
|
splice->csq |= CSQ_START_RETAINED; |
|
return SPLICE_OVERLAP; |
|
} |
|
} |
|
|
|
|
|
splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; |
|
splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; |
|
|
|
#if XDBG |
|
fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); |
|
#endif |
|
|
|
if ( splice->ref_beg + 1 < ex_beg ) |
|
{ |
|
if ( splice->check_region_beg ) |
|
{ |
|
int csq = 0; |
|
if ( splice->check_utr ) |
|
{ |
|
regitr_t *itr = regitr_init(NULL); |
|
const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); |
|
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) |
|
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); |
|
regitr_destroy(itr); |
|
} |
|
if ( !csq ) |
|
{ |
|
char *ref = NULL, *alt = NULL; |
|
if ( splice->set_refalt ) |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON); |
|
ref = splice->kref.s, alt = splice->kalt.s; |
|
} |
|
if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR ) |
|
{ |
|
splice->csq |= CSQ_SPLICE_REGION; |
|
if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; |
|
} |
|
if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR ) |
|
{ |
|
if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR; |
|
if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR; |
|
if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; |
|
} |
|
} |
|
} |
|
if ( splice->ref_end >= ex_beg ) |
|
{ |
|
splice->tbeg = splice->ref_beg - splice->vcf.pos + 1; |
|
splice->ref_beg = ex_beg - 1; |
|
if ( splice->tbeg + splice->tend == splice->vcf.alen ) |
|
{ |
|
|
|
if ( !splice->tend ) |
|
{ |
|
splice->csq |= CSQ_CODING_SEQUENCE; |
|
return SPLICE_OVERLAP; |
|
} |
|
splice->tend--; |
|
} |
|
} |
|
} |
|
if ( ex_end < splice->ref_end ) |
|
{ |
|
if ( splice->check_region_end ) |
|
{ |
|
int csq = 0; |
|
if ( splice->check_utr ) |
|
{ |
|
regitr_t *itr = regitr_init(NULL); |
|
const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec)); |
|
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) |
|
csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); |
|
regitr_destroy(itr); |
|
} |
|
if ( !csq ) |
|
{ |
|
char *ref = NULL, *alt = NULL; |
|
if ( splice->set_refalt ) |
|
{ |
|
splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); |
|
ref = splice->kref.s, alt = splice->kalt.s; |
|
} |
|
if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR ) |
|
{ |
|
splice->csq |= CSQ_SPLICE_REGION; |
|
if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; |
|
} |
|
if ( splice->ref_beg < ex_end + N_SPLICE_DONOR ) |
|
{ |
|
if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR; |
|
if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR; |
|
if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT; |
|
} |
|
} |
|
} |
|
if ( splice->ref_beg < ex_end ) |
|
{ |
|
splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); |
|
splice->ref_end = ex_end; |
|
} |
|
} |
|
if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end ) |
|
{ |
|
csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); |
|
return SPLICE_OUTSIDE; |
|
} |
|
if ( splice->ref_beg < ex_beg + 2 ) |
|
{ |
|
if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; |
|
if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } |
|
else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } |
|
} |
|
if ( splice->ref_end > ex_end - 3 ) |
|
{ |
|
if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION; |
|
if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; } |
|
else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; } |
|
} |
|
if ( splice->set_refalt ) |
|
{ |
|
if ( splice->tbeg>0 ) splice->tbeg--; |
|
if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend ) |
|
{ |
|
splice->vcf.rlen -= splice->tbeg + splice->tend; |
|
splice->vcf.alen -= splice->tbeg + splice->tend; |
|
} |
|
splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); |
|
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); |
|
if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) |
|
{ |
|
splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION; |
|
return SPLICE_OVERLAP; |
|
} |
|
} |
|
csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); |
|
return SPLICE_INSIDE; |
|
} |
|
|
|
/*
 * Classify a same-length substitution (REF and ALT of equal length) against
 * the exon [ex_beg,ex_end].
 *
 * Expects splice->tbeg/tend to hold the number of leading/trailing bases
 * shared by REF and ALT (set by splice_csq()).  Consequence bits are
 * accumulated in splice->csq; splice->ref_beg/ref_end are set to the span of
 * the differing bases and clipped to the exon when the variant straddles an
 * exon boundary.
 *
 * Returns SPLICE_VAR_REF when REF and ALT are identical, SPLICE_OUTSIDE when
 * the differing bases do not overlap the exon (consequences are staged via
 * csq_stage_splice() first), and SPLICE_INSIDE otherwise.  When
 * splice->set_refalt is set, the trimmed alleles are written to
 * splice->kref/kalt before returning SPLICE_INSIDE.
 */
static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
    gf_tscript_t *tr = splice->tr;

    /* Everything was trimmed away: the alleles do not differ, e.g. ACGT>ACGT */
    if ( splice->tbeg + splice->tend == splice->vcf.rlen )
        return SPLICE_VAR_REF;

    /* Span of the bases that actually differ */
    splice->ref_beg = splice->vcf.pos + splice->tbeg;
    splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;

#if XDBG
    fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d  beg,end=%d,%d  tbeg,tend=%d,%d  check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
#endif

    /* ---- part of the variant lying before the exon ---- */
    if ( splice->ref_beg < ex_beg ) {
        if ( splice->check_region_beg ) {
            int utr_csq = 0;
            if ( splice->check_utr ) {
                /* an adjacent UTR, if staged, takes the place of the splice calls below */
                regitr_t *itr = regitr_init(NULL);
                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) )
                    utr_csq = csq_stage_utr(args, itr, splice->vcf.rec, tr->id, splice->csq, splice->vcf.ial);
                regitr_destroy(itr);
            }
            if ( !utr_csq ) {
                if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
                    splice->csq |= CSQ_SPLICE_REGION;
                if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR ) {
                    /* upstream side of the exon: donor on the reverse strand, acceptor on the forward strand */
                    if ( splice->check_donor && tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
                    if ( splice->check_acceptor && tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
                }
            }
        }
        if ( splice->ref_end >= ex_beg ) {
            /* the variant reaches into the exon: clip its start to the exon */
            splice->tbeg = splice->ref_beg - splice->vcf.pos;
            splice->ref_beg = ex_beg;
        }
    }

    /* ---- part of the variant lying after the exon ---- */
    if ( ex_end < splice->ref_end ) {
        if ( splice->check_region_end ) {
            int utr_csq = 0;
            if ( splice->check_utr ) {
                regitr_t *itr = regitr_init(NULL);
                const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,splice->vcf.rec));
                if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) )
                    utr_csq = csq_stage_utr(args, itr, splice->vcf.rec, tr->id, splice->csq, splice->vcf.ial);
                regitr_destroy(itr);
            }
            if ( !utr_csq ) {
                if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
                    splice->csq |= CSQ_SPLICE_REGION;
                if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR ) {
                    /* downstream side of the exon: donor on the forward strand, acceptor on the reverse strand */
                    if ( splice->check_donor && tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
                    if ( splice->check_acceptor && tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
                }
            }
        }
        if ( splice->ref_beg <= ex_end ) {
            /* the variant starts inside the exon: clip its end to the exon */
            splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
            splice->ref_end = ex_end;
        }
    }

    /* No overlap with the exon: report what was collected and stop */
    if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end ) {
        csq_stage_splice(args, splice->vcf.rec, tr, splice->csq, splice->vcf.ial);
        return SPLICE_OUTSIDE;
    }

    /* Within the first three exonic bases */
    if ( splice->ref_beg < ex_beg + 3 ) {
        if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
        if ( tr->strand==STRAND_FWD ) {
            if ( splice->check_start ) splice->csq |= CSQ_START_LOST;
        }
        else if ( splice->check_stop )
            splice->csq |= CSQ_STOP_LOST;
    }

    /* Within the last three exonic bases */
    if ( splice->ref_end > ex_end - 3 ) {
        if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
        if ( tr->strand==STRAND_REV ) {
            if ( splice->check_start ) splice->csq |= CSQ_START_LOST;
        }
        else if ( splice->check_stop )
            splice->csq |= CSQ_STOP_LOST;
    }

    if ( splice->set_refalt ) {
        /* export the trimmed alleles; both have the same length for a substitution */
        splice->vcf.rlen -= splice->tbeg + splice->tend;
        splice->kref.l = 0;
        kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
        splice->kalt.l = 0;
        kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
    }

    csq_stage_splice(args, splice->vcf.rec, tr, splice->csq, splice->vcf.ial);
    return SPLICE_INSIDE;
}
|
/*
 * Entry point of the splice-site classification for one ALT allele.
 *
 * Records the ALT length in splice->vcf.alen, counts how many trailing
 * (splice->tend) and then leading (splice->tbeg) bases REF and ALT share, and
 * dispatches to the substitution / insertion / deletion handler according to
 * the relative allele lengths.  The handler's return value is passed through.
 */
static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
{
    splice->vcf.alen = strlen(splice->vcf.alt);

    /* indexes of the last base of each allele */
    int rlast = splice->vcf.rlen - 1;
    int alast = splice->vcf.alen - 1;
    int n;

    splice->tbeg = 0;
    splice->tend = 0;

    /* shared suffix: compared first, so it may consume an entire allele */
    for (n = 0; n<=rlast && n<=alast; n++)
        if ( splice->vcf.ref[rlast-n] != splice->vcf.alt[alast-n] ) break;
    splice->tend = n;

    /* shared prefix among the bases the suffix did not claim */
    rlast -= n;
    alast -= n;
    for (n = 0; n<=rlast && n<=alast; n++)
        if ( splice->vcf.ref[n] != splice->vcf.alt[n] ) break;
    splice->tbeg = n;

    if ( splice->vcf.rlen==splice->vcf.alen )
        return splice_csq_mnp(args, splice, ex_beg, ex_end);
    else if ( splice->vcf.rlen < splice->vcf.alen )
        return splice_csq_ins(args, splice, ex_beg, ex_end);
    else
        return splice_csq_del(args, splice, ex_beg, ex_end);
}
|
|
|
|
|
|
|
/*
 * Initialise the haplotype node `child` for allele `ial` of `rec`, which
 * overlaps the CDS `cds`, as a successor of `parent`.
 *
 * The variant is first classified with splice_csq().  Variants that fall
 * outside the CDS, straddle its boundary, or whose only consequence is
 * CSQ_START_LOST become HAP_SSS nodes that carry just the consequence bits and
 * the "REF>ALT" label.  All others become HAP_CDS nodes whose `seq` holds the
 * reference sequence between the parent's end and this variant (spliced across
 * intervening CDS segments) followed by the ALT bases.
 *
 * Returns:
 *   0 .. child was initialised
 *   1 .. the variant starts before the end of the parent variant in the same
 *        CDS; child->seq and child->var are not set
 *   2 .. nothing to add (alleles identical, or no consequence outside the CDS)
 *
 * On return 0, child->var (and child->seq for HAP_CDS nodes) are heap
 * allocated; hap_destroy() frees them.
 */
int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial)
{
    kstring_t str = {0,0,0};
    gf_tscript_t *tr = cds->tr;
    int i;

    child->icds = cds->icds;
    child->vcf_ial = ial;

    const int fwd      = tr->strand==STRAND_FWD;
    const int first_cds = child->icds==0;
    const int last_cds  = child->icds==tr->ncds-1;

    /* Set up the splice-site query */
    splice_t splice;
    splice_init(&splice, rec);
    splice.tr = tr;
    splice.vcf.ial = ial;
    splice.vcf.alt = rec->d.allele[ial];
    splice.check_acceptor = 1;
    splice.check_donor = 1;
    splice.set_refalt = 1;
    splice.check_utr = 1;

    /* Start/stop codons are checked only in the terminal CDS of an untrimmed transcript end */
    if ( !(tr->trim & TRIM_5PRIME) && (fwd ? first_cds : last_cds) ) splice.check_start = 1;
    if ( !(tr->trim & TRIM_3PRIME) && (fwd ? last_cds : first_cds) ) splice.check_stop = 1;

    if ( splice.check_start ) {
        /* no point reporting start_lost when the reference does not begin with M */
        if ( fwd ) {
            if ( dna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0;
        }
        else {
            if ( cdna2aa(TSCRIPT_AUX(tr)->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0;
        }
    }

    /* Only CDS boundaries that border another CDS are treated as splice sites */
    if ( !first_cds ) splice.check_region_beg = 1;
    if ( !last_cds ) splice.check_region_end = 1;

#if XDBG
    fprintf(bcftools_stderr,"\nhap_init: %d [%s][%s]   check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop);
#endif
    int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1);
#if XDBG
    fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq);
#endif

    if ( ret==SPLICE_VAR_REF ) return 2;    /* REF and ALT are identical */

    if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) {
        /* Not translated: keep only the consequence bits and the REF>ALT label */
        free(splice.kref.s);
        free(splice.kalt.s);

        if ( !splice.csq ) return 2;

        child->seq  = NULL;
        child->sbeg = 0;
        child->rbeg = rec->pos;
        child->rlen = 0;
        child->dlen = 0;
        kputs(rec->d.allele[0],&str);
        kputc('>',&str);
        kputs(rec->d.allele[ial],&str);
        child->var  = str.s;
        child->type = HAP_SSS;
        child->csq  = splice.csq;
        child->rec  = rec;
        return 0;
    }

    /* The coding consequence is worked out later from the translated sequence */
    splice.csq &= ~CSQ_SYNONYMOUS_VARIANT;

    /* Clip the part of the variant that precedes the CDS */
    int dbeg = 0;
    if ( splice.ref_beg < cds->beg ) {
        dbeg = cds->beg - splice.ref_beg;
        splice.kref.l -= dbeg;
        splice.ref_beg = cds->beg;
        assert( dbeg <= splice.kalt.l );
    }

    assert( parent->type!=HAP_SSS );
    if ( parent->type==HAP_CDS ) {
        i = parent->icds;
        if ( i!=cds->icds ) {
            /* remainder of the parent's CDS after the parent variant */
            int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
            if ( len > 0 )
                kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
        }

        /* whole CDS segments lying between the parent's CDS and this one */
        for (i++; i < cds->icds; i++)
            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);

        if ( parent->icds==child->icds ) {
            /* reference between the end of the parent variant and this variant */
            int len = splice.ref_beg - parent->rbeg - parent->rlen;
            if ( len < 0 ) {
                /* this variant begins before the parent variant ends */
                free(str.s);
                free(splice.kref.s);
                free(splice.kalt.s);
                return 1;
            }
            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
        }
        else {
            /* reference from the start of this CDS up to the variant */
            kputsn_(TSCRIPT_AUX(tr)->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
        }
    }
    kputs(splice.kalt.s + dbeg, &str);

    child->seq  = str.s;
    child->sbeg = cds->pos + (splice.ref_beg - cds->beg);
    child->rbeg = splice.ref_beg;
    child->rlen = splice.kref.l;
    child->type = HAP_CDS;
    child->prev = parent;
    child->rec  = rec;
    child->csq  = splice.csq;

    /* "REF>ALT" label built from the untrimmed VCF alleles */
    {
        int rlen = strlen(rec->d.allele[0]);
        int alen = strlen(rec->d.allele[ial]);
        child->dlen = alen - rlen;
        child->var  = (char*) malloc(rlen+alen+2);
        memcpy(child->var,rec->d.allele[0],rlen);
        child->var[rlen] = '>';
        memcpy(child->var+rlen+1,rec->d.allele[ial],alen);
        child->var[rlen+alen+1] = 0;
    }

    /* The variant runs past the end of the CDS: do not translate it */
    if ( child->rbeg + child->rlen > cds->beg + cds->len ) {
        child->type = HAP_SSS;
        if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE;
    }

    free(splice.kref.s);
    free(splice.kalt.s);
    return 0;
}
|
/*
 * Recursively free a haplotype node together with all of its descendants.
 * `hap` itself is freed as well and must not be used afterwards.
 */
void hap_destroy(hap_node_t *hap)
{
    int i;

    /* children first; empty slots are skipped */
    for (i=0; i<hap->nchild; i++) {
        if ( !hap->child[i] ) continue;
        hap_destroy(hap->child[i]);
    }

    /* walk the allocated capacity (mcsq_list), not just the entries in use */
    for (i=0; i<hap->mcsq_list; i++)
        free(hap->csq_list[i].type.vstr.s);

    free(hap->csq_list);
    free(hap->child);
    free(hap->cur_child);
    free(hap->seq);
    free(hap->var);
    free(hap);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Translate a stretch of (possibly modified) coding sequence into amino acids.
 *
 *   _ref   .. spliced reference; indexed below with an N_REF_PAD offset
 *   _seq   .. the sequence to translate; _seq->m is read as a length in the
 *             reverse-strand branch (NOTE(review): presumably the full length
 *             of the modified transcript - confirm with callers)
 *   sbeg   .. offset of _seq used to derive the codon phase
 *   rbeg   .. reference offset from which bases preceding _seq are borrowed
 *   rend   .. reference offset from which bases following _seq are borrowed
 *   strand .. STRAND_FWD, anything else is handled as the reverse strand
 *   tseq   .. output; reset and filled with the amino acids, NUL-terminated
 *             (the terminator is not counted in tseq->l)
 *   fill   .. if non-zero, keep translating the reference beyond the
 *             sequence up to the end (start, on the reverse strand) of _ref
 *
 * An empty _seq yields the single character '?'.
 */
void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
{
#if XDBG
    fprintf(bcftools_stderr,"\ntranslate: %d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
#endif
    char tmp[3], *codon, *end;
    int i, len, npad;

    /* work on copies of the headers so the callers' strings are left untouched */
    kstring_t ref = *_ref;
    kstring_t seq = *_seq;

    tseq->l = 0;
    if ( !seq.l ) {
        kputc('?', tseq);
        return;
    }

#define DBG 0
#if DBG
    fprintf(bcftools_stderr,"translate: sbeg,rbeg,rend=%d %d %d  fill=%d  seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
    fprintf(bcftools_stderr,"    ref: l=%d %s\n", (int)ref.l,ref.s);
    fprintf(bcftools_stderr,"    seq: l=%d m=%d ", (int)seq.l,(int)seq.m);
    for (i=0; i<seq.l; i++) fprintf(bcftools_stderr,"%c",seq.s[i]); fprintf(bcftools_stderr,"\n");
    fprintf(bcftools_stderr,"    sbeg,rbeg,rend: %d,%d,%d\n", sbeg,rbeg,rend);
    fprintf(bcftools_stderr,"    strand,fill: %d,%d\n", strand,fill);
#endif

    if ( strand==STRAND_FWD ) {
        /* number of bases of the first codon that precede the sequence */
        npad = sbeg % 3;
#if DBG>1
        fprintf(bcftools_stderr,"    npad: %d\n",npad);
#endif
        assert( npad<=rbeg );

        /* first codon: npad bases from the reference, the rest from the sequence */
        for (i=0; i<npad; i++)
            tmp[i] = ref.s[rbeg+i-npad+N_REF_PAD];
        for (; i<3 && i-npad<seq.l; i++)
            tmp[i] = seq.s[i-npad];
        len = seq.l - i + npad;
#if DBG>1
        fprintf(bcftools_stderr,"\t i=%d\n", i);
#endif
        if ( i==3 ) {
            kputc_(dna2aa(tmp), tseq);
#if DBG>1
            fprintf(bcftools_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
#endif
            /* complete codons lying wholly inside the sequence */
            codon = seq.s + 3 - npad;
            end   = codon + len - 1 - (len % 3);
            while ( codon < end ) {
                kputc_(dna2aa(codon), tseq);
#if DBG>1
                fprintf(bcftools_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
#endif
                codon += 3;
            }
            /* collect the bases of a trailing incomplete codon; i counts them */
            end = seq.s + seq.l - 1;
            for (i=0; codon+i<=end; i++) tmp[i] = codon[i];
        }

        /* complete the last codon with reference bases that follow the sequence */
        codon = ref.s + rend + N_REF_PAD;
        if ( i>0 ) {
#if DBG>1
            if(i==1)fprintf(bcftools_stderr,"[3]%c\n",tmp[0]);
            if(i==2)fprintf(bcftools_stderr,"[3]%c%c\n",tmp[0],tmp[1]);
#endif
            while ( i<3 ) {
                tmp[i] = *codon;
                codon++;
                i++;
            }
            kputc_(dna2aa(tmp), tseq);
#if DBG>1
            fprintf(bcftools_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
#endif
        }
        if ( fill!=0 ) {
            /* continue along the reference, stopping before its trailing pad */
            end = ref.s + ref.l - N_REF_PAD;
            while ( codon+3 <= end ) {
                kputc_(dna2aa(codon), tseq);
#if DBG>1
                fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
#endif
                codon += 3;
            }
        }
    }
    else {
        /* reverse strand: codons are read from the end of the sequence backwards */
        npad = (seq.m - (sbeg + seq.l)) % 3;
#if DBG>1
        fprintf(bcftools_stderr,"    npad: %d\n",npad);
#endif
        if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(bcftools_stderr,"sbeg=%d  seq.l=%d seq.m=%d npad=%d\n",sbeg,(int)seq.l,(int)seq.m,npad);
        assert( npad>=0 && sbeg+seq.l+npad<=seq.m );

        /* pre-load the reference bases that follow the sequence into the codon tail */
        switch ( npad ) {
            case 2:
                tmp[1] = ref.s[rend+N_REF_PAD];
                tmp[2] = ref.s[rend+N_REF_PAD+1];
                i = 0;
                break;
            case 1:
                tmp[2] = ref.s[rend+N_REF_PAD];
                i = 1;
                break;
            default:
                i = 2;
                break;
        }

        /* fill the remaining slots from the end of the sequence */
        end = seq.s + seq.l;
        for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end);
#if DBG>1
        fprintf(bcftools_stderr,"\t i=%d\n", i);
        if(i==1)fprintf(bcftools_stderr,"[0]    %c\n",tmp[2]);
        if(i==0)fprintf(bcftools_stderr,"[0]  %c%c\n",tmp[1],tmp[2]);
#endif
        if ( i==-1 ) {
#if DBG>1
            fprintf(bcftools_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
#endif
            kputc_(cdna2aa(tmp), tseq);

            /* complete codons lying wholly inside the sequence */
            codon = end - 3;
            while ( codon >= seq.s ) {
                kputc_(cdna2aa(codon), tseq);
#if DBG>1
                fprintf(bcftools_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
#endif
                codon -= 3;
            }

            /* one or two bases may be left at the very start of the sequence */
            if ( seq.s-codon==2 ) {
                tmp[2] = seq.s[0];
                i = 1;
            }
            else if ( seq.s-codon==1 ) {
                tmp[1] = seq.s[0];
                tmp[2] = seq.s[1];
                i = 0;
            }
            else {
                i = -1;
            }
#if DBG>1
            if(i==1)fprintf(bcftools_stderr,"[3]   %c\n",tmp[2]);
            if(i==0)fprintf(bcftools_stderr,"[3] %c%c\n",tmp[1],tmp[2]);
#endif
        }

        /* complete the codon with reference bases that precede the sequence */
        end = ref.s + N_REF_PAD + rbeg;
        if ( i>=0 ) {
            for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
            kputc_(cdna2aa(tmp), tseq);
#if DBG>1
            fprintf(bcftools_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
#endif
        }
        if ( fill!=0 ) {
            /* continue along the reference, stopping before its leading pad */
            codon = end - 3;
            while ( codon >= ref.s + N_REF_PAD ) {
                kputc_(cdna2aa(codon), tseq);
#if DBG>1
                fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
#endif
                codon -= 3;
            }
        }
    }

    /* NUL-terminate without counting the terminator */
    kputc_(0,tseq);
    tseq->l--;
#if DBG
    fprintf(bcftools_stderr,"    tseq: %s\n", tseq->s);
#endif
}
|
|
|
/*
 * Build the spliced reference of a transcript: all CDS segments concatenated
 * in order, framed by N_REF_PAD flanking bases on either side.
 *
 * Layout of the result, stored in TSCRIPT_AUX(tr)->sref:
 *
 *     [N_REF_PAD bases before cds[0]] [cds[0]] ... [cds[ncds-1]] [N_REF_PAD bases after cds[ncds-1]]
 *
 * TSCRIPT_AUX(tr)->nsref is set to the total length including both pads; the
 * buffer is malloc'ed and NUL-terminated.
 *
 * Requires tr->ncds >= 1.  TSCRIPT_AUX(tr)->ref is indexed here with position
 * tr->beg at offset N_REF_PAD; the trailing pad relies on N_REF_PAD bases
 * being available after the end of the transcript as well
 * (NOTE(review): confirm against the code that fills ->ref).
 */
void tscript_splice_ref(gf_tscript_t *tr)
{
    int i, len = 0;

    /* total coding length */
    for (i=0; i<tr->ncds; i++)
        len += tr->cds[i]->len;

    TSCRIPT_AUX(tr)->nsref = len + 2*N_REF_PAD;
    TSCRIPT_AUX(tr)->sref  = (char*) malloc(len + 1 + 2*N_REF_PAD);
    len = 0;

    /* leading pad: the N_REF_PAD bases immediately before the first CDS */
    memcpy(TSCRIPT_AUX(tr)->sref, TSCRIPT_AUX(tr)->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
    len += N_REF_PAD;

    /* the coding segments themselves */
    for (i=0; i<tr->ncds; i++)
    {
        memcpy(TSCRIPT_AUX(tr)->sref + len, TSCRIPT_AUX(tr)->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
        len += tr->cds[i]->len;
    }

    /*
     * Trailing pad: the N_REF_PAD bases immediately after the last CDS.
     * The offset must include the length of the last CDS; without it the pad
     * would repeat the first bases of that CDS, and a partial codon completed
     * from the pad would be translated from the wrong bases.
     */
    memcpy(TSCRIPT_AUX(tr)->sref + len,
           TSCRIPT_AUX(tr)->ref + N_REF_PAD + (tr->cds[tr->ncds-1]->beg - tr->beg) + tr->cds[tr->ncds-1]->len,
           N_REF_PAD);
    len += N_REF_PAD;

    TSCRIPT_AUX(tr)->sref[len] = 0;
}
|
|
|
|
|
/*
 * Attach the consequence `csq` to the buffered VCF record `rec`, merging it
 * into an already recorded consequence where one matches.
 *
 * The record is looked up through the args->pos2vbuf hash by csq->pos; failing
 * to find it is treated as an internal error.
 *
 * Returns 0 when the consequence was appended as a new entry, 1 when it was
 * merged into (or found to duplicate) an existing one.  In both cases
 * csq->vrec and csq->idx identify the entry that now represents it.
 */
int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
{
#if XDBG
    fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
#endif
    /* Find the buffer holding all records at this position */
    khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
    vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
    if ( !vbuf ) error("This should not happen. %s:%d  %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);

    /* ... and the record itself within that buffer */
    int i = 0;
    while ( i<vbuf->n && vbuf->vrec[i]->line!=rec ) i++;
    if ( i==vbuf->n ) error("This should not happen.. %s:%d  %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
    vrec_t *vrec = vbuf->vrec[i];

    /* A donor/acceptor call makes the weaker splice_region call redundant */
    if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
        csq->type.type &= ~CSQ_SPLICE_REGION;

    if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) {
        for (i=0; i<vrec->nvcsq; i++) {
            vcsq_t *old = &vrec->vcsq[i];

            /* two start/stop consequences: the new one replaces the old */
            if ( csq->type.type&CSQ_START_STOP && old->type&CSQ_START_STOP ) {
                *old = csq->type;
                goto exit_duplicate;
            }
            /* an identical back-reference is already recorded */
            if ( (old->type & CSQ_PRINTED_UPSTREAM) && csq->type.ref == old->ref )
                goto exit_duplicate;
        }
    }
    else if ( csq->type.type & CSQ_COMPOUND ) {
        for (i=0; i<vrec->nvcsq; i++) {
            vcsq_t *old = &vrec->vcsq[i];

            if ( csq->type.trid != old->trid && (csq->type.type|old->type)&CSQ_PRN_TSCRIPT ) continue;
            if ( csq->type.biotype != old->biotype ) continue;
            if ( csq->type.gene != old->gene ) continue;
            if ( csq->type.vcf_ial != old->vcf_ial ) continue;
            if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(old->type&CSQ_UPSTREAM_STOP) ) continue;

            if ( csq->type.vstr.s || old->vstr.s ) {
                if ( !csq->type.vstr.s || !old->vstr.s ) {
                    /* only one of the two has a prediction string: merge only start/stop pairs */
                    if ( !(csq->type.type&CSQ_START_STOP && old->type&CSQ_START_STOP) ) continue;

                    old->type |= csq->type.type;

                    /* stop_retained overrides stop_lost and synonymous */
                    if ( old->type&CSQ_STOP_RETAINED )
                        old->type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);

                    if ( !old->vstr.s ) old->vstr = csq->type.vstr;
                    goto exit_duplicate;
                }
                /* both have a prediction string: it must be the same one */
                if ( strcmp(csq->type.vstr.s,old->vstr.s) ) continue;
            }
            old->type |= csq->type.type;
            goto exit_duplicate;
        }
    }
    else {
        for (i=0; i<vrec->nvcsq; i++) {
            vcsq_t *old = &vrec->vcsq[i];

            if ( csq->type.trid != old->trid && (csq->type.type|old->type)&CSQ_PRN_TSCRIPT) continue;
            if ( csq->type.biotype != old->biotype ) continue;
            if ( !(old->type & CSQ_COMPOUND) ) {
                old->type |= csq->type.type;
                goto exit_duplicate;
            }
            /* a compound entry that already carries all of the new bits */
            if ( old->type==(old->type|csq->type.type) ) goto exit_duplicate;
        }
    }

    /* No match: append.  Every loop above ran to completion, so i equals the old nvcsq. */
    csq->vrec = vrec;
    csq->idx  = vrec->nvcsq++;
    hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
    vrec->vcsq[i] = csq->type;
    return 0;

exit_duplicate:
    csq->vrec = vrec;
    csq->idx  = i;
    return 1;
}
|
|
|
|
|
|
|
|
|
|
|
/*
 * Accessors for entry i of the haplotype stack.  All of them expand to
 * expressions on a variable named `hap`, which must be in scope at the point
 * of use.
 */

/* stack[i].slen minus the node's (rlen + dlen) */
#define node2soff(i) (hap->stack[(i)].slen - (hap->stack[(i)].node->rlen + hap->stack[(i)].node->dlen))

/* node2soff(i) shifted by hap->sbeg */
#define node2sbeg(i) (hap->sbeg + node2soff(i))

/* hap->sbeg plus stack[i].slen */
#define node2send(i) (hap->sbeg + hap->stack[(i)].slen)

/* the node's sbeg */
#define node2rbeg(i) (hap->stack[(i)].node->sbeg)

/* the node's sbeg plus its rlen */
#define node2rend(i) (hap->stack[(i)].node->sbeg + hap->stack[(i)].node->rlen)

/* position of the node's VCF record */
#define node2rpos(i) (hap->stack[(i)].node->rec->pos)
|
|
|
/*
 * Append the textual form of one consequence to `str`.
 *
 * A consequence flagged CSQ_PRINTED_UPSTREAM that has a reference record is
 * written as "@<1-based position of that record>".  Otherwise the output is
 *
 *     [*]name[&name...]|gene|transcript|biotype[|strand][vstr]
 *
 * where '*' marks CSQ_UPSTREAM_STOP and the names come from csq_strings[],
 * one per set bit.  Note that csq->type is normalised in place before
 * printing.
 */
void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
{
    /* With any other consequence present, drop incomplete_CDS together with start/stop */
    if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) )
        csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS);

    /* A start/stop consequence takes precedence over missense */
    if ( csq->type & CSQ_START_STOP )
        csq->type &= ~CSQ_MISSENSE_VARIANT;

    if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref ) {
        kputc_('@',str);
        kputw(csq->ref->pos+1, str);
        return;
    }

    if ( csq->type & CSQ_UPSTREAM_STOP )
        kputc_('*',str);

    /* Consequence names, '&'-separated; bit 0 has no name of its own */
    int has_csq = 0;
    int i, n = sizeof(csq_strings)/sizeof(char*);
    for (i=1; i<n; i++) {
        if ( !csq_strings[i] || !(csq->type&(1<<i)) ) continue;
        if ( has_csq ) kputc_('&',str);
        has_csq = 1;
        kputs(csq_strings[i],str);
    }

    if ( (csq->biotype==GF_NMD) && (csq->type & CSQ_PRN_NMD) ) {
        if ( has_csq ) kputc_('&',str);
        kputs("NMD_transcript",str);
    }

    /* gene */
    kputc_('|', str);
    if ( csq->gene ) kputs(csq->gene , str);

    /* transcript */
    kputc_('|', str);
    if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(gff_id2string(args->gff,transcript,csq->trid), str);

    /* biotype */
    kputc_('|', str);
    kputs(gf_type2gff_string(csq->biotype), str);

    /* strand, then the prediction string if there is one */
    if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
        kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);

    if ( csq->vstr.l )
        kputs(csq->vstr.s, str);
}
|
|
|
/*
 * Append an amino-acid string to `str`, abbreviated when brief predictions
 * are enabled.
 *
 * With args->brief_predictions unset, or when abbreviating would drop fewer
 * than three characters, `aa` is written in full.  Otherwise only the first
 * args->brief_predictions residues are written, followed by ".." and the
 * number beg+len, where len is the length of `aa` not counting a final '*'.
 */
void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
{
    if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 ) {
        kputs(aa->s, str);
        return;
    }

    int len = aa->l;
    if ( aa->s[len-1]=='*' ) len--;     /* the stop codon is not counted */

    int i = 0;
    while ( i<len && i<args->brief_predictions ) {
        kputc(aa->s[i], str);
        i++;
    }
    kputs("..", str);
    kputw(beg+len, str);
}
|
|
|
/*
 * Record the consequence of the variants held in stack entries ibeg..iend of
 * the haplotype `hap` and push it to the affected VCF records via csq_push().
 *
 * The amino-acid strings hap->tref (reference) and hap->tseq (haplotype) must
 * already be filled in; both are truncated here just past the first '*'.
 *
 *   node   .. node whose csq_list receives the new entries
 *   tlen   .. length used for the reverse-strand amino-acid coordinate
 *   dlen   .. net length change of the variants in ibeg..iend
 *   indel  .. non-zero if any of those variants changes the length
 *
 * The main consequence is attached to the record of stack entry ibeg on the
 * forward strand and iend on the reverse strand.  Other records in the range
 * receive a CSQ_PRINTED_UPSTREAM entry referring to that record and, where a
 * node carries non-compound consequence bits, an additional entry for those.
 */
void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
{
    int i;
    gf_tscript_t *tr = hap->tr;
    int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;

    /* Reserve a slot for the main consequence */
    int icsq = node->ncsq_list++;
    hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
    csq_t *csq = &node->csq_list[icsq];
    csq->pos          = hap->stack[ref_node].node->rec->pos;
    csq->type.trid    = tr->id;
    csq->type.vcf_ial = node->vcf_ial;
    csq->type.gene    = tr->gene->name;
    csq->type.strand  = tr->strand;
    csq->type.biotype = tr->type;

    /* Start from the compound bits the splice check assigned to the nodes */
    int rm_csq = 0;         /* bits to be withdrawn at the end */
    csq->type.type = 0;
    for (i=ibeg; i<=iend; i++)
        csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
    if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING;

    /* remember the flag as it was before this call possibly sets it */
    int has_upstream_stop = hap->upstream_stop;

    if ( hap->stack[ibeg].node->type != HAP_SSS ) {
        /* Cut both translations right after the first stop codon */
        i = 0;
        while ( i<hap->tref.l && hap->tref.s[i]!='*' ) i++;
        if ( i!=hap->tref.l ) {
            hap->tref.l = i+1;
            hap->tref.s[i+1] = 0;
        }
        i = 0;
        while ( i<hap->tseq.l && hap->tseq.s[i]!='*' ) i++;
        if ( i!=hap->tseq.l ) {
            hap->tseq.l = i+1;
            hap->tseq.s[i+1] = 0;
            hap->upstream_stop = 1;
        }

        if ( csq->type.type & CSQ_STOP_LOST ) {
            char ref_last = hap->tref.s[hap->tref.l-1];
            char seq_last = hap->tseq.s[hap->tseq.l-1];
            if ( ref_last=='*' ) {
                if ( seq_last=='*' ) {
                    /* the stop codon is still there */
                    rm_csq |= CSQ_STOP_LOST;
                    csq->type.type |= CSQ_STOP_RETAINED;
                }
            }
            else if ( seq_last=='*' ) {
                /* the reference translation has no stop but the haplotype ends in one */
                rm_csq |= CSQ_STOP_GAINED;
                csq->type.type |= CSQ_STOP_RETAINED;
            }
            else {
                csq->type.type |= CSQ_INCOMPLETE_CDS;
            }
        }

        /* start_lost makes no sense when the reference does not start with M */
        if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' ) {
            rm_csq |= CSQ_START_LOST;
            csq->type.type &= ~CSQ_START_LOST;
        }

        if ( dlen!=0 ) {
            if ( dlen%3 )
                csq->type.type |= CSQ_FRAMESHIFT_VARIANT;
            else if ( dlen<0 )
                csq->type.type |= CSQ_INFRAME_DELETION;
            else
                csq->type.type |= CSQ_INFRAME_INSERTION;

            if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' )
                csq->type.type |= CSQ_STOP_GAINED;
        }
        else {
            /* same length: compare the translations residue by residue */
            int aa_change = 0;
            for (i=0; i<hap->tref.l; i++) {
                if ( hap->tref.s[i] == hap->tseq.s[i] ) continue;
                aa_change = 1;
                if ( hap->tref.s[i] == '*' )
                    csq->type.type |= CSQ_STOP_LOST;
                else if ( hap->tseq.s[i] == '*' )
                    csq->type.type |= CSQ_STOP_GAINED;
                else
                    csq->type.type |= CSQ_MISSENSE_VARIANT;
            }
            if ( !aa_change )
                csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
        }
    }

    /* Several variants that look in-frame together but end in a stop: report frameshift & stop_gained */
    if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' ) {
        rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING;
        csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED;
    }
    if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
    csq->type.type &= ~rm_csq;

    if ( hap->stack[ibeg].node->type == HAP_SSS ) {
        /* untranslated node: pass its consequence bits on as they are */
        node->csq_list[icsq].type.type   |= hap->stack[ibeg].node->csq & ~rm_csq;
        node->csq_list[icsq].type.ref     = hap->stack[ibeg].node->rec;
        node->csq_list[icsq].type.biotype = tr->type;
        csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec);
        return;
    }

    /* Build the prediction string, reusing the slot's buffer:
     *      |<aa pos><ref aa>[><aa pos><alt aa>]|<pos><REF>ALT>[+<pos><REF>ALT>...]
     */
    kstring_t str = node->csq_list[icsq].type.vstr;
    str.l = 0;

    int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (TSCRIPT_AUX(hap->tr)->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
    int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
    kputc_('|', &str);
    kputw(aa_rbeg, &str);
    kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str);
    if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) {
        kputc_('>', &str);
        kputw(aa_sbeg, &str);
        kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str);
    }
    kputc_('|', &str);

    for (i=ibeg; i<=iend; i++) {
        if ( i>ibeg ) kputc_('+', &str);
        kputw(node2rpos(i)+1, &str);
        kputs(hap->stack[i].node->var, &str);
    }
    node->csq_list[icsq].type.vstr = str;
    csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec);

    /*
     * Per-record entries.  csq_list may be reallocated by hts_expand0 below,
     * so the main entry is addressed by index from here on.
     */
    for (i=ibeg; i<=iend; i++) {
        hap_node_t *cur = hap->stack[i].node;

        /* non-compound bits of this node are reported with the same prediction string */
        if ( cur->csq & ~CSQ_COMPOUND ) {
            node->ncsq_list++;
            hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
            csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
            tmp_csq->pos          = cur->rec->pos;
            tmp_csq->type.trid    = tr->id;
            tmp_csq->type.gene    = tr->gene->name;
            tmp_csq->type.strand  = tr->strand;
            tmp_csq->type.type    = cur->csq & ~CSQ_COMPOUND & ~rm_csq;
            tmp_csq->type.biotype = tr->type;
            tmp_csq->type.vstr.l  = 0;
            kputs(str.s,&tmp_csq->type.vstr);
            csq_push(args, tmp_csq, cur->rec);
        }

        /* the other records of a compound variant point back to the reporting record */
        if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(cur->csq & ~CSQ_COMPOUND)) ) {
            node->ncsq_list++;
            hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
            csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
            tmp_csq->pos          = cur->rec->pos;
            tmp_csq->type.trid    = tr->id;
            tmp_csq->type.gene    = tr->gene->name;
            tmp_csq->type.strand  = tr->strand;
            tmp_csq->type.type    = CSQ_PRINTED_UPSTREAM | cur->csq;
            tmp_csq->type.biotype = tr->type;
            tmp_csq->type.ref     = hap->stack[ref_node].node->rec;
            tmp_csq->type.vstr.l  = 0;
            csq_push(args, tmp_csq, cur->rec);
        }
    }
}
|
|
|
|
|
void hap_finalize(args_t *args, hap_t *hap) |
|
{ |
|
gf_tscript_t *tr = hap->tr; |
|
if ( !TSCRIPT_AUX(tr)->sref ) |
|
tscript_splice_ref(tr); |
|
|
|
kstring_t sref; |
|
sref.s = TSCRIPT_AUX(tr)->sref; |
|
sref.l = TSCRIPT_AUX(tr)->nsref; |
|
sref.m = sref.l; |
|
|
|
int istack = 0; |
|
hts_expand(hstack_t,1,hap->mstack,hap->stack); |
|
|
|
hap->sseq.l = 0; |
|
hap->tseq.l = 0; |
|
hap->stack[0].node = TSCRIPT_AUX(tr)->root; |
|
hap->stack[0].ichild = -1; |
|
hap->stack[0].slen = 0; |
|
hap->stack[0].dlen = 0; |
|
|
|
while ( istack>=0 ) |
|
{ |
|
hstack_t *stack = &hap->stack[istack]; |
|
hap_node_t *node = hap->stack[istack].node; |
|
while ( ++hap->stack[istack].ichild < node->nchild ) |
|
{ |
|
if ( node->child[stack->ichild] ) break; |
|
} |
|
if ( stack->ichild == node->nchild ) { istack--; continue; } |
|
|
|
node = node->child[stack->ichild]; |
|
|
|
istack++; |
|
hts_expand(hstack_t,istack+1,hap->mstack,hap->stack); |
|
stack = &hap->stack[istack-1]; |
|
|
|
hap->stack[istack].node = node; |
|
hap->stack[istack].ichild = -1; |
|
|
|
hap->sseq.l = stack->slen; |
|
if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq); |
|
hap->stack[istack].slen = hap->sseq.l; |
|
hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen; |
|
|
|
if ( !node->nend ) continue; |
|
|
|
|
|
|
|
|
|
kstring_t sseq; |
|
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; |
|
hap->upstream_stop = 0; |
|
|
|
int i = 1, dlen = 0, ibeg, indel = 0; |
|
hap->sbeg = hap->stack[i].node->sbeg; |
|
assert( hap->stack[istack].node->type != HAP_SSS ); |
|
if ( tr->strand==STRAND_FWD ) |
|
{ |
|
i = 0, ibeg = -1; |
|
while ( ++i <= istack ) |
|
{ |
|
assert( hap->stack[i].node->type != HAP_SSS ); |
|
|
|
dlen += hap->stack[i].node->dlen; |
|
if ( hap->stack[i].node->dlen ) indel = 1; |
|
|
|
|
|
if ( i<istack ) |
|
{ |
|
if ( dlen%3 ) |
|
{ |
|
if ( ibeg==-1 ) ibeg = i; |
|
continue; |
|
} |
|
|
|
|
|
int icur = node2sbeg(i); |
|
int inext = node2sbeg(i+1); |
|
if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen; |
|
else if ( hap->stack[i].node->dlen < 0 ) icur++; |
|
if ( icur/3 == inext/3 ) |
|
{ |
|
if ( ibeg==-1 ) ibeg = i; |
|
continue; |
|
} |
|
} |
|
if ( ibeg<0 ) ibeg = i; |
|
|
|
int ioff = node2soff(ibeg); |
|
int icur = node2sbeg(ibeg); |
|
int rbeg = node2rbeg(ibeg); |
|
int rend = node2rend(i); |
|
int fill = dlen%3; |
|
|
|
|
|
if ( hap->sseq.l ) |
|
{ |
|
sseq.l = hap->stack[i].slen - ioff; |
|
sseq.s = hap->sseq.s + ioff; |
|
} |
|
else |
|
sseq.l = fill = 0; |
|
cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); |
|
|
|
|
|
sseq.l = node2rend(i) - rbeg; |
|
sseq.s = sref.s + N_REF_PAD + rbeg; |
|
sseq.m = sref.m - 2*N_REF_PAD; |
|
cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); |
|
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; |
|
|
|
hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel); |
|
ibeg = -1; |
|
dlen = 0; |
|
indel = 0; |
|
} |
|
} |
|
else |
|
{ |
|
i = istack + 1, ibeg = -1; |
|
while ( --i > 0 ) |
|
{ |
|
assert ( hap->stack[i].node->type != HAP_SSS ); |
|
dlen += hap->stack[i].node->dlen; |
|
if ( hap->stack[i].node->dlen ) indel = 1; |
|
if ( i>1 ) |
|
{ |
|
if ( dlen%3 ) |
|
{ |
|
if ( ibeg==-1 ) ibeg = i; |
|
continue; |
|
} |
|
|
|
|
|
int icur = sseq.m - 1 - node2sbeg(i); |
|
int inext = sseq.m - 1 - node2sbeg(i-1); |
|
if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen - 1; |
|
else if ( hap->stack[i].node->dlen < 0 ) icur -= hap->stack[i].node->dlen; |
|
if ( hap->stack[i-1].node->dlen > 0 ) inext -= hap->stack[i-1].node->dlen; |
|
if ( icur/3 == inext/3 ) |
|
{ |
|
if ( ibeg==-1 ) ibeg = i; |
|
continue; |
|
} |
|
} |
|
if ( ibeg<0 ) ibeg = i; |
|
int ioff = node2soff(i); |
|
int icur = node2sbeg(i); |
|
int rbeg = node2rbeg(i); |
|
int rend = node2rend(ibeg); |
|
int fill = dlen%3; |
|
|
|
|
|
if ( hap->sseq.l ) |
|
{ |
|
sseq.l = hap->stack[ibeg].slen - ioff; |
|
sseq.s = hap->sseq.s + ioff; |
|
} |
|
else |
|
sseq.l = fill = 0; |
|
cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill); |
|
|
|
|
|
sseq.l = node2rend(ibeg) - rbeg; |
|
sseq.s = sref.s + N_REF_PAD + rbeg; |
|
sseq.m = sref.m - 2*N_REF_PAD; |
|
cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill); |
|
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; |
|
|
|
hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel); |
|
ibeg = -1; |
|
dlen = 0; |
|
indel = 0; |
|
} |
|
} |
|
} |
|
} |
|
|
|
/*
 *  Write one consequence to args->out as a tab-delimited "CSQ" line:
 *
 *      CSQ <sample> <haplotype> <chromosome> <1-based position> <consequence>
 *
 *  Nothing is written when the consequence carries the CSQ_PRINTED_UPSTREAM flag.
 *
 *  @param ismpl  index into args->hdr->samples; a negative value prints "-"
 *  @param ihap   haplotype number, printed only when positive, otherwise "-"
 */
static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
{
    if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return;

    const char *chr_name = bcf_hdr_id2name(args->hdr,args->rid);
    char *smpl_name = "-";
    if ( ismpl >= 0 ) smpl_name = args->hdr->samples[ismpl];

    fprintf(args->out,"CSQ\t%s\t", smpl_name);
    if ( ihap<=0 )
        fprintf(args->out,"-");
    else
        fprintf(args->out,"%d", ihap);

    // the consequence string is formatted into the shared scratch buffer args->str
    args->str.l = 0;
    kput_vcsq(args, &csq->type, &args->str);
    fprintf(args->out,"\t%s\t%d\t%s\n",chr_name,csq->pos+1,args->str.s);
}
|
/*
 *  Write all consequences stored on a haplotype node to args->out, one
 *  tab-delimited "CSQ" line per consequence. Consequences flagged with
 *  CSQ_PRINTED_UPSTREAM are skipped; every printed consequence is expected
 *  to have a non-empty vstr (asserted).
 *
 *  @param tr     unused in this function
 *  @param ismpl  index into args->hdr->samples; a negative value prints "-"
 *  @param ihap   haplotype number, printed only when positive, otherwise "-"
 *  @param node   haplotype node; NULL or a node without consequences is a no-op
 */
static inline void hap_print_text(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
    if ( !node ) return;
    if ( !node->ncsq_list ) return;

    const char *chr_name = bcf_hdr_id2name(args->hdr,args->rid);
    char *smpl_name = "-";
    if ( ismpl >= 0 ) smpl_name = args->hdr->samples[ismpl];

    int icsq = 0;
    while ( icsq < node->ncsq_list )
    {
        csq_t *csq = &node->csq_list[icsq++];
        if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue;
        assert( csq->type.vstr.l );

        fprintf(args->out,"CSQ\t%s\t", smpl_name);
        if ( ihap<=0 )
            fprintf(args->out,"-");
        else
            fprintf(args->out,"%d", ihap);

        // the consequence string is formatted into the shared scratch buffer args->str
        args->str.l = 0;
        kput_vcsq(args, &csq->type, &args->str);
        fprintf(args->out,"\t%s\t%d\t%s\n",chr_name,csq->pos+1,args->str.s);
    }
}
|
|
|
/*
 *  Record the consequences of one haplotype in the per-sample FORMAT bitmask
 *  of the VCF records they belong to.
 *
 *  Each consequence owns two bits, 2*csq->idx + ihap, which icsq2_to_bit()
 *  maps to an (integer, bit) pair within the sample's row of vrec->fmt_bm.
 *  When the bit index reaches args->ncsq2_max the consequence cannot be
 *  stored: a warning is issued (subject to args->verbosity), the largest
 *  overflowing index is remembered in args->ncsq2_small_warned and the
 *  remaining consequences of this node are dropped.
 *
 *  @param tr     unused in this function
 *  @param ismpl  sample index used for the header sample name and the bitmask
 *                row; negative values are ignored
 *  @param ihap   haplotype index (0 or 1 as passed by hap_flush)
 *  @param node   haplotype node; NULL or a node without consequences is a no-op
 */
static inline void hap_stage_vcf(args_t *args, gf_tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
{
    if ( !node || !node->ncsq_list ) return;
    if ( ismpl<0 ) return;

    int icsq;
    for (icsq=0; icsq<node->ncsq_list; icsq++)
    {
        csq_t *csq   = &node->csq_list[icsq];
        vrec_t *vrec = csq->vrec;
        int icsq2    = 2*csq->idx + ihap;

        if ( icsq2 < args->ncsq2_max )
        {
            // the common case: set the bit and grow the number of used integers if necessary
            int ival, ibit;
            icsq2_to_bit(icsq2, &ival,&ibit);
            if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
            vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
            continue;
        }

        // too many consequences to fit into the bitmask
        int do_warn = args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1);
        if ( do_warn )
        {
            fprintf(bcftools_stderr,
                    "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
                    args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx);
            if ( !args->ncsq2_small_warned )
                fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
        }
        if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
        break;
    }
}
|
|
|
/*
 *  Finalize and output all active transcripts that end at or before `pos`.
 *
 *  Transcripts are taken from the top of the args->active_tr heap for as long
 *  as the top one satisfies end<=pos. For a transcript whose haplotype tree
 *  has at least one child, hap_finalize() is run and the consequences are
 *  either printed as text (FT_TAB_TEXT) or staged into the VCF FORMAT bitmask.
 *  With PHASE_DROP_GT only the single haplotype hap[0] is printed in text mode
 *  and nothing is staged in VCF mode.
 *
 *  Every removed transcript is appended to args->rm_tr; the memory is released
 *  later by vbuf_flush().
 */
void hap_flush(args_t *args, uint32_t pos)
{
    tr_heap_t *heap = args->active_tr;
    for (;;)
    {
        if ( !heap->ndat || heap->dat[0]->end > pos ) break;

        gf_tscript_t *tr = heap->dat[0];
        khp_delete(trhp, heap);
        args->hap->tr = tr;

        if ( TSCRIPT_AUX(tr)->root && TSCRIPT_AUX(tr)->root->nchild )
        {
            hap_finalize(args, args->hap);

            int as_text = args->output_type==FT_TAB_TEXT;
            if ( args->phase==PHASE_DROP_GT )
            {
                if ( as_text ) hap_print_text(args, tr, -1,0, TSCRIPT_AUX(tr)->hap[0]);
            }
            else
            {
                // haplotypes are stored as consecutive pairs, two per sample
                int k, nhap = 2*args->smpl->n;
                for (k=0; k<nhap; k++)
                {
                    int ismpl = args->smpl->idx[k/2];
                    int ihap  = k%2;
                    if ( as_text )
                        hap_print_text(args, tr, ismpl,ihap+1, TSCRIPT_AUX(tr)->hap[k]);
                    else
                        hap_stage_vcf(args, tr, ismpl,ihap, TSCRIPT_AUX(tr)->hap[k]);
                }
            }
        }

        // schedule the transcript for removal
        int irm = args->nrm_tr++;
        hts_expand(gf_tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
        args->rm_tr[irm] = tr;
    }
}
|
|
|
/*
 *  Exchange the values of two lvalues `a` and `b` of type `type_t`.
 *  The arguments are parenthesized and the temporary has a name unlikely to
 *  clash with an argument (the former temporary `t` silently broke
 *  SWAP(type, t, x)). Both arguments are evaluated twice, so they must not
 *  have side effects.
 */
#define SWAP(type_t, a, b) { type_t swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; }
|
|
|
/*
 *  Store a VCF record in the buffer of pending records and return the
 *  position buffer (vbuf_t) it was added to.
 *
 *  Records sharing the position of the most recently pushed record are
 *  appended to the same vbuf_t; otherwise a new slot of the ring buffer is
 *  taken (and allocated on first use). The record itself is not copied:
 *  *rec_ptr is swapped with the bcf1_t held by the slot, so on return
 *  *rec_ptr points to a different, reusable record. The buffer is also
 *  registered in the args->pos2vbuf hash under the record position.
 *
 *  @param rec_ptr  address of the record pointer, must not be NULL
 */
vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr)
{
    assert(rec_ptr);
    bcf1_t *rec = *rec_ptr;

    // reuse the last buffer only if it holds records from the same position
    int ibuf = -1;
    if ( args->vcf_rbuf.n ) ibuf = rbuf_last(&args->vcf_rbuf);
    int same_pos = ibuf>=0 && args->vcf_buf[ibuf]->vrec[0]->line->pos==rec->pos;
    if ( !same_pos )
    {
        rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
        ibuf = rbuf_append(&args->vcf_rbuf);
        if ( !args->vcf_buf[ibuf] ) args->vcf_buf[ibuf] = (vbuf_t*) calloc(1,sizeof(vbuf_t));
        args->vcf_buf[ibuf]->n = 0;
        args->vcf_buf[ibuf]->keep_until = 0;
    }
    vbuf_t *vbuf = args->vcf_buf[ibuf];

    // take the next record slot, allocating it on first use
    int islot = vbuf->n++;
    hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec);
    if ( !vbuf->vrec[islot] )
        vbuf->vrec[islot] = (vrec_t*) calloc(1,sizeof(vrec_t));
    vrec_t *vrec = vbuf->vrec[islot];

    // the per-sample bitmask is needed only when genotypes are used; start from all zeros
    if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
    {
        if ( vrec->fmt_bm )
            memset(vrec->fmt_bm,0,args->hdr_nsmpl*sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
        else
            vrec->fmt_bm = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
    }

    // hand the record over to the buffer; `rec` keeps pointing to the pushed record
    if ( !vrec->line ) vrec->line = bcf_init1();
    SWAP(bcf1_t*, (*rec_ptr), vrec->line);

    int ret;
    khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret);
    kh_val(args->pos2vbuf,k) = vbuf;

    return vbuf;
}
|
|
|
/*
 *  Write out buffered VCF records and release finished transcripts.
 *
 *  Buffers are taken from the front of the ring buffer. Unless local
 *  consequences are requested (args->local_csq), flushing stops at the first
 *  buffer whose keep_until is past `pos` while transcripts are still active.
 *  Records with staged consequences get the INFO tag args->bcsq_tag (a
 *  comma-separated list) and, when the header has samples, the FORMAT bitmask
 *  under the same tag name. Each record is then written, emptied for reuse
 *  (its position is preserved) and the buffer is removed from args->pos2vbuf.
 *  Without an output file handle the staged consequences are only discarded.
 *
 *  Once no transcript is active, the transcripts collected in args->rm_tr by
 *  hap_flush() are destroyed and args->ncsq_buf is reset.
 */
void vbuf_flush(args_t *args, uint32_t pos)
{
    int i,j;
    while ( args->vcf_rbuf.n )
    {
        vbuf_t *vbuf;
        if ( !args->local_csq && args->active_tr->ndat )
        {
            // check if the first active transcript still needs this buffer
            vbuf = args->vcf_buf[ args->vcf_rbuf.f ];
            if ( vbuf->keep_until > pos ) break;
            assert( vbuf->n );
        }

        i = rbuf_shift(&args->vcf_rbuf);
        assert( i>=0 );
        vbuf = args->vcf_buf[i];
        int buf_pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1;

        for (i=0; i<vbuf->n; i++)
        {
            vrec_t *vrec = vbuf->vrec[i];
            if ( !args->out_fh )
            {
                // not a VCF output, nothing to write
                vrec->nvcsq = 0;
                continue;
            }

            if ( vrec->nvcsq )
            {
                // INFO tag: comma-separated list of consequences
                args->str.l = 0;
                for (j=0; j<vrec->nvcsq; j++)
                {
                    if ( j ) kputc_(',', &args->str);
                    kput_vcsq(args, &vrec->vcsq[j], &args->str);
                }
                bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);

                if ( args->hdr_nsmpl )
                {
                    // rows were filled with a stride of nfmt_bcsq integers, pack them to the vrec->nfmt actually used
                    if ( vrec->nfmt < args->nfmt_bcsq )
                        for (j=1; j<args->hdr_nsmpl; j++)
                            memmove(&vrec->fmt_bm[j*vrec->nfmt], &vrec->fmt_bm[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->fmt_bm));
                    bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->fmt_bm, args->hdr_nsmpl*vrec->nfmt);
                }
                vrec->nvcsq = 0;
            }

            if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");

            // empty the record for reuse but keep its position: vbuf_push() compares against it
            int save_pos = vrec->line->pos;
            bcf_empty(vrec->line);
            vrec->line->pos = save_pos;
        }

        if ( buf_pos!=-1 )
        {
            khint_t k = kh_get(pos2vbuf, args->pos2vbuf, buf_pos);
            if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
        }
        vbuf->n = 0;
    }

    // transcripts can be destroyed only when none is active
    if ( args->active_tr->ndat ) return;

    for (i=0; i<args->nrm_tr; i++)
    {
        gf_tscript_t *tr = args->rm_tr[i];
        tscript_t *aux = TSCRIPT_AUX(tr);
        if ( aux->root ) hap_destroy(aux->root);
        aux->root = NULL;
        free(aux->hap);
        free(aux->ref);
        free(aux->sref);
        free(aux);
        tr->aux = NULL;
    }
    args->nrm_tr = 0;
    args->ncsq_buf = 0;
}
|
|
|
/*
 *  Fetch the reference sequence of a transcript into TSCRIPT_AUX(tr)->ref,
 *  padded with N_REF_PAD bases on both sides.
 *
 *  The chromosome is looked up in the fasta index under `chr` as given, then
 *  via drop_chr_prefix() and finally via add_chr_prefix(). When fewer than
 *  N_REF_PAD flanking bases could be fetched on either side, the missing
 *  flank is filled with 'N' so that the stored string always has
 *  tr->end - tr->beg + 1 + 2*N_REF_PAD characters. Exits via error() when the
 *  sequence cannot be fetched.
 */
void tscript_init_ref(args_t *args, gf_tscript_t *tr, const char *chr)
{
    int i, len;

    // do not reach before the start of the chromosome
    int pad_beg = N_REF_PAD;
    if ( tr->beg < N_REF_PAD ) pad_beg = tr->beg;

    const char *fa_chr = chr;
    if ( !faidx_has_seq(args->fai,fa_chr) )
    {
        fa_chr = drop_chr_prefix(args,chr);
        if ( !faidx_has_seq(args->fai,fa_chr) ) fa_chr = add_chr_prefix(args,chr);
    }
    TSCRIPT_AUX(tr)->ref = faidx_fetch_seq(args->fai, fa_chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
    if ( !TSCRIPT_AUX(tr)->ref )
        error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);

    // number of bases obtained past the transcript end
    int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
    if ( pad_beg + pad_end == 2*N_REF_PAD ) return;

    // incomplete flanks: rebuild the sequence with N's in place of the missing bases
    char *padded = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1);
    char *dst = padded;
    for (i=0; i < N_REF_PAD - pad_beg; i++) *dst++ = 'N';
    memcpy(dst, TSCRIPT_AUX(tr)->ref, len);
    dst += len;
    for (i=0; i < N_REF_PAD - pad_end; i++) *dst++ = 'N';
    *dst = 0;

    free(TSCRIPT_AUX(tr)->ref);
    TSCRIPT_AUX(tr)->ref = padded;
}
|
|
|
/*
 *  Verify that the VCF REF allele agrees with the transcript's padded fasta
 *  reference, ignoring case. The comparison runs until either string ends;
 *  REF bases located before the start of the padded reference are skipped.
 *  Exits via error() on the first mismatch.
 */
static void sanity_check_ref(args_t *args, gf_tscript_t *tr, bcf1_t *rec)
{
    int vbeg = 0;                                       // offset of the first compared base within REF
    int rbeg = rec->pos - tr->beg + N_REF_PAD;          // offset of the variant within the padded reference
    if ( rbeg < 0 )
    {
        // the variant starts before the padded reference, skip the leading REF bases
        vbeg += abs(rbeg);
        rbeg = 0;
    }
    char *ref = TSCRIPT_AUX(tr)->ref + rbeg;
    char *vcf = rec->d.allele[0] + vbeg;
    assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - TSCRIPT_AUX(tr)->ref < tr->end - tr->beg + 2*N_REF_PAD );

    int i;
    for (i=0; ref[i] && vcf[i]; i++)
    {
        if ( ref[i]==vcf[i] ) continue;
        if ( toupper(ref[i])==toupper(vcf[i]) ) continue;
        error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
              bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
    }
}
|
|
|
/*
 *  Predict coding consequences of a record locally, i.e. each ALT allele is
 *  evaluated on its own against the reference transcript, without combining
 *  it with other variants of the haplotype.
 *
 *  For every overlapping CDS of a coding transcript and every non-symbolic,
 *  non-'*' ALT allele, hap_init() classifies the variant. Splice-site nodes
 *  (HAP_SSS) are staged as they are. Otherwise the reference and the
 *  alternate sequence are translated with cds_translate(), both peptides are
 *  cut after the first stop codon, the consequence type is refined
 *  (stop lost/retained/gained, start lost, frameshift, inframe indel,
 *  missense, synonymous) and the result is passed to csq_stage(). The
 *  amino-acid string built for compound consequences is also stored on the
 *  transcript's root node.
 *
 *  Returns 1 if the record overlaps a CDS of at least one coding transcript,
 *  0 otherwise.
 */
int test_cds_local(args_t *args, bcf1_t *rec)
{
    int ial,k, ret = 0;
    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));

    if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;

    // stack-allocated nodes, hap_init() fills `node` for each allele
    hap_node_t root, node;
    root.type = HAP_ROOT;
    kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;

    while ( regitr_overlap(args->itr) )
    {
        gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
        gf_tscript_t *tr = cds->tr;
        if ( !GF_is_coding(tr->type) ) continue;
        ret = 1;

        if ( !TSCRIPT_AUX(tr) )
        {
            // first variant in this transcript: fetch and splice the reference, activate the transcript
            tr->aux = calloc(sizeof(tscript_t),1);
            tscript_init_ref(args, tr, chr);
            tscript_splice_ref(tr);
            khp_insert(trhp, args->active_tr, &tr);
        }

        sanity_check_ref(args, tr, rec);

        // a read-only kstring view of the spliced reference
        kstring_t sref;
        sref.s = TSCRIPT_AUX(tr)->sref;
        sref.l = TSCRIPT_AUX(tr)->nsref;
        sref.m = sref.l;

        for (ial=1; ial<rec->n_allele; ial++)
        {
            if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
            if ( hap_init(args, &root, &node, cds, rec, ial)!=0 ) continue;

            csq_t csq;
            memset(&csq, 0, sizeof(csq_t));
            csq.pos          = rec->pos;
            csq.type.biotype = tr->type;
            csq.type.strand  = tr->strand;
            csq.type.trid    = tr->id;
            csq.type.vcf_ial = ial;
            csq.type.gene    = tr->gene->name;

            int csq_type = node.csq;

            if ( node.type == HAP_SSS )
            {
                // splice-site variant, the type determined by hap_init() is final
                csq.type.type = csq_type;
                csq_stage(args, &csq, rec);
            }
            else
            {
                // translate the alternate sequence ..
                kstring_t sseq;
                sseq.m = sref.m - 2*N_REF_PAD + node.dlen;
                sseq.s = node.seq;
                int alen = sseq.l = strlen(sseq.s);
                int fill = node.dlen%3 && alen ? 1 : 0;
                cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);

                // .. and the matching stretch of the reference
                sseq.m = sref.m - 2*N_REF_PAD;
                sseq.s = sref.s + N_REF_PAD + node.sbeg;
                sseq.l = node.rlen;
                cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);

                // cut both peptides right after the first stop codon
                for (k=0; k<tref->l; k++)
                    if ( tref->s[k]=='*' ) break;
                if ( k!=tref->l )
                {
                    tref->l = k+1;
                    tref->s[k+1] = 0;
                }
                for (k=0; k<tseq->l; k++)
                    if ( tseq->s[k]=='*' ) break;
                if ( k!=tseq->l )
                {
                    tseq->l = k+1;
                    tseq->s[k+1] = 0;
                }

                if ( csq_type & CSQ_STOP_LOST )
                {
                    char ref_last = tref->s[tref->l-1];
                    char alt_last = tseq->s[tseq->l-1];
                    if ( ref_last=='*' )
                    {
                        // both peptides end with a stop: the stop codon is retained
                        if ( alt_last=='*' )
                        {
                            csq_type &= ~CSQ_STOP_LOST;
                            csq_type |= CSQ_STOP_RETAINED;
                        }
                    }
                    else if ( alt_last=='*' )
                    {
                        // the reference peptide has no stop but the alternate one does
                        csq_type &= ~CSQ_STOP_GAINED;
                        csq_type |= CSQ_STOP_RETAINED;
                    }
                    else
                        csq_type |= CSQ_INCOMPLETE_CDS;
                }

                // start lost is meaningful only when the reference peptide starts with M
                if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
                    csq_type &= ~CSQ_START_LOST;

                if ( node.dlen!=0 )
                {
                    // indel: frameshift unless the length change is a multiple of three
                    if ( node.dlen%3 )
                        csq_type |= CSQ_FRAMESHIFT_VARIANT;
                    else if ( node.dlen<0 )
                        csq_type |= CSQ_INFRAME_DELETION;
                    else
                        csq_type |= CSQ_INFRAME_INSERTION;
                    if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' )
                        csq_type |= CSQ_STOP_GAINED;
                }
                else
                {
                    // substitution: compare the peptides residue by residue
                    int aa_change = 0;
                    for (k=0; k<tref->l; k++)
                    {
                        if ( tref->s[k] == tseq->s[k] ) continue;
                        aa_change = 1;
                        if ( tref->s[k] == '*' )
                            csq_type |= CSQ_STOP_LOST;
                        else if ( tseq->s[k] == '*' )
                            csq_type |= CSQ_STOP_GAINED;
                        else
                            csq_type |= CSQ_MISSENSE_VARIANT;
                    }
                    if ( !aa_change )
                        csq_type |= CSQ_SYNONYMOUS_VARIANT;
                }

                if ( csq_type & CSQ_COMPOUND )
                {
                    // build the amino-acid and variant part of the annotation: |pos+ref_aa[>pos+alt_aa]|pos+var
                    kstring_t str = {0,0,0};
                    int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
                    int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (TSCRIPT_AUX(tr)->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
                    kputc_('|', &str);
                    kputw(aa_rbeg, &str);
                    kprint_aa_prediction(args,aa_rbeg,tref,&str);
                    if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
                    {
                        kputc_('>', &str);
                        kputw(aa_sbeg, &str);
                        kprint_aa_prediction(args,aa_sbeg,tseq,&str);
                    }
                    kputc_('|', &str);
                    kputw(rec->pos+1, &str);
                    kputs(node.var, &str);
                    csq.type.vstr = str;
                    csq.type.type = csq_type & CSQ_COMPOUND;
                    csq_stage(args, &csq, rec);

                    // keep the string on the root node's csq_list
                    // NOTE(review): presumably so that it is freed together with the transcript — confirm in hap_destroy()
                    if ( !TSCRIPT_AUX(tr)->root )
                        TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
                    TSCRIPT_AUX(tr)->root->ncsq_list++;
                    hts_expand0(csq_t,TSCRIPT_AUX(tr)->root->ncsq_list,TSCRIPT_AUX(tr)->root->mcsq_list,TSCRIPT_AUX(tr)->root->csq_list);
                    csq_t *rm_csq = TSCRIPT_AUX(tr)->root->csq_list + TSCRIPT_AUX(tr)->root->ncsq_list - 1;
                    rm_csq->type.vstr = str;
                }
                if ( csq_type & ~CSQ_COMPOUND )
                {
                    // the remaining, non-compound types are staged without the amino-acid string
                    csq.type.type = csq_type & ~CSQ_COMPOUND;
                    csq.type.vstr.l = 0;
                    csq_stage(args, &csq, rec);
                }
            }
            free(node.seq);
            free(node.var);
        }
    }
    return ret;
}
|
|
|
/*
 *  Add the variants of a record to the haplotype trees of all coding
 *  transcripts whose CDS the record overlaps.
 *
 *  On the first hit in a transcript its auxiliary data are created: the
 *  padded reference, the root node and the per-haplotype leaf pointers
 *  (one haplotype with PHASE_DROP_GT, two per sample otherwise), and the
 *  transcript is inserted into the heap of active transcripts. The buffer
 *  `vbuf` is marked to be kept until the end of the transcript.
 *
 *  With PHASE_DROP_GT only the first ALT allele is added, as a single chain
 *  of nodes. Otherwise genotypes are read and each haplotype carrying a
 *  usable ALT allele is extended by a child node; haplotypes which share the
 *  same parent and allele in this record share the child. Unphased
 *  heterozygous genotypes are treated according to args->phase. Splice-site
 *  nodes (HAP_SSS) are staged immediately and not added to the tree.
 *  Variants rejected by hap_init() with the return value 1 are reported as
 *  overlapping and skipped.
 *
 *  Returns 1 if the record overlaps a CDS of a coding transcript, 0 otherwise.
 */
int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf)
{
    static int overlaps_warned = 0, multiploid_warned = 0;

    int i, ret = 0, hap_ret;
    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));

    if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
    while ( regitr_overlap(args->itr) )
    {
        gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
        gf_tscript_t *tr = cds->tr;
        if ( !GF_is_coding(tr->type) ) continue;
        if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end;
        ret = 1;

        if ( !TSCRIPT_AUX(tr) )
        {
            // first variant in this transcript: set up the reference and an empty haplotype tree
            tr->aux = calloc(sizeof(tscript_t),1);
            tscript_init_ref(args, tr, chr);

            TSCRIPT_AUX(tr)->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
            TSCRIPT_AUX(tr)->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n;
            TSCRIPT_AUX(tr)->hap  = (hap_node_t**) malloc(TSCRIPT_AUX(tr)->nhap*sizeof(hap_node_t*));
            for (i=0; i<TSCRIPT_AUX(tr)->nhap; i++) TSCRIPT_AUX(tr)->hap[i] = NULL;
            TSCRIPT_AUX(tr)->root->nend = TSCRIPT_AUX(tr)->nhap;    // all haplotypes end in the root initially
            TSCRIPT_AUX(tr)->root->type = HAP_ROOT;

            khp_insert(trhp, args->active_tr, &tr);
        }

        sanity_check_ref(args, tr, rec);

        if ( args->phase==PHASE_DROP_GT )
        {
            // genotypes are ignored: a single haplotype, only the first ALT allele is considered
            if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
            hap_node_t *parent = TSCRIPT_AUX(tr)->hap[0] ? TSCRIPT_AUX(tr)->hap[0] : TSCRIPT_AUX(tr)->root;
            hap_node_t *child  = (hap_node_t*)calloc(1,sizeof(hap_node_t));
            hap_ret = hap_init(args, parent, child, cds, rec, 1);
            if ( hap_ret!=0 )
            {
                // the variant cannot be applied
                if ( hap_ret!=1 )
                    ret = 1;
                else
                {
                    if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) )
                    {
                        fprintf(bcftools_stderr,
                                "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n",
                                chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
                        if ( !overlaps_warned )
                            fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
                        overlaps_warned = 1;
                    }
                    if ( args->out )
                        fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
                }
                hap_destroy(child);
                continue;
            }
            if ( child->type==HAP_SSS )
            {
                // splice-site variant: staged directly, not part of the haplotype tree
                csq_t csq;
                memset(&csq, 0, sizeof(csq_t));
                csq.pos          = rec->pos;
                csq.type.biotype = tr->type;
                csq.type.strand  = tr->strand;
                csq.type.trid    = tr->id;
                csq.type.vcf_ial = 1;
                csq.type.gene    = tr->gene->name;
                csq.type.type    = child->csq;
                csq_stage(args, &csq, rec);
                hap_destroy(child);
                ret = 1;
                continue;
            }
            // append the node to the chain, the haplotype now ends in the child
            parent->nend--;
            parent->nchild = 1;
            parent->mchild = 1;
            parent->child  = (hap_node_t**) malloc(sizeof(hap_node_t*));
            parent->child[0] = child;
            TSCRIPT_AUX(tr)->hap[0] = child;
            TSCRIPT_AUX(tr)->hap[0]->nend = 1;
            continue;
        }

        // apply the variant to the haplotypes of each sample
        int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
        ngts /= bcf_hdr_nsamples(args->hdr);
        if ( ngts!=1 && ngts!=2 )
        {
            if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) )
            {
                fprintf(bcftools_stderr,
                        "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n",
                        chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
                if ( !multiploid_warned )
                    fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
                multiploid_warned = 1;
            }
            if ( args->out )
                fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
            continue;
        }
        for (ismpl=0; ismpl<args->smpl->n; ismpl++)
        {
            int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
            if ( gt[0]==bcf_gt_missing ) continue;

            int is_het = ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]);
            if ( is_het )
            {
                if ( args->phase==PHASE_MERGE )
                {
                    // replace a reference first allele with the second one
                    if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
                }
                if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
                {
                    // unphased heterozygous genotype, proceed as directed by the --phase option
                    if ( args->phase==PHASE_REQUIRE )
                        error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
                    else if ( args->phase==PHASE_SKIP )
                        continue;
                    else if ( args->phase==PHASE_NON_REF )
                    {
                        // a reference allele is replaced by the other, non-reference allele
                        if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
                        else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0];
                    }
                }
            }

            for (ihap=0; ihap<ngts; ihap++)
            {
                if ( gt[ihap]==bcf_gt_missing || gt[ihap]==bcf_int32_vector_end ) continue;

                i = 2*ismpl + ihap;     // index of the haplotype in the transcript's hap array

                int ial = bcf_gt_allele(gt[ihap]);
                if ( !ial ) continue;
                assert( ial < rec->n_allele );
                if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }

                hap_node_t *parent = TSCRIPT_AUX(tr)->hap[i] ? TSCRIPT_AUX(tr)->hap[i] : TSCRIPT_AUX(tr)->root;
                if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
                {
                    // this parent already has a child for this record and allele, reuse it
                    TSCRIPT_AUX(tr)->hap[i] = parent->child[ parent->cur_child[ial] ];
                    TSCRIPT_AUX(tr)->hap[i]->nend++;
                    parent->nend--;
                    continue;
                }

                hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
                hap_ret = hap_init(args, parent, child, cds, rec, ial);
                if ( hap_ret!=0 )
                {
                    // the variant cannot be applied
                    if ( hap_ret==1 )
                    {
                        if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) )
                        {
                            fprintf(bcftools_stderr,
                                    "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n",
                                    chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
                            if ( !overlaps_warned )
                                fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
                            overlaps_warned = 1;
                        }
                        if ( args->out )
                            fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n",
                                    chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
                    }
                    hap_destroy(child);
                    continue;
                }
                if ( child->type==HAP_SSS )
                {
                    // splice-site variant: staged directly, not part of the haplotype tree
                    csq_t csq;
                    memset(&csq, 0, sizeof(csq_t));
                    csq.pos          = rec->pos;
                    csq.type.biotype = tr->type;
                    csq.type.strand  = tr->strand;
                    csq.type.trid    = tr->id;
                    csq.type.vcf_ial = ial;
                    csq.type.gene    = tr->gene->name;
                    csq.type.type    = child->csq;
                    csq_stage(args, &csq, rec);
                    hap_destroy(child);
                    continue;
                }
                if ( parent->cur_rec!=rec )
                {
                    // first child of this parent in this record: reset the allele -> child lookup
                    hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
                    for (j=0; j<rec->n_allele; j++) parent->cur_child[j] = -1;
                    parent->cur_rec = rec;
                }

                // attach the new child, the haplotype now ends in it
                j = parent->nchild++;
                hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
                parent->cur_child[ial] = j;
                parent->child[j] = child;
                TSCRIPT_AUX(tr)->hap[i] = child;
                TSCRIPT_AUX(tr)->hap[i]->nend++;
                parent->nend--;
            }
        }
    }
    return ret;
}
|
|
|
/*
 *  Register a consequence with its VCF record and output it for the samples
 *  that carry the allele.
 *
 *  The consequence is first passed to csq_push(). With PHASE_DROP_GT a
 *  non-zero return value of csq_push() ends the processing (presumably the
 *  consequence was not newly added — confirm in csq_push()).
 *
 *  When genotypes are not used or not available, the consequence is printed
 *  once without a sample in text mode and nothing else is done. Otherwise
 *  each haplotype of each selected sample whose allele equals
 *  csq->type.vcf_ial is reported: as a "CSQ" line in text mode, or by setting
 *  bit 2*csq->idx + haplotype in the sample's row of the FORMAT bitmask in
 *  VCF mode. Consequences beyond args->ncsq2_max bits are dropped with a
 *  warning.
 *
 *  The bitmask row is addressed by the header sample index
 *  (args->smpl->idx[i]), not by the index within the selected subset: the
 *  bitmask has one row per header sample (see vbuf_push and vbuf_flush) and
 *  hap_stage_vcf addresses it the same way.
 */
void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
{
    if ( csq_push(args, csq, rec)!=0 && args->phase==PHASE_DROP_GT ) return;

    int i,j,ngt = 0;
    if ( args->phase!=PHASE_DROP_GT )
    {
        ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
        if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr);    // number of GT values per sample
    }
    if ( ngt<=0 )
    {
        // no genotypes: the consequence can be reported only at the site level
        if ( args->output_type==FT_TAB_TEXT )
            csq_print_text(args, csq, -1,0);
        return;
    }
    assert( ngt<=2 );

    if ( args->output_type==FT_TAB_TEXT )
    {
        for (i=0; i<args->smpl->n; i++)
        {
            int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
            for (j=0; j<ngt; j++)
            {
                if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
                int ial = bcf_gt_allele(gt[j]);
                if ( !ial || ial!=csq->type.vcf_ial ) continue;
                csq_print_text(args, csq, args->smpl->idx[i],j+1);
            }
        }
        return;
    }

    vrec_t *vrec = csq->vrec;
    for (i=0; i<args->smpl->n; i++)
    {
        int ismpl   = args->smpl->idx[i];       // index of the sample in the VCF header
        int32_t *gt = args->gt_arr + ismpl*ngt;
        for (j=0; j<ngt; j++)
        {
            if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
            int ial = bcf_gt_allele(gt[j]);
            if ( !ial || ial!=csq->type.vcf_ial ) continue;

            int icsq2 = 2*csq->idx + j;
            if ( icsq2 >= args->ncsq2_max )
            {
                // more consequences than fit into the bitmask
                if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
                {
                    fprintf(bcftools_stderr,
                            "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
                            args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1);
                    if ( !args->ncsq2_small_warned )
                        fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
                    args->ncsq2_small_warned = 1;
                }
                if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
                break;
            }
            int ival, ibit;
            icsq2_to_bit(icsq2, &ival,&ibit);
            if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
            vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
        }
    }
}
|
/*
 *  Stage a 5' or 3' UTR consequence for every ALT allele of the record that
 *  splice_csq() places inside or overlapping a UTR region. Symbolic alleles
 *  and the '*' allele are ignored.
 *
 *  Returns 1 if at least one consequence was staged, 0 otherwise.
 */
int test_utr(args_t *args, bcf1_t *rec)
{
    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));

    if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;

    splice_t splice;
    splice_init(&splice, rec);

    int ial, ret = 0;
    while ( regitr_overlap(args->itr) )
    {
        gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
        gf_tscript_t *tr = splice.tr = utr->tr;
        for (ial=1; ial<rec->n_allele; ial++)
        {
            char first = rec->d.allele[ial][0];
            if ( first=='<' || first=='*' ) { continue; }

            splice.vcf.alt = rec->d.allele[ial];
            splice.csq     = 0;
            int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
            if ( !(splice_ret==SPLICE_INSIDE || splice_ret==SPLICE_OVERLAP) ) continue;

            csq_t csq;
            memset(&csq, 0, sizeof(csq_t));
            csq.pos          = rec->pos;
            csq.type.type    = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
            csq.type.biotype = tr->type;
            csq.type.strand  = tr->strand;
            csq.type.trid    = tr->id;
            csq.type.vcf_ial = ial;
            csq.type.gene    = tr->gene->name;
            csq_stage(args, &csq, rec);
            ret = 1;
        }
    }
    // the ref/alt buffers of the splice structure are expected to stay unallocated here
    assert(!splice.kref.s);
    assert(!splice.kalt.s);
    return ret;
}
|
/*
 *  Test every ALT allele of the record for splice-site consequences in the
 *  exons it overlaps. Only transcripts with at least one CDS are considered.
 *  The acceptor and donor checks are enabled; the checks at an exon boundary
 *  are disabled where the boundary coincides with the transcript start or
 *  end. The consequences themselves are determined by splice_csq().
 *
 *  Symbolic alleles and the '*' allele are skipped individually, so that they
 *  do not prevent the remaining alleles of a multiallelic record from being
 *  tested.
 *
 *  Returns 1 if splice_csq() set a consequence for at least one allele,
 *  0 otherwise.
 */
int test_splice(args_t *args, bcf1_t *rec)
{
    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
    if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;

    splice_t splice;
    splice_init(&splice, rec);
    splice.check_acceptor = splice.check_donor = 1;

    int i, ret = 0;
    while ( regitr_overlap(args->itr) )
    {
        gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
        splice.tr = exon->tr;
        if ( !splice.tr->ncds ) continue;   // no CDS in this transcript

        // no splice site at the very beginning or end of the transcript
        splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
        splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;

        for (i=1; i<rec->n_allele; i++)
        {
            // test the i-th allele, not the first ALT
            if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
            splice.vcf.alt = rec->d.allele[i];
            splice.csq     = 0;
            splice_csq(args, &splice, exon->beg, exon->end);
            if ( splice.csq ) ret = 1;
        }
    }
    free(splice.kref.s);
    free(splice.kalt.s);
    return ret;
}
|
/*
 *  Stage a transcript-level consequence for every ALT allele of the record
 *  that splice_csq() places inside or overlapping a transcript: CSQ_INTRON
 *  for coding transcripts, CSQ_NON_CODING for the others. Symbolic alleles
 *  and the '*' allele are ignored.
 *
 *  NOTE(review): csq.type.vcf_ial is left at 0 here, unlike in test_utr()
 *  which sets it to the allele index. csq_stage() attributes a consequence
 *  only to haplotypes whose allele equals vcf_ial — confirm this is intended.
 *
 *  Returns 1 if at least one consequence was staged, 0 otherwise.
 */
int test_tscript(args_t *args, bcf1_t *rec)
{
    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));
    if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;

    splice_t splice;
    splice_init(&splice, rec);

    int ial, ret = 0;
    while ( regitr_overlap(args->itr) )
    {
        gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
        for (ial=1; ial<rec->n_allele; ial++)
        {
            char first = rec->d.allele[ial][0];
            if ( first=='<' || first=='*' ) { continue; }

            splice.vcf.alt = rec->d.allele[ial];
            splice.csq     = 0;
            int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
            if ( !(splice_ret==SPLICE_INSIDE || splice_ret==SPLICE_OVERLAP) ) continue;

            csq_t csq;
            memset(&csq, 0, sizeof(csq_t));
            csq.pos          = rec->pos;
            csq.type.type    = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
            csq.type.biotype = tr->type;
            csq.type.strand  = tr->strand;
            csq.type.trid    = tr->id;
            csq.type.gene    = tr->gene->name;
            csq_stage(args, &csq, rec);
            ret = 1;
        }
    }
    // the ref/alt buffers of the splice structure are expected to stay unallocated here
    assert(!splice.kref.s);
    assert(!splice.kalt.s);
    return ret;
}
|
|
|
/*
 *  Predict consequences of a record with a symbolic ALT allele; the first
 *  ALT allele is the one used. All consequences are combined with the
 *  CSQ_ELONGATION class and evaluated for the single position rec->pos + 1.
 *
 *  The position is tested against the CDS, UTR and exon indexes in turn.
 *  Transcript-level consequences (intron or non-coding) are staged only when
 *  none of the previous tests produced a hit. A warning that the support is
 *  experimental is printed once, unless verbosity is zero.
 */
void test_symbolic_alt(args_t *args, bcf1_t *rec)
{
    static int warned = 0;
    if ( !warned && args->verbosity > 0 )
    {
        fprintf(bcftools_stderr,"Warning: The support for symbolic ALT insertions is experimental.\n");
        warned = 1;
    }

    const char *chr = drop_chr_prefix(args, bcf_seqname(args->hdr,rec));

    // the coordinate immediately following the record position
    int beg = rec->pos + 1;
    int end = beg;
    int csq_class = CSQ_ELONGATION;

    int hit = 0;

    // coding sequence
    if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) )
    {
        while ( regitr_overlap(args->itr) )
        {
            gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
            gf_tscript_t *tr = cds->tr;

            csq_t csq;
            memset(&csq, 0, sizeof(csq_t));
            csq.type.type    = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class;
            csq.pos          = rec->pos;
            csq.type.biotype = tr->type;
            csq.type.strand  = tr->strand;
            csq.type.trid    = tr->id;
            csq.type.gene    = tr->gene->name;
            csq_stage(args, &csq, rec);
            hit = 1;
        }
    }

    // untranslated regions
    if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) )
    {
        while ( regitr_overlap(args->itr) )
        {
            gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
            gf_tscript_t *tr = utr->tr;

            csq_t csq;
            memset(&csq, 0, sizeof(csq_t));
            csq.type.type    = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class;
            csq.pos          = rec->pos;
            csq.type.biotype = tr->type;
            csq.type.strand  = tr->strand;
            csq.type.trid    = tr->id;
            csq.type.gene    = tr->gene->name;
            csq_stage(args, &csq, rec);
            hit = 1;
        }
    }

    // splice sites, the consequences are determined by splice_csq()
    if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) )
    {
        splice_t splice;
        splice_init(&splice, rec);
        splice.check_acceptor = splice.check_donor = 1;

        while ( regitr_overlap(args->itr) )
        {
            gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
            splice.tr = exon->tr;
            if ( !splice.tr->ncds ) continue;   // no CDS in this transcript
            splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
            splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
            splice.vcf.alt = rec->d.allele[1];
            splice.csq     = csq_class;
            splice_csq(args, &splice, exon->beg, exon->end);
            if ( splice.csq ) hit = 1;
        }
    }

    // transcripts, tested only if nothing was found above
    if ( hit ) return;
    if ( !regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) ) return;

    splice_t splice;
    splice_init(&splice, rec);

    while ( regitr_overlap(args->itr) )
    {
        csq_t csq;
        memset(&csq, 0, sizeof(csq_t));
        gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
        splice.vcf.alt = rec->d.allele[1];
        splice.csq     = csq_class;
        int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
        if ( !(splice_ret==SPLICE_INSIDE || splice_ret==SPLICE_OVERLAP) ) continue;

        csq.type.type    = (GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING) | csq_class;
        csq.pos          = rec->pos;
        csq.type.biotype = tr->type;
        csq.type.strand  = tr->strand;
        csq.type.trid    = tr->id;
        csq.type.gene    = tr->gene->name;
        csq_stage(args, &csq, rec);
    }
}
|
|
|
/*
 *  Debugging helper: write a snapshot of the VCF buffering state to
 *  bcftools_stderr.
 *
 *  @args:  program state; reads vcf_rbuf, vcf_buf, pos2vbuf and active_tr
 *  @pos:   value echoed in the first line of the report so that individual
 *          dumps can be told apart
 *
 *  The report lists every buffer reachable through the ring buffer together
 *  with the 1-based position and the nvcsq counter of each record it holds,
 *  then the keys present in the pos2vbuf hash (printed as key+1), and finally
 *  the number of elements in the active_tr heap.
 */
void debug_print_buffers(args_t *args, int pos)
{
    int ibuf, irec;
    khint_t iter;

    fprintf(bcftools_stderr,"debug_print_buffers at %d\n", pos);
    fprintf(bcftools_stderr,"vbufs:\n");

    for (ibuf=0; ibuf<args->vcf_rbuf.n; ibuf++)
    {
        // translate the logical ring-buffer index into a slot of vcf_buf
        vbuf_t *buf = args->vcf_buf[ rbuf_kth(&args->vcf_rbuf, ibuf) ];

        fprintf(bcftools_stderr,"\tvbuf %d:\n", ibuf);

        irec = 0;
        while ( irec < buf->n )
        {
            vrec_t *vr = buf->vrec[irec];
            fprintf(bcftools_stderr,"\t\t%"PRId64" .. nvcsq=%d\n", (int64_t) vr->line->pos+1, vr->nvcsq);
            irec++;
        }
    }

    fprintf(bcftools_stderr,"pos2vbuf:");
    for (iter = kh_begin(args->pos2vbuf); iter < kh_end(args->pos2vbuf); ++iter)
    {
        // skip empty and deleted hash slots
        if ( !kh_exist(args->pos2vbuf, iter) ) continue;
        fprintf(bcftools_stderr," %d",1+(int)kh_key(args->pos2vbuf, iter));
    }
    fprintf(bcftools_stderr,"\n");

    fprintf(bcftools_stderr,"active_tr: %d\n", args->active_tr->ndat);
}
|
|
|
/*
 *  Process a single VCF record, or flush all buffered data.
 *
 *  @args:      program state
 *  @rec_ptr:   address of the record to process; NULL signals the end of
 *              input and triggers a flush of everything still buffered
 *
 *  For each record the function
 *    - verifies on every chromosome change that the sequence is present in
 *      the fasta reference (also trying the name with the "chr" prefix
 *      dropped or added via drop_chr_prefix/add_chr_prefix)
 *    - verifies that positions within a chromosome never decrease
 *    - decides whether consequences should be called for the record at all
 *    - pushes the record into the VCF buffer, runs the CDS/UTR/splice/
 *      transcript tests (or the symbolic-allele test), and flushes whatever
 *      lies before the current position
 *
 *  The chromosome and position of the previous record are kept in static
 *  variables; they are reset on the final flush so that a later run within
 *  the same process does not inherit stale state.
 */
static void process(args_t *args, bcf1_t **rec_ptr)
{
    // rid and position of the previously seen record
    static int32_t prev_rid = -1;
    static int64_t prev_pos = -1;

    if ( !rec_ptr )
    {
        // end of input: flush everything
        hap_flush(args, REGIDX_MAX);
        vbuf_flush(args, REGIDX_MAX);
        prev_rid = -1;
        prev_pos = -1;
        return;
    }

    bcf1_t *rec = *rec_ptr;
    if ( prev_rid!=rec->rid )
    {
        prev_rid = rec->rid;
        prev_pos = rec->pos;

        // Sanity check run once per chromosome: the sequence must exist in the
        // reference, either under the VCF name or with the chr prefix toggled
        if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
        {
            if ( !faidx_has_seq(args->fai,drop_chr_prefix(args,bcf_seqname(args->hdr,rec))) && !faidx_has_seq(args->fai,add_chr_prefix(args,bcf_seqname(args->hdr,rec))) )
                error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
        }
    }
    if ( prev_pos > rec->pos )
        error("Error: The file is not sorted, %s:%"PRId64" comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);

    // Remember the position so that the next record is compared against this
    // one rather than against the first record of the chromosome
    prev_pos = rec->pos;

    // Decide if consequences are to be called for this record
    int call_csq = 1;
    if ( rec->n_allele < 2 ) call_csq = 0;   // no alternate allele
    else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0;     // the only ALT is "*" or has '*' as the second character, e.g. "<*>"
    else if ( rec->d.allele[1][0]=='<' )
    {
        // of the symbolic alleles only those starting with "<INS" are considered
        if ( strncmp("<INS",rec->d.allele[1], 4) ) call_csq = 0;
    }
    if ( call_csq && args->filter )
    {
        call_csq = filter_test(args->filter, rec, NULL);
        if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1;
    }
    if ( !call_csq )
    {
        // The record is only passed through; without out_fh there is nothing to do
        if ( !args->out_fh ) return;
        vbuf_push(args, rec_ptr);
        hap_flush(args, rec->pos-1);
        vbuf_flush(args, rec->pos-1);
        return;
    }

    if ( args->rid != rec->rid )
    {
        // new chromosome: finish everything buffered for the previous one
        hap_flush(args, REGIDX_MAX);
        vbuf_flush(args, REGIDX_MAX);
    }
    args->rid = rec->rid;
    vbuf_t *vbuf = vbuf_push(args, rec_ptr);

    if ( rec->d.allele[1][0]!='<' )
    {
        int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec, vbuf);
        hit += test_utr(args, rec);
        hit += test_splice(args, rec);
        if ( !hit ) test_tscript(args, rec);    // only when none of the above tests reported a hit
    }
    else
        test_symbolic_alt(args, rec);

    if ( rec->pos > 0 )
    {
        hap_flush(args, rec->pos-1);
        vbuf_flush(args, rec->pos-1);
    }

    return;
}
|
|
|
/*
 *  Return the help text of the csq command. The string has static storage
 *  duration and must not be modified or freed by the caller.
 */
static const char *usage(void)
{
    static const char help_text[] =
        "\n"
        "About: Haplotype-aware consequence caller.\n"
        "Usage: bcftools csq [OPTIONS] in.vcf\n"
        "\n"
        "Required options:\n"
        " -f, --fasta-ref FILE Reference file in fasta format\n"
        " -g, --gff-annot FILE GFF3 annotation file\n"
        "\n"
        "CSQ options:\n"
        " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
        " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
        " -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
        " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
        " -p, --phase a|m|r|R|s How to handle unphased heterozygous genotypes: [r]\n"
        " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
        " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
        " r: require phased GTs, throw an error on unphased het GTs\n"
        " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
        " s: skip unphased hets\n"
        "GFF options:\n"
        " --dump-gff FILE.gz Dump the parsed GFF file (for debugging purposes)\n"
        " --force Run even if some sanity checks fail\n"
        " --unify-chr-names 1|0 Automatically unify chromosome naming (e.g. chrX vs X) in GFF, fasta, and VCF [1]\n"
        "General options:\n"
        " -e, --exclude EXPR Exclude sites for which the expression is true\n"
        " -i, --include EXPR Select sites for which the expression is true\n"
        " --no-version Do not append version and command line to the header\n"
        " -o, --output FILE Write output to a file [standard output]\n"
        " -O, --output-type b|u|z|v|t[0-9] b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
        " v: uncompressed VCF, t: plain tab-delimited text output, 0-9: compression level [v]\n"
        " -r, --regions REGION Restrict to comma-separated list of regions\n"
        " -R, --regions-file FILE Restrict to regions listed in a file\n"
        " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
        " -s, --samples -|LIST Samples to include or \"-\" to apply all variants and ignore samples\n"
        " -S, --samples-file FILE Samples to include\n"
        " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"
        " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
        " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
        " --threads INT Use multithreading with <int> worker threads [0]\n"
        " -v, --verbose INT Verbosity level 0-2 [1]\n"
        " --write-index Automatically index the output files [off]\n"
        "\n"
        "Example:\n"
        " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
        "\n"
        " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
        " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
        " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
        "\n";

    return help_text;
}
|
|
|
int main_csq(int argc, char *argv[]) |
|
{ |
|
args_t *args = (args_t*) calloc(1,sizeof(args_t)); |
|
args->argc = argc; args->argv = argv; |
|
args->output_type = FT_VCF; |
|
args->bcsq_tag = "BCSQ"; |
|
args->ncsq2_max = 2*(16-1); |
|
args->verbosity = 1; |
|
args->record_cmd_line = 1; |
|
args->clevel = -1; |
|
args->unify_chr_names = 1; |
|
|
|
static struct option loptions[] = |
|
{ |
|
{"force",0,0,1}, |
|
{"threads",required_argument,NULL,2}, |
|
{"help",0,0,'h'}, |
|
{"ncsq",1,0,'n'}, |
|
{"brief-predictions",no_argument,0,'b'}, |
|
{"trim-protein-seq",required_argument,0,'B'}, |
|
{"custom-tag",1,0,'c'}, |
|
{"local-csq",0,0,'l'}, |
|
{"gff-annot",1,0,'g'}, |
|
{"fasta-ref",1,0,'f'}, |
|
{"include",1,0,'i'}, |
|
{"exclude",1,0,'e'}, |
|
{"output",1,0,'o'}, |
|
{"output-type",1,NULL,'O'}, |
|
{"phase",1,0,'p'}, |
|
{"quiet",0,0,'q'}, |
|
{"verbose",1,0,'v'}, |
|
{"regions",1,0,'r'}, |
|
{"regions-file",1,0,'R'}, |
|
{"regions-overlap",required_argument,NULL,4}, |
|
{"samples",1,0,'s'}, |
|
{"samples-file",1,0,'S'}, |
|
{"targets",1,0,'t'}, |
|
{"targets-file",1,0,'T'}, |
|
{"targets-overlap",required_argument,NULL,5}, |
|
{"no-version",no_argument,NULL,3}, |
|
{"write-index",no_argument,NULL,6}, |
|
{"dump-gff",required_argument,NULL,7}, |
|
{"unify-chr-names",required_argument,NULL,8}, |
|
{0,0,0,0} |
|
}; |
|
int c, targets_is_file = 0, regions_is_file = 0; |
|
int regions_overlap = 1; |
|
int targets_overlap = 0; |
|
char *targets_list = NULL, *regions_list = NULL, *tmp; |
|
while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0) |
|
{ |
|
switch (c) |
|
{ |
|
case 1 : args->force = 1; break; |
|
case 2 : |
|
args->n_threads = strtol(optarg,&tmp,10); |
|
if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); |
|
break; |
|
case 3 : args->record_cmd_line = 0; break; |
|
case 'b': |
|
args->brief_predictions = 1; |
|
fprintf(bcftools_stderr,"Warning: The -b option will be removed in future versions. Please use -B 1 instead.\n"); |
|
break; |
|
case 'B': |
|
args->brief_predictions = strtol(optarg,&tmp,10); |
|
if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg); |
|
break; |
|
case 'l': args->local_csq = 1; break; |
|
case 'c': args->bcsq_tag = optarg; break; |
|
case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; |
|
case 'v': |
|
args->verbosity = atoi(optarg); |
|
if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); |
|
break; |
|
case 'p': |
|
switch (optarg[0]) |
|
{ |
|
case 'a': args->phase = PHASE_AS_IS; break; |
|
case 'm': args->phase = PHASE_MERGE; break; |
|
case 'r': args->phase = PHASE_REQUIRE; break; |
|
case 'R': args->phase = PHASE_NON_REF; break; |
|
case 's': args->phase = PHASE_SKIP; break; |
|
default: error("The -p code \"%s\" not recognised\n", optarg); |
|
} |
|
break; |
|
case 'f': args->fa_fname = optarg; break; |
|
case 'g': args->gff_fname = optarg; break; |
|
case 'n': |
|
args->ncsq2_max = 2 * atoi(optarg); |
|
if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg); |
|
break; |
|
case 'o': args->output_fname = optarg; break; |
|
case 'O': |
|
switch (optarg[0]) { |
|
case 't': args->output_type = FT_TAB_TEXT; break; |
|
case 'b': args->output_type = FT_BCF_GZ; break; |
|
case 'u': args->output_type = FT_BCF; break; |
|
case 'z': args->output_type = FT_VCF_GZ; break; |
|
case 'v': args->output_type = FT_VCF; break; |
|
default: |
|
{ |
|
args->clevel = strtol(optarg,&tmp,10); |
|
if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); |
|
} |
|
} |
|
if ( optarg[1] ) |
|
{ |
|
args->clevel = strtol(optarg+1,&tmp,10); |
|
if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); |
|
} |
|
break; |
|
case 'e': |
|
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); |
|
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; |
|
case 'i': |
|
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); |
|
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; |
|
case 'r': regions_list = optarg; break; |
|
case 'R': regions_list = optarg; regions_is_file = 1; break; |
|
case 's': args->sample_list = optarg; break; |
|
case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; |
|
case 't': targets_list = optarg; break; |
|
case 'T': targets_list = optarg; targets_is_file = 1; break; |
|
case 4 : |
|
regions_overlap = parse_overlap_option(optarg); |
|
if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); |
|
break; |
|
case 5 : |
|
targets_overlap = parse_overlap_option(optarg); |
|
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); |
|
break; |
|
case 6 : args->write_index = 1; break; |
|
case 7 : args->dump_gff = optarg; break; |
|
case 8 : |
|
if ( !strcmp(optarg,"0") ) args->unify_chr_names = 0; |
|
else if ( !strcmp(optarg,"1") ) args->unify_chr_names = 1; |
|
else error("Could not parse: --unify-chr-names %s\n",optarg); |
|
break; |
|
case 'h': |
|
case '?': error("%s",usage()); |
|
default: error("The option not recognised: %s\n\n", optarg); break; |
|
} |
|
} |
|
char *fname = NULL; |
|
if ( optind==argc ) |
|
{ |
|
if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; |
|
else error("%s", usage()); |
|
} |
|
else fname = argv[optind]; |
|
if ( argc - optind>1 ) error("%s", usage()); |
|
if ( !args->fa_fname ) error("Missing the --fa-ref option\n"); |
|
if ( !args->gff_fname ) error("Missing the --gff option\n"); |
|
args->sr = bcf_sr_init(); |
|
if ( targets_list ) |
|
{ |
|
bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,targets_overlap); |
|
if ( bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 ) |
|
error("Failed to read the targets: %s\n", targets_list); |
|
} |
|
if ( regions_list ) |
|
{ |
|
bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,regions_overlap); |
|
if ( bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) |
|
error("Failed to read the regions: %s\n", regions_list); |
|
} |
|
if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads); |
|
if ( !bcf_sr_add_reader(args->sr, fname) ) |
|
error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum)); |
|
args->hdr = bcf_sr_get_header(args->sr,0); |
|
|
|
init_data(args); |
|
while ( bcf_sr_next_line(args->sr) ) |
|
{ |
|
process(args, &args->sr->readers[0].buffer[0]); |
|
} |
|
process(args,NULL); |
|
|
|
destroy_data(args); |
|
bcf_sr_destroy(args->sr); |
|
free(args); |
|
return 0; |
|
} |
|
|
|
|