#include "htslib/vcfutils.h" int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) { int i; for (i=0; in_allele; i++) ac[i]=0; // Use INFO/AC,AN field only when asked if ( which&BCF_UN_INFO ) { bcf_unpack(line, BCF_UN_INFO); int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN"); int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC"); int i, an=-1, ac_len=0, ac_type=0; uint8_t *ac_ptr=NULL; if ( an_id>=0 && ac_id>=0 ) { for (i=0; in_info; i++) { bcf_info_t *z = &line->d.info[i]; if ( z->key == an_id ) an = z->v1.i; else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } } } if ( an>=0 && ac_ptr ) { int nac = 0; #define BRANCH_INT(type_t) { \ type_t *p = (type_t *) ac_ptr; \ for (i=0; iid[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT assert( an>=nac ); // sanity check for missing values ac[0] = an - nac; return 1; } } // Split genotype fields only when asked if ( which&BCF_UN_FMT ) { int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT"); if ( gt_id<0 ) return 0; bcf_unpack(line, BCF_UN_FMT); bcf_fmt_t *fmt_gt = NULL; for (i=0; i<(int)line->n_fmt; i++) if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } if ( !fmt_gt ) return 0; #define BRANCH_INT(type_t,missing,vector_end) { \ for (i=0; in_sample; i++) \ { \ type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ int ial; \ for (ial=0; ialn; ial++) \ { \ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \ ac[(p[ial]>>1)-1]++; \ } \ } \ } switch (fmt_gt->type) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT return 1; } return 0; } int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal) { int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0; #define BRANCH_INT(type_t,missing,vector_end) { \ type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ for (i=0; in; i++) \ { \ if ( p[i] == vector_end ) break; /* smaller ploidy */ \ if ( !p[i] || p[i] == missing ) continue; /* missing allele */ \ int tmp = p[i]>>1; \ if ( tmp>1 ) \ { \ if ( !ial ) { ial = tmp; has_alt = 1; } \ else if ( tmp!=ial ) \ { \ if ( tmptype) { case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT if ( _ial ) *_ial = ial>0 ? ial-1 : ial; if ( _jal ) *_jal = jal>0 ? jal-1 : jal; if ( !nals ) return GT_UNKN; if ( nals==1 ) return has_ref ? GT_HAPL_R : GT_HAPL_A; if ( !has_ref ) return has_alt==1 ? GT_HOM_AA : GT_HET_AA; if ( !has_alt ) return GT_HOM_RR; return GT_HET_RA; } int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) { int i; bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT"); if ( !gt ) return 0; int *ac = (int*) calloc(line->n_allele,sizeof(int)); // check if all alleles are populated #define BRANCH(type_t,missing,vector_end) { \ for (i=0; in_sample; i++) \ { \ type_t *p = (type_t*) (gt->p + i*gt->size); \ int ial; \ for (ial=0; ialsize; ial++) \ { \ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \ if ( (p[ial]>>1)-1 >= line->n_allele ) return -1; \ ac[(p[ial]>>1)-1]++; \ } \ } \ } switch (gt->type) { case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH int rm_als = 0, nrm = 0; for (i=1; in_allele; i++) { if ( !ac[i] ) { rm_als |= 1<n_allele, sizeof(int)); // create map of indexes from old to new ALT numbering and modify ALT kstring_t str = {0,0,0}; kputs(line->d.allele[0], &str); int nrm = 0, i,j; // i: ori alleles, j: new alleles for (i=1, j=1; in_allele; i++) { if ( rm_mask & 1<d.allele[i] = NULL; nrm++; continue; } kputc(',', &str); kputs(line->d.allele[i], &str); map[i] = j; j++; } if ( !nrm ) { free(map); free(str.s); return; } int nR_ori = line->n_allele; int nR_new = line->n_allele-nrm; assert(nR_new > 0); // should not be able to remove reference allele int nA_ori = nR_ori-1; int nA_new = nR_new-1; int nG_ori = nR_ori*(nR_ori + 1)/2; int nG_new = nR_new*(nR_new + 1)/2; bcf_update_alleles_str(header, line, str.s); // remove from Number=G, Number=R and Number=A INFO fields. uint8_t *dat = NULL; int mdat = 0, ndat = 0, mdat_bytes = 0, nret; for (i=0; in_info; i++) { bcf_info_t *info = &line->d.info[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); if ( type==BCF_HT_FLAG ) continue; int size = 1; if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; mdat = mdat_bytes / size; nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type); mdat_bytes = mdat * size; if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } if ( type==BCF_HT_STR ) { str.l = 0; char *ss = (char*) dat, *se = (char*) dat; if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int nexp, inc = 0; if ( vlen==BCF_VL_A ) { nexp = nA_ori; inc = 1; } else nexp = nR_ori; for (j=0; jkey), (void*)str.s, str.l, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } continue; } if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int inc = 0, ntop; if ( vlen==BCF_VL_A ) { assert( nret==nA_ori ); ntop = nA_ori; ndat = nA_new; inc = 1; } else { assert( nret==nR_ori ); ntop = nR_ori; ndat = nR_new; } int k = 0; #define BRANCH(type_t,is_vector_end) \ { \ type_t *ptr = (type_t*) dat; \ int size = sizeof(type_t); \ for (j=0; jkey), (void*)dat, ndat, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); exit(1); } } // Update GT fields, the allele indexes might have changed for (i=1; in_allele; i++) if ( map[i]!=i ) break; if ( in_allele ) { mdat = mdat_bytes / 4; // sizeof(int32_t) nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat); mdat_bytes = mdat * 4; if ( nret>0 ) { nret /= line->n_sample; int32_t *ptr = (int32_t*) dat; for (i=0; in_sample; i++) { for (j=0; j=0 ); ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); } ptr += nret; } bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); } } // Remove from Number=G, Number=R and Number=A FORMAT fields. // Assuming haploid or diploid GTs for (i=0; in_fmt; i++) { bcf_fmt_t *fmt = &line->d.fmt[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); if ( type==BCF_HT_FLAG ) continue; int size = 1; if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; mdat = mdat_bytes / size; nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type); mdat_bytes = mdat * size; if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } if ( type==BCF_HT_STR ) { int size = nret/line->n_sample; // number of bytes per sample str.l = 0; if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) { int nexp, inc = 0; if ( vlen==BCF_VL_A ) { nexp = nA_ori; inc = 1; } else nexp = nR_ori; for (j=0; jn_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; int k_src = 0, k_dst = 0, l = str.l; for (k_src=0; k_src=se || !*ptr) break; while ( ptrn_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; int k_src = 0, k_dst = 0, l = str.l; int nexp = 0; // diploid or haploid? while ( ptr=se || !*ptr ) break; while ( ptr=se || !*ptr ) break; } } else // haploid { for (k_src=0; k_src=se || !*ptr ) break; while ( ptrid), (void*)str.s, str.l, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } continue; } int nori = nret / line->n_sample; if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G { int ntop, inc = 0; if ( vlen==BCF_VL_A ) { assert( nori==nA_ori ); // todo: will fail if all values are missing ntop = nA_ori; ndat = nA_new*line->n_sample; inc = 1; } else { assert( nori==nR_ori ); // todo: will fail if all values are missing ntop = nR_ori; ndat = nR_new*line->n_sample; } #define BRANCH(type_t,is_vector_end) \ { \ for (j=0; jn_sample; j++) \ { \ type_t *ptr_src = ((type_t*)dat) + j*nori; \ type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \ int size = sizeof(type_t); \ int k_src, k_dst = 0; \ for (k_src=0; k_srcn_sample; #define BRANCH(type_t,is_vector_end) \ { \ for (j=0; jn_sample; j++) \ { \ type_t *ptr_src = ((type_t*)dat) + j*nori; \ type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \ int size = sizeof(type_t); \ int ia, ib, k_dst = 0, k_src; \ int nset = 0; /* haploid or diploid? */ \ for (k_src=0; k_srcid), (void*)dat, ndat, type); if ( nret<0 ) { fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); exit(1); } } free(dat); free(str.s); free(map); }