vcfutils.c 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642
  1. #include "htslib/vcfutils.h"
  2. int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
  3. {
  4. int i;
  5. for (i=0; i<line->n_allele; i++) ac[i]=0;
  6. // Use INFO/AC,AN field only when asked
  7. if ( which&BCF_UN_INFO )
  8. {
  9. bcf_unpack(line, BCF_UN_INFO);
  10. int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
  11. int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
  12. int i, an=-1, ac_len=0, ac_type=0;
  13. uint8_t *ac_ptr=NULL;
  14. if ( an_id>=0 && ac_id>=0 )
  15. {
  16. for (i=0; i<line->n_info; i++)
  17. {
  18. bcf_info_t *z = &line->d.info[i];
  19. if ( z->key == an_id ) an = z->v1.i;
  20. else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
  21. }
  22. }
  23. if ( an>=0 && ac_ptr )
  24. {
  25. int nac = 0;
  26. #define BRANCH_INT(type_t) { \
  27. type_t *p = (type_t *) ac_ptr; \
  28. for (i=0; i<ac_len; i++) \
  29. { \
  30. ac[i+1] = p[i]; \
  31. nac += p[i]; \
  32. } \
  33. }
  34. switch (ac_type) {
  35. case BCF_BT_INT8: BRANCH_INT(int8_t); break;
  36. case BCF_BT_INT16: BRANCH_INT(int16_t); break;
  37. case BCF_BT_INT32: BRANCH_INT(int32_t); break;
  38. default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
  39. }
  40. #undef BRANCH_INT
  41. assert( an>=nac ); // sanity check for missing values
  42. ac[0] = an - nac;
  43. return 1;
  44. }
  45. }
  46. // Split genotype fields only when asked
  47. if ( which&BCF_UN_FMT )
  48. {
  49. int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
  50. if ( gt_id<0 ) return 0;
  51. bcf_unpack(line, BCF_UN_FMT);
  52. bcf_fmt_t *fmt_gt = NULL;
  53. for (i=0; i<(int)line->n_fmt; i++)
  54. if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
  55. if ( !fmt_gt ) return 0;
  56. #define BRANCH_INT(type_t,missing,vector_end) { \
  57. for (i=0; i<line->n_sample; i++) \
  58. { \
  59. type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
  60. int ial; \
  61. for (ial=0; ial<fmt_gt->n; ial++) \
  62. { \
  63. if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
  64. if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \
  65. ac[(p[ial]>>1)-1]++; \
  66. } \
  67. } \
  68. }
  69. switch (fmt_gt->type) {
  70. case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
  71. case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
  72. case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
  73. default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
  74. }
  75. #undef BRANCH_INT
  76. return 1;
  77. }
  78. return 0;
  79. }
  80. int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
  81. {
  82. int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;
  83. #define BRANCH_INT(type_t,missing,vector_end) { \
  84. type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
  85. for (i=0; i<fmt_ptr->n; i++) \
  86. { \
  87. if ( p[i] == vector_end ) break; /* smaller ploidy */ \
  88. if ( !p[i] || p[i] == missing ) continue; /* missing allele */ \
  89. int tmp = p[i]>>1; \
  90. if ( tmp>1 ) \
  91. { \
  92. if ( !ial ) { ial = tmp; has_alt = 1; } \
  93. else if ( tmp!=ial ) \
  94. { \
  95. if ( tmp<ial ) \
  96. { \
  97. jal = ial; \
  98. ial = tmp; \
  99. } \
  100. else \
  101. { \
  102. jal = tmp; \
  103. } \
  104. has_alt = 2; \
  105. } \
  106. } \
  107. else has_ref = 1; \
  108. nals++; \
  109. } \
  110. }
  111. switch (fmt_ptr->type) {
  112. case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
  113. case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
  114. case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
  115. default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
  116. }
  117. #undef BRANCH_INT
  118. if ( _ial ) *_ial = ial>0 ? ial-1 : ial;
  119. if ( _jal ) *_jal = jal>0 ? jal-1 : jal;
  120. if ( !nals ) return GT_UNKN;
  121. if ( nals==1 )
  122. return has_ref ? GT_HAPL_R : GT_HAPL_A;
  123. if ( !has_ref )
  124. return has_alt==1 ? GT_HOM_AA : GT_HET_AA;
  125. if ( !has_alt )
  126. return GT_HOM_RR;
  127. return GT_HET_RA;
  128. }
  129. int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
  130. {
  131. int i;
  132. bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT");
  133. if ( !gt ) return 0;
  134. int *ac = (int*) calloc(line->n_allele,sizeof(int));
  135. // check if all alleles are populated
  136. #define BRANCH(type_t,missing,vector_end) { \
  137. for (i=0; i<line->n_sample; i++) \
  138. { \
  139. type_t *p = (type_t*) (gt->p + i*gt->size); \
  140. int ial; \
  141. for (ial=0; ial<gt->size; ial++) \
  142. { \
  143. if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
  144. if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \
  145. if ( (p[ial]>>1)-1 >= line->n_allele ) return -1; \
  146. ac[(p[ial]>>1)-1]++; \
  147. } \
  148. } \
  149. }
  150. switch (gt->type) {
  151. case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
  152. case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
  153. case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
  154. default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
  155. }
  156. #undef BRANCH
  157. int rm_als = 0, nrm = 0;
  158. for (i=1; i<line->n_allele; i++)
  159. {
  160. if ( !ac[i] ) { rm_als |= 1<<i; nrm++; }
  161. }
  162. free(ac);
  163. if ( nrm ) bcf_remove_alleles(header, line, rm_als);
  164. return nrm;
  165. }
  166. void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
  167. {
  168. int *map = (int*) calloc(line->n_allele, sizeof(int));
  169. // create map of indexes from old to new ALT numbering and modify ALT
  170. kstring_t str = {0,0,0};
  171. kputs(line->d.allele[0], &str);
  172. int nrm = 0, i,j; // i: ori alleles, j: new alleles
  173. for (i=1, j=1; i<line->n_allele; i++)
  174. {
  175. if ( rm_mask & 1<<i )
  176. {
  177. // remove this allele
  178. line->d.allele[i] = NULL;
  179. nrm++;
  180. continue;
  181. }
  182. kputc(',', &str);
  183. kputs(line->d.allele[i], &str);
  184. map[i] = j;
  185. j++;
  186. }
  187. if ( !nrm ) { free(map); free(str.s); return; }
  188. int nR_ori = line->n_allele;
  189. int nR_new = line->n_allele-nrm;
  190. assert(nR_new > 0); // should not be able to remove reference allele
  191. int nA_ori = nR_ori-1;
  192. int nA_new = nR_new-1;
  193. int nG_ori = nR_ori*(nR_ori + 1)/2;
  194. int nG_new = nR_new*(nR_new + 1)/2;
  195. bcf_update_alleles_str(header, line, str.s);
  196. // remove from Number=G, Number=R and Number=A INFO fields.
  197. uint8_t *dat = NULL;
  198. int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
  199. for (i=0; i<line->n_info; i++)
  200. {
  201. bcf_info_t *info = &line->d.info[i];
  202. int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
  203. if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
  204. int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
  205. if ( type==BCF_HT_FLAG ) continue;
  206. int size = 1;
  207. if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
  208. mdat = mdat_bytes / size;
  209. nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
  210. mdat_bytes = mdat * size;
  211. if ( nret<0 )
  212. {
  213. fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
  214. bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
  215. exit(1);
  216. }
  217. if ( type==BCF_HT_STR )
  218. {
  219. str.l = 0;
  220. char *ss = (char*) dat, *se = (char*) dat;
  221. if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
  222. {
  223. int nexp, inc = 0;
  224. if ( vlen==BCF_VL_A )
  225. {
  226. nexp = nA_ori;
  227. inc = 1;
  228. }
  229. else
  230. nexp = nR_ori;
  231. for (j=0; j<nexp; j++)
  232. {
  233. if ( !*se ) break;
  234. while ( *se && *se!=',' ) se++;
  235. if ( rm_mask & 1<<(j+inc) )
  236. {
  237. if ( *se ) se++;
  238. ss = se;
  239. continue;
  240. }
  241. if ( str.l ) kputc(',',&str);
  242. kputsn(ss,se-ss,&str);
  243. if ( *se ) se++;
  244. ss = se;
  245. }
  246. assert( j==nexp );
  247. }
  248. else // Number=G, assuming diploid genotype
  249. {
  250. int k = 0, n = 0;
  251. for (j=0; j<nR_ori; j++)
  252. {
  253. for (k=0; k<=j; k++)
  254. {
  255. if ( !*se ) break;
  256. while ( *se && *se!=',' ) se++;
  257. n++;
  258. if ( rm_mask & 1<<j || rm_mask & 1<<k )
  259. {
  260. if ( *se ) se++;
  261. ss = se;
  262. continue;
  263. }
  264. if ( str.l ) kputc(',',&str);
  265. kputsn(ss,se-ss,&str);
  266. if ( *se ) se++;
  267. ss = se;
  268. }
  269. if ( !*se ) break;
  270. }
  271. assert( n=nG_ori );
  272. }
  273. nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
  274. if ( nret<0 )
  275. {
  276. fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
  277. bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
  278. exit(1);
  279. }
  280. continue;
  281. }
  282. if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
  283. {
  284. int inc = 0, ntop;
  285. if ( vlen==BCF_VL_A )
  286. {
  287. assert( nret==nA_ori );
  288. ntop = nA_ori;
  289. ndat = nA_new;
  290. inc = 1;
  291. }
  292. else
  293. {
  294. assert( nret==nR_ori );
  295. ntop = nR_ori;
  296. ndat = nR_new;
  297. }
  298. int k = 0;
  299. #define BRANCH(type_t,is_vector_end) \
  300. { \
  301. type_t *ptr = (type_t*) dat; \
  302. int size = sizeof(type_t); \
  303. for (j=0; j<ntop; j++) /* j:ori, k:new */ \
  304. { \
  305. if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
  306. if ( rm_mask & 1<<(j+inc) ) continue; \
  307. if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
  308. k++; \
  309. } \
  310. }
  311. switch (type)
  312. {
  313. case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
  314. case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
  315. }
  316. #undef BRANCH
  317. }
  318. else // Number=G
  319. {
  320. assert( nret==nG_ori );
  321. int k, l_ori = -1, l_new = 0;
  322. ndat = nG_new;
  323. #define BRANCH(type_t,is_vector_end) \
  324. { \
  325. type_t *ptr = (type_t*) dat; \
  326. int size = sizeof(type_t); \
  327. for (j=0; j<nR_ori; j++) \
  328. { \
  329. for (k=0; k<=j; k++) \
  330. { \
  331. l_ori++; \
  332. if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
  333. if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
  334. if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
  335. l_new++; \
  336. } \
  337. } \
  338. }
  339. switch (type)
  340. {
  341. case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
  342. case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
  343. }
  344. #undef BRANCH
  345. }
  346. nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
  347. if ( nret<0 )
  348. {
  349. fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
  350. bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
  351. exit(1);
  352. }
  353. }
  354. // Update GT fields, the allele indexes might have changed
  355. for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
  356. if ( i<line->n_allele )
  357. {
  358. mdat = mdat_bytes / 4; // sizeof(int32_t)
  359. nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
  360. mdat_bytes = mdat * 4;
  361. if ( nret>0 )
  362. {
  363. nret /= line->n_sample;
  364. int32_t *ptr = (int32_t*) dat;
  365. for (i=0; i<line->n_sample; i++)
  366. {
  367. for (j=0; j<nret; j++)
  368. {
  369. if ( ptr[j]==bcf_gt_missing ) continue;
  370. if ( ptr[j]==bcf_int32_vector_end ) break;
  371. int al = bcf_gt_allele(ptr[j]);
  372. assert( al<nR_ori && map[al]>=0 );
  373. ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
  374. }
  375. ptr += nret;
  376. }
  377. bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
  378. }
  379. }
  380. // Remove from Number=G, Number=R and Number=A FORMAT fields.
  381. // Assuming haploid or diploid GTs
  382. for (i=0; i<line->n_fmt; i++)
  383. {
  384. bcf_fmt_t *fmt = &line->d.fmt[i];
  385. int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);
  386. if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
  387. int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
  388. if ( type==BCF_HT_FLAG ) continue;
  389. int size = 1;
  390. if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
  391. mdat = mdat_bytes / size;
  392. nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
  393. mdat_bytes = mdat * size;
  394. if ( nret<0 )
  395. {
  396. fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
  397. bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
  398. exit(1);
  399. }
  400. if ( type==BCF_HT_STR )
  401. {
  402. int size = nret/line->n_sample; // number of bytes per sample
  403. str.l = 0;
  404. if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
  405. {
  406. int nexp, inc = 0;
  407. if ( vlen==BCF_VL_A )
  408. {
  409. nexp = nA_ori;
  410. inc = 1;
  411. }
  412. else
  413. nexp = nR_ori;
  414. for (j=0; j<line->n_sample; j++)
  415. {
  416. char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
  417. int k_src = 0, k_dst = 0, l = str.l;
  418. for (k_src=0; k_src<nexp; k_src++)
  419. {
  420. if ( ptr>=se || !*ptr) break;
  421. while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
  422. if ( rm_mask & 1<<(k_src+inc) )
  423. {
  424. ss = ++ptr;
  425. continue;
  426. }
  427. if ( k_dst ) kputc(',',&str);
  428. kputsn(ss,ptr-ss,&str);
  429. ss = ++ptr;
  430. k_dst++;
  431. }
  432. assert( k_src==nexp );
  433. l = str.l - l;
  434. for (; l<size; l++) kputc(0, &str);
  435. }
  436. }
  437. else // Number=G, diploid or haploid
  438. {
  439. for (j=0; j<line->n_sample; j++)
  440. {
  441. char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
  442. int k_src = 0, k_dst = 0, l = str.l;
  443. int nexp = 0; // diploid or haploid?
  444. while ( ptr<se )
  445. {
  446. if ( !*ptr ) break;
  447. if ( *ptr==',' ) nexp++;
  448. ptr++;
  449. }
  450. if ( ptr!=ss ) nexp++;
  451. assert( nexp==nG_ori || nexp==nR_ori );
  452. ptr = ss;
  453. if ( nexp==nG_ori ) // diploid
  454. {
  455. int ia, ib;
  456. for (ia=0; ia<nR_ori; ia++)
  457. {
  458. for (ib=0; ib<=ia; ib++)
  459. {
  460. if ( ptr>=se || !*ptr ) break;
  461. while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
  462. if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
  463. {
  464. ss = ++ptr;
  465. continue;
  466. }
  467. if ( k_dst ) kputc(',',&str);
  468. kputsn(ss,ptr-ss,&str);
  469. ss = ++ptr;
  470. k_dst++;
  471. }
  472. if ( ptr>=se || !*ptr ) break;
  473. }
  474. }
  475. else // haploid
  476. {
  477. for (k_src=0; k_src<nR_ori; k_src++)
  478. {
  479. if ( ptr>=se || !*ptr ) break;
  480. while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
  481. if ( rm_mask & 1<<k_src )
  482. {
  483. ss = ++ptr;
  484. continue;
  485. }
  486. if ( k_dst ) kputc(',',&str);
  487. kputsn(ss,ptr-ss,&str);
  488. ss = ++ptr;
  489. k_dst++;
  490. }
  491. assert( k_src==nR_ori );
  492. l = str.l - l;
  493. for (; l<size; l++) kputc(0, &str);
  494. }
  495. }
  496. }
  497. nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
  498. if ( nret<0 )
  499. {
  500. fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
  501. bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
  502. exit(1);
  503. }
  504. continue;
  505. }
  506. int nori = nret / line->n_sample;
  507. if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
  508. {
  509. int ntop, inc = 0;
  510. if ( vlen==BCF_VL_A )
  511. {
  512. assert( nori==nA_ori ); // todo: will fail if all values are missing
  513. ntop = nA_ori;
  514. ndat = nA_new*line->n_sample;
  515. inc = 1;
  516. }
  517. else
  518. {
  519. assert( nori==nR_ori ); // todo: will fail if all values are missing
  520. ntop = nR_ori;
  521. ndat = nR_new*line->n_sample;
  522. }
  523. #define BRANCH(type_t,is_vector_end) \
  524. { \
  525. for (j=0; j<line->n_sample; j++) \
  526. { \
  527. type_t *ptr_src = ((type_t*)dat) + j*nori; \
  528. type_t *ptr_dst = ((type_t*)dat) + j*nA_new; \
  529. int size = sizeof(type_t); \
  530. int k_src, k_dst = 0; \
  531. for (k_src=0; k_src<ntop; k_src++) \
  532. { \
  533. if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
  534. if ( rm_mask & 1<<(k_src+inc) ) continue; \
  535. if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
  536. k_dst++; \
  537. } \
  538. } \
  539. }
  540. switch (type)
  541. {
  542. case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
  543. case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
  544. }
  545. #undef BRANCH
  546. }
  547. else // Number=G, diploid or mixture of haploid+diploid
  548. {
  549. assert( nori==nG_ori );
  550. ndat = nG_new*line->n_sample;
  551. #define BRANCH(type_t,is_vector_end) \
  552. { \
  553. for (j=0; j<line->n_sample; j++) \
  554. { \
  555. type_t *ptr_src = ((type_t*)dat) + j*nori; \
  556. type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
  557. int size = sizeof(type_t); \
  558. int ia, ib, k_dst = 0, k_src; \
  559. int nset = 0; /* haploid or diploid? */ \
  560. for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
  561. if ( nset==nR_ori ) /* haploid */ \
  562. { \
  563. for (k_src=0; k_src<nR_ori; k_src++) \
  564. { \
  565. if ( rm_mask & 1<<k_src ) continue; \
  566. if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
  567. k_dst++; \
  568. } \
  569. memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
  570. } \
  571. else /* diploid */ \
  572. { \
  573. k_src = -1; \
  574. for (ia=0; ia<nR_ori; ia++) \
  575. { \
  576. for (ib=0; ib<=ia; ib++) \
  577. { \
  578. k_src++; \
  579. if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
  580. if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
  581. if ( k_src!=k_dst ) memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
  582. k_dst++; \
  583. } \
  584. } \
  585. } \
  586. } \
  587. }
  588. switch (type)
  589. {
  590. case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
  591. case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
  592. }
  593. #undef BRANCH
  594. }
  595. nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
  596. if ( nret<0 )
  597. {
  598. fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
  599. bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
  600. exit(1);
  601. }
  602. }
  603. free(dat);
  604. free(str.s);
  605. free(map);
  606. }