vcf.c 99 KB


  1. #include <zlib.h>
  2. #include <stdio.h>
  3. #include <ctype.h>
  4. #include <assert.h>
  5. #include <string.h>
  6. #include <stdlib.h>
  7. #include <limits.h>
  8. #include "htslib/kstring.h"
  9. #include "htslib/bgzf.h"
  10. #include "htslib/vcf.h"
  11. #include "htslib/tbx.h"
  12. #include "htslib/hfile.h"
  13. #include "htslib/khash.h"
// Instantiate the khash map type "vdict" whose values are bcf_idinfo_t
// records; it is used below for the sample, contig and ID dictionaries.
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
typedef khash_t(vdict) vdict_t;
#include "htslib/kseq.h"
// Declare the kstream reader types for gzFile input read via gzread().
KSTREAM_DECLARE(gzFile, gzread)
// Raw 32-bit patterns; judging by the names they mark a missing float value
// and the end of a float vector -- NOTE(review): confirm against the header.
uint32_t bcf_float_missing = 0x7F800001;
uint32_t bcf_float_vector_end = 0x7F800002;
// Lookup table indexed by a 4-bit type code; presumably log2 of the element
// size in bytes for each BCF_BT_* type -- TODO confirm.
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
// Template copied into every newly created dictionary entry: all three info
// words set to 15, no header records attached, id not yet assigned (-1).
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
  22. /*************************
  23. *** VCF header parser ***
  24. *************************/
  25. int bcf_hdr_sync(bcf_hdr_t *h);
  26. int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
  27. {
  28. if ( !s )
  29. {
  30. bcf_hdr_sync(h);
  31. return 0;
  32. }
  33. const char *ss = s;
  34. while ( !*ss && isspace(*ss) ) ss++;
  35. if ( !*ss )
  36. {
  37. fprintf(stderr,"[W::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
  38. abort();
  39. }
  40. vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
  41. int ret;
  42. char *sdup = strdup(s);
  43. int k = kh_put(vdict, d, sdup, &ret);
  44. if (ret) { // absent
  45. kh_val(d, k) = bcf_idinfo_def;
  46. kh_val(d, k).id = kh_size(d) - 1;
  47. } else {
  48. if (hts_verbose >= 2)
  49. fprintf(stderr, "[W::%s] Duplicated sample name '%s'. Skipped.\n", __func__, s);
  50. free(sdup);
  51. return -1;
  52. }
  53. int n = kh_size(d);
  54. h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
  55. h->samples[n-1] = sdup;
  56. return 0;
  57. }
  58. void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
  59. {
  60. int i = 0;
  61. const char *p, *q;
  62. // add samples
  63. for (p = q = str;; ++q) {
  64. if (*q != '\t' && *q != 0 && *q != '\n') continue;
  65. if (++i > 9) {
  66. char *s = (char*)malloc(q - p + 1);
  67. strncpy(s, p, q - p);
  68. s[q - p] = 0;
  69. bcf_hdr_add_sample(h,s);
  70. free(s);
  71. }
  72. if (*q == 0 || *q == '\n') break;
  73. p = q + 1;
  74. }
  75. bcf_hdr_add_sample(h,NULL);
  76. }
  77. int bcf_hdr_sync(bcf_hdr_t *h)
  78. {
  79. int i;
  80. for (i = 0; i < 3; i++)
  81. {
  82. vdict_t *d = (vdict_t*)h->dict[i];
  83. khint_t k;
  84. // find out the largest id, there may be holes because of IDX
  85. int max_id = -1;
  86. for (k=kh_begin(d); k<kh_end(d); k++)
  87. {
  88. if (!kh_exist(d,k)) continue;
  89. if ( max_id < kh_val(d,k).id ) max_id = kh_val(d,k).id;
  90. }
  91. if ( max_id >= h->n[i] )
  92. {
  93. h->id[i] = (bcf_idpair_t*)realloc(h->id[i], (max_id+1)*sizeof(bcf_idpair_t));
  94. for (k=h->n[i]; k<=max_id; k++)
  95. {
  96. h->id[i][k].key = NULL;
  97. h->id[i][k].val = NULL;
  98. }
  99. h->n[i] = max_id+1;
  100. }
  101. for (k=kh_begin(d); k<kh_end(d); k++)
  102. {
  103. if (!kh_exist(d,k)) continue;
  104. h->id[i][kh_val(d,k).id].key = kh_key(d,k);
  105. h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
  106. }
  107. }
  108. return 0;
  109. }
  110. void bcf_hrec_destroy(bcf_hrec_t *hrec)
  111. {
  112. free(hrec->key);
  113. if ( hrec->value ) free(hrec->value);
  114. int i;
  115. for (i=0; i<hrec->nkeys; i++)
  116. {
  117. free(hrec->keys[i]);
  118. free(hrec->vals[i]);
  119. }
  120. free(hrec->keys);
  121. free(hrec->vals);
  122. free(hrec);
  123. }
  124. // Copies all fields except IDX.
  125. bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
  126. {
  127. bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
  128. out->type = hrec->type;
  129. if ( hrec->key ) out->key = strdup(hrec->key);
  130. if ( hrec->value ) out->value = strdup(hrec->value);
  131. out->nkeys = hrec->nkeys;
  132. out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
  133. out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
  134. int i, j = 0;
  135. for (i=0; i<hrec->nkeys; i++)
  136. {
  137. if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
  138. if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]);
  139. if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
  140. j++;
  141. }
  142. if ( i!=j ) out->nkeys--; // IDX was omitted
  143. return out;
  144. }
  145. void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
  146. {
  147. fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
  148. int i;
  149. for (i=0; i<hrec->nkeys; i++)
  150. fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
  151. fprintf(fp, "\n");
  152. }
  153. void bcf_header_debug(bcf_hdr_t *hdr)
  154. {
  155. int i, j;
  156. for (i=0; i<hdr->nhrec; i++)
  157. {
  158. if ( !hdr->hrec[i]->value )
  159. {
  160. fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
  161. fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
  162. for (j=1; j<hdr->hrec[i]->nkeys; j++)
  163. fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
  164. fprintf(stderr,">\n");
  165. }
  166. else
  167. fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
  168. }
  169. }
  170. void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
  171. {
  172. int n = ++hrec->nkeys;
  173. hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
  174. hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
  175. assert( len );
  176. hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char));
  177. memcpy(hrec->keys[n-1],str,len);
  178. hrec->keys[n-1][len] = 0;
  179. hrec->vals[n-1] = NULL;
  180. }
  181. void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
  182. {
  183. if ( !str ) { hrec->vals[i] = NULL; return; }
  184. if ( hrec->vals[i] ) free(hrec->vals[i]);
  185. if ( is_quoted )
  186. {
  187. hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
  188. hrec->vals[i][0] = '"';
  189. memcpy(&hrec->vals[i][1],str,len);
  190. hrec->vals[i][len+1] = '"';
  191. hrec->vals[i][len+2] = 0;
  192. }
  193. else
  194. {
  195. hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
  196. memcpy(hrec->vals[i],str,len);
  197. hrec->vals[i][len] = 0;
  198. }
  199. }
  200. void hrec_add_idx(bcf_hrec_t *hrec, int idx)
  201. {
  202. int n = ++hrec->nkeys;
  203. hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
  204. hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
  205. hrec->keys[n-1] = strdup("IDX");
  206. kstring_t str = {0,0,0};
  207. kputw(idx, &str);
  208. hrec->vals[n-1] = str.s;
  209. }
  210. int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
  211. {
  212. int i;
  213. for (i=0; i<hrec->nkeys; i++)
  214. if ( !strcasecmp(key,hrec->keys[i]) ) return i;
  215. return -1;
  216. }
  217. static inline int is_escaped(const char *min, const char *str)
  218. {
  219. int n = 0;
  220. while ( --str>=min && *str=='\\' ) n++;
  221. return n%2;
  222. }
  223. bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
  224. {
  225. const char *p = line;
  226. if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
  227. p += 2;
  228. const char *q = p;
  229. while ( *q && *q!='=' ) q++;
  230. int n = q-p;
  231. if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format
  232. bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
  233. hrec->key = (char*) malloc(sizeof(char)*(n+1));
  234. memcpy(hrec->key,p,n);
  235. hrec->key[n] = 0;
  236. p = ++q;
  237. if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
  238. {
  239. while ( *q && *q!='\n' ) q++;
  240. hrec->value = (char*) malloc((q-p+1)*sizeof(char));
  241. memcpy(hrec->value, p, q-p);
  242. hrec->value[q-p] = 0;
  243. *len = q-line+1;
  244. return hrec;
  245. }
  246. // structured line, e.g. ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
  247. int nopen = 1;
  248. while ( *q && *q!='\n' && nopen )
  249. {
  250. p = ++q;
  251. while ( *q && *q!='=' ) q++;
  252. n = q-p;
  253. if ( *q!='=' || !n ) { *len = q-line+1; bcf_hrec_destroy(hrec); return NULL; } // wrong format
  254. bcf_hrec_add_key(hrec, p, q-p);
  255. p = ++q;
  256. int quoted = *p=='"' ? 1 : 0;
  257. if ( quoted ) p++, q++;
  258. while (1)
  259. {
  260. if ( !*q ) break;
  261. if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
  262. else
  263. {
  264. if ( *q=='<' ) nopen++;
  265. if ( *q=='>' ) nopen--;
  266. if ( !nopen ) break;
  267. if ( *q==',' && nopen==1 ) break;
  268. }
  269. q++;
  270. }
  271. bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted);
  272. if ( quoted ) q++;
  273. if ( *q=='>' ) { nopen--; q++; }
  274. }
  275. *len = q-line+1;
  276. return hrec;
  277. }
  278. // returns: 1 when hdr needs to be synced, 0 otherwise
  279. int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
  280. {
  281. // contig
  282. int i,j,k, ret;
  283. char *str;
  284. if ( !strcmp(hrec->key, "contig") )
  285. {
  286. hrec->type = BCF_HL_CTG;
  287. // Get the contig ID ($str) and length ($j)
  288. i = bcf_hrec_find_key(hrec,"length");
  289. if ( i<0 ) return 0;
  290. if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
  291. i = bcf_hrec_find_key(hrec,"ID");
  292. if ( i<0 ) return 0;
  293. str = strdup(hrec->vals[i]);
  294. // Register in the dictionary
  295. vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
  296. k = kh_put(vdict, d, str, &ret);
  297. if ( !ret ) { free(str); return 0; } // already present
  298. int idx = bcf_hrec_find_key(hrec,"IDX");
  299. if ( idx!=-1 )
  300. {
  301. char *tmp = hrec->vals[idx];
  302. idx = strtol(hrec->vals[idx], &tmp, 10);
  303. if ( *tmp )
  304. {
  305. fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
  306. return 0;
  307. }
  308. }
  309. else
  310. {
  311. idx = kh_size(d) - 1;
  312. hrec_add_idx(hrec, idx);
  313. }
  314. kh_val(d, k) = bcf_idinfo_def;
  315. kh_val(d, k).id = idx;
  316. kh_val(d, k).info[0] = i;
  317. kh_val(d, k).hrec[0] = hrec;
  318. return 1;
  319. }
  320. if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
  321. else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
  322. else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
  323. else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
  324. else return 0;
  325. // INFO/FILTER/FORMAT
  326. char *id = NULL;
  327. int type = -1, num = -1, var = -1, idx = -1;
  328. for (i=0; i<hrec->nkeys; i++)
  329. {
  330. if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
  331. else if ( !strcmp(hrec->keys[i], "IDX") )
  332. {
  333. char *tmp = hrec->vals[i];
  334. idx = strtol(hrec->vals[i], &tmp, 10);
  335. if ( *tmp )
  336. {
  337. fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
  338. return 0;
  339. }
  340. }
  341. else if ( !strcmp(hrec->keys[i], "Type") )
  342. {
  343. if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
  344. else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
  345. else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
  346. else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
  347. else
  348. {
  349. fprintf(stderr, "[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->vals[i]);
  350. type = BCF_HT_STR;
  351. }
  352. }
  353. else if ( !strcmp(hrec->keys[i], "Number") )
  354. {
  355. if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
  356. else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
  357. else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
  358. else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
  359. else
  360. {
  361. sscanf(hrec->vals[i],"%d",&num);
  362. var = BCF_VL_FIXED;
  363. }
  364. if (var != BCF_VL_FIXED) num = 0xfffff;
  365. }
  366. }
  367. uint32_t info = (uint32_t)num<<12 | var<<8 | type<<4 | hrec->type;
  368. if ( !id ) return 0;
  369. str = strdup(id);
  370. vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
  371. k = kh_put(vdict, d, str, &ret);
  372. if ( !ret )
  373. {
  374. // already present
  375. free(str);
  376. if ( kh_val(d, k).hrec[info&0xf] ) return 0;
  377. kh_val(d, k).info[info&0xf] = info;
  378. kh_val(d, k).hrec[info&0xf] = hrec;
  379. return 1;
  380. }
  381. kh_val(d, k) = bcf_idinfo_def;
  382. kh_val(d, k).info[info&0xf] = info;
  383. kh_val(d, k).hrec[info&0xf] = hrec;
  384. kh_val(d, k).id = idx==-1 ? kh_size(d) - 1 : idx;
  385. if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
  386. return 1;
  387. }
  388. int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
  389. {
  390. hrec->type = BCF_HL_GEN;
  391. if ( !bcf_hdr_register_hrec(hdr,hrec) )
  392. {
  393. // If one of the hashed field, then it is already present
  394. if ( hrec->type != BCF_HL_GEN )
  395. {
  396. bcf_hrec_destroy(hrec);
  397. return 0;
  398. }
  399. // Is one of the generic fields and already present?
  400. int i;
  401. for (i=0; i<hdr->nhrec; i++)
  402. {
  403. if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
  404. if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
  405. if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
  406. }
  407. if ( i<hdr->nhrec )
  408. {
  409. bcf_hrec_destroy(hrec);
  410. return 0;
  411. }
  412. }
  413. // New record, needs to be added
  414. int n = ++hdr->nhrec;
  415. hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
  416. hdr->hrec[n-1] = hrec;
  417. return hrec->type==BCF_HL_GEN ? 0 : 1;
  418. }
  419. bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *id)
  420. {
  421. int i;
  422. if ( type==BCF_HL_GEN )
  423. {
  424. for (i=0; i<hdr->nhrec; i++)
  425. {
  426. if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
  427. if ( !strcmp(hdr->hrec[i]->key,id) ) return hdr->hrec[i];
  428. }
  429. return NULL;
  430. }
  431. vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
  432. khint_t k = kh_get(vdict, d, id);
  433. if ( k == kh_end(d) ) return NULL;
  434. return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
  435. }
  436. void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
  437. {
  438. static int PL_warned = 0, GL_warned = 0;
  439. if ( !PL_warned )
  440. {
  441. int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
  442. if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
  443. {
  444. fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__);
  445. PL_warned = 1;
  446. }
  447. }
  448. if ( !GL_warned )
  449. {
  450. int id = bcf_hdr_id2int(hdr, BCF_HL_FMT, "GL");
  451. if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
  452. {
  453. fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__);
  454. PL_warned = 1;
  455. }
  456. }
  457. }
  458. int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
  459. {
  460. int len, needs_sync = 0;
  461. char *p = htxt;
  462. // Check sanity: "fileformat" string must come as first
  463. bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
  464. if ( !hrec->key || strcasecmp(hrec->key,"fileformat") )
  465. fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__);
  466. needs_sync += bcf_hdr_add_hrec(hdr, hrec);
  467. // The filter PASS must appear first in the dictionary
  468. hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
  469. needs_sync += bcf_hdr_add_hrec(hdr, hrec);
  470. // Parse the whole header
  471. while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) )
  472. {
  473. needs_sync += bcf_hdr_add_hrec(hdr, hrec);
  474. p += len;
  475. }
  476. bcf_hdr_parse_sample_line(hdr,p);
  477. if ( needs_sync ) bcf_hdr_sync(hdr);
  478. bcf_hdr_check_sanity(hdr);
  479. return 0;
  480. }
  481. int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
  482. {
  483. int len;
  484. bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
  485. if ( !hrec ) return -1;
  486. if ( bcf_hdr_add_hrec(hdr, hrec) )
  487. bcf_hdr_sync(hdr);
  488. return 0;
  489. }
  490. void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
  491. {
  492. int i;
  493. bcf_hrec_t *hrec;
  494. while (1)
  495. {
  496. if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
  497. {
  498. hrec = bcf_hdr_get_hrec(hdr, type, key);
  499. if ( !hrec ) return;
  500. for (i=0; i<hdr->nhrec; i++)
  501. if ( hdr->hrec[i]==hrec ) break;
  502. assert( i<hdr->nhrec );
  503. vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
  504. khint_t k = kh_get(vdict, d, key);
  505. kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
  506. }
  507. else
  508. {
  509. for (i=0; i<hdr->nhrec; i++)
  510. {
  511. if ( hdr->hrec[i]->type!=type ) continue;
  512. if ( !strcmp(hdr->hrec[i]->key,key) ) break;
  513. }
  514. if ( i==hdr->nhrec ) return;
  515. hrec = hdr->hrec[i];
  516. }
  517. hdr->nhrec--;
  518. if ( i < hdr->nhrec )
  519. memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
  520. bcf_hrec_destroy(hrec);
  521. bcf_hdr_sync(hdr);
  522. }
  523. }
  524. int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
  525. {
  526. va_list ap;
  527. va_start(ap, fmt);
  528. int n = vsnprintf(NULL, 0, fmt, ap) + 2;
  529. va_end(ap);
  530. char *line = (char*)malloc(n);
  531. va_start(ap, fmt);
  532. vsnprintf(line, n, fmt, ap);
  533. va_end(ap);
  534. int ret = bcf_hdr_append(hdr, line);
  535. free(line);
  536. return ret;
  537. }
  538. /**********************
  539. *** BCF header I/O ***
  540. **********************/
  541. const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
  542. {
  543. bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat");
  544. if ( !hrec )
  545. {
  546. fprintf(stderr,"No version string found, assuming VCFv4.2\n");
  547. return "VCFv4.2";
  548. }
  549. return hrec->value;
  550. }
  551. void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
  552. {
  553. bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat");
  554. if ( !hrec )
  555. {
  556. int len;
  557. kstring_t str = {0,0,0};
  558. ksprintf(&str,"##fileformat=%s", version);
  559. hrec = bcf_hdr_parse_line(hdr, str.s, &len);
  560. free(str.s);
  561. }
  562. else
  563. {
  564. free(hrec->value);
  565. hrec->value = strdup(version);
  566. }
  567. bcf_hdr_sync(hdr);
  568. }
  569. bcf_hdr_t *bcf_hdr_init(const char *mode)
  570. {
  571. int i;
  572. bcf_hdr_t *h;
  573. h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
  574. for (i = 0; i < 3; ++i)
  575. h->dict[i] = kh_init(vdict);
  576. if ( strchr(mode,'w') )
  577. {
  578. bcf_hdr_append(h, "##fileformat=VCFv4.2");
  579. // The filter PASS must appear first in the dictionary
  580. bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
  581. }
  582. return h;
  583. }
  584. void bcf_hdr_destroy(bcf_hdr_t *h)
  585. {
  586. int i;
  587. khint_t k;
  588. for (i = 0; i < 3; ++i) {
  589. vdict_t *d = (vdict_t*)h->dict[i];
  590. if (d == 0) continue;
  591. for (k = kh_begin(d); k != kh_end(d); ++k)
  592. if (kh_exist(d, k)) free((char*)kh_key(d, k));
  593. kh_destroy(vdict, d);
  594. free(h->id[i]);
  595. }
  596. for (i=0; i<h->nhrec; i++)
  597. bcf_hrec_destroy(h->hrec[i]);
  598. if (h->nhrec) free(h->hrec);
  599. if (h->samples) free(h->samples);
  600. free(h->keep_samples);
  601. free(h->transl[0]); free(h->transl[1]);
  602. free(h->mem.s);
  603. free(h);
  604. }
  605. bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
  606. {
  607. if (!hfp->is_bin)
  608. return vcf_hdr_read(hfp);
  609. BGZF *fp = hfp->fp.bgzf;
  610. uint8_t magic[5];
  611. bcf_hdr_t *h;
  612. h = bcf_hdr_init("r");
  613. if ( bgzf_read(fp, magic, 5)<0 )
  614. {
  615. fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__);
  616. return NULL;
  617. }
  618. if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
  619. {
  620. if (!strncmp((char*)magic, "BCF", 3))
  621. fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__);
  622. else if (hts_verbose >= 2)
  623. fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
  624. bcf_hdr_destroy(h);
  625. return 0;
  626. }
  627. int hlen;
  628. char *htxt;
  629. bgzf_read(fp, &hlen, 4);
  630. htxt = (char*)malloc(hlen);
  631. bgzf_read(fp, htxt, hlen);
  632. bcf_hdr_parse(h, htxt);
  633. free(htxt);
  634. return h;
  635. }
  636. int bcf_hdr_write(htsFile *hfp, const bcf_hdr_t *h)
  637. {
  638. if (!hfp->is_bin) return vcf_hdr_write(hfp, h);
  639. int hlen;
  640. char *htxt = bcf_hdr_fmt_text(h, 1, &hlen);
  641. hlen++; // include the \0 byte
  642. BGZF *fp = hfp->fp.bgzf;
  643. if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
  644. if ( bgzf_write(fp, &hlen, 4) !=4 ) return -1;
  645. if ( bgzf_write(fp, htxt, hlen) != hlen ) return -1;
  646. free(htxt);
  647. return 0;
  648. }
  649. /********************
  650. *** BCF site I/O ***
  651. ********************/
  652. bcf1_t *bcf_init1()
  653. {
  654. bcf1_t *v;
  655. v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
  656. return v;
  657. }
  658. void bcf_clear(bcf1_t *v)
  659. {
  660. int i;
  661. for (i=0; i<v->d.m_info; i++)
  662. {
  663. if ( v->d.info[i].vptr_free )
  664. {
  665. free(v->d.info[i].vptr - v->d.info[i].vptr_off);
  666. v->d.info[i].vptr_free = 0;
  667. }
  668. }
  669. for (i=0; i<v->d.m_fmt; i++)
  670. {
  671. if ( v->d.fmt[i].p_free )
  672. {
  673. free(v->d.fmt[i].p - v->d.fmt[i].p_off);
  674. v->d.fmt[i].p_free = 0;
  675. }
  676. }
  677. v->rid = v->pos = v->rlen = v->unpacked = 0;
  678. bcf_float_set_missing(v->qual);
  679. v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
  680. v->shared.l = v->indiv.l = 0;
  681. v->d.var_type = -1;
  682. v->d.shared_dirty = 0;
  683. v->d.indiv_dirty = 0;
  684. v->d.n_flt = 0;
  685. v->errcode = 0;
  686. if (v->d.m_als) v->d.als[0] = 0;
  687. if (v->d.m_id) v->d.id[0] = 0;
  688. }
  689. void bcf_empty1(bcf1_t *v)
  690. {
  691. bcf_clear1(v);
  692. free(v->d.id);
  693. free(v->d.als);
  694. free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
  695. if (v->d.var ) free(v->d.var);
  696. free(v->shared.s); free(v->indiv.s);
  697. }
  698. void bcf_destroy1(bcf1_t *v)
  699. {
  700. bcf_empty1(v);
  701. free(v);
  702. }
  703. static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
  704. {
  705. uint32_t x[8];
  706. int ret;
  707. if ((ret = bgzf_read(fp, x, 32)) != 32) {
  708. if (ret == 0) return -1;
  709. return -2;
  710. }
  711. bcf_clear1(v);
  712. x[0] -= 24; // to exclude six 32-bit integers
  713. ks_resize(&v->shared, x[0]);
  714. ks_resize(&v->indiv, x[1]);
  715. memcpy(v, x + 2, 16);
  716. v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
  717. v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
  718. v->shared.l = x[0], v->indiv.l = x[1];
  719. // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
  720. if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
  721. bgzf_read(fp, v->shared.s, v->shared.l);
  722. bgzf_read(fp, v->indiv.s, v->indiv.l);
  723. return 0;
  724. }
// Minimal bit-array helpers operating on an array of bytes: bit i lives in
// byte i/8 at bit position i%8.
// Number of bytes reserved for n bits (always at least one).
#define bit_array_size(n) ((n)/8+1)
// Set bit i.
#define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
// Clear bit i.
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
// Non-zero when bit i is set.
#define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
// Forward declaration; used by bcf_subset_format() below.
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
/**
 * Drop from a record's per-sample block the data of all samples whose bit is
 * not set in hdr->keep_samples. The block is compacted in place.
 * Does nothing when hdr->keep_samples is NULL. Always returns 0.
 */
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
{
    if ( !hdr->keep_samples ) return 0;
    if ( !bcf_hdr_nsamples(hdr) )
    {
        // no samples are kept at all: empty the per-sample block
        rec->indiv.l = rec->n_sample = 0;
        return 0;
    }
    int i, j;
    // ptr walks the packed per-sample block; dst is the write position for
    // kept sample data and stays NULL until the first field has been handled
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
    bcf_dec_t *dec = &rec->d;
    // presumably grows dec->fmt so it can hold rec->n_fmt entries -- see hts_expand
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
    // the entries will point into rec->indiv.s, so none of them is to be freed separately
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
    for (i=0; i<rec->n_fmt; i++)
    {
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
        // start one element before the data: the inner loop advances src before use
        src = dec->fmt[i].p - dec->fmt[i].size;
        if ( dst )
        {
            // not the first field: move the p_off bytes that precede this
            // field's data so they follow the previous field's compacted
            // data, and repoint p accordingly
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
        }
        dst = dec->fmt[i].p;
        // copy the values of the kept samples towards the front, in order
        for (j=0; j<hdr->nsamples_ori; j++)
        {
            src += dec->fmt[i].size;
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
            memmove(dst, src, dec->fmt[i].size);
            dst += dec->fmt[i].size;
        }
        // shrink the block length by the number of bytes dropped from this field
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
    }
    // dec->fmt[] has been filled in as a side effect
    rec->unpacked |= BCF_UN_FMT;
    rec->n_sample = bcf_hdr_nsamples(hdr);
    return 0;
}
  767. int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  768. {
  769. if (!fp->is_bin) return vcf_read(fp,h,v);
  770. int ret = bcf_read1_core(fp->fp.bgzf, v);
  771. if ( ret!=0 || !h->keep_samples ) return ret;
  772. return bcf_subset_format(h,v);
  773. }
  774. int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end)
  775. {
  776. bcf1_t *v = (bcf1_t *) vv;
  777. int ret;
  778. if ((ret = bcf_read1_core(fp, v)) >= 0)
  779. *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
  780. return ret;
  781. }
  782. static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str)
  783. {
  784. // single typed string
  785. if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
  786. else bcf_enc_size(str, 0, BCF_BT_CHAR);
  787. }
  788. static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
  789. {
  790. // list of typed strings
  791. int i;
  792. for (i=0; i<line->n_allele; i++)
  793. bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]);
  794. if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
  795. }
  796. static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str)
  797. {
  798. // typed vector of integers
  799. if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
  800. else bcf_enc_vint(str, 0, 0, -1);
  801. }
  802. static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str)
  803. {
  804. // pairs of typed vectors
  805. int i, irm = -1;
  806. for (i=0; i<line->n_info; i++)
  807. {
  808. bcf_info_t *info = &line->d.info[i];
  809. if ( !info->vptr )
  810. {
  811. // marked for removal
  812. if ( irm < 0 ) irm = i;
  813. continue;
  814. }
  815. kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str);
  816. if ( irm >=0 )
  817. {
  818. bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
  819. while ( irm<=i && line->d.info[irm].vptr ) irm++;
  820. }
  821. }
  822. if ( irm>=0 ) line->n_info = irm;
  823. }
/**
 * Bring the record's packed data blocks (line->shared and line->indiv) up to
 * date with its decoded fields in line->d.
 *
 * Three situations are handled for the shared block: it does not exist yet
 * (shared.l == 0) and is built from scratch; it exists but some fields are
 * flagged in d.shared_dirty and it is rebuilt piece by piece; or it is left
 * untouched. The per-sample block is rebuilt when it is missing or flagged
 * in d.indiv_dirty. Pointers in d.info[] / d.fmt[] are re-based onto the new
 * blocks and separately allocated value buffers are freed. Both dirty flags
 * are cleared. Always returns 0.
 */
static int bcf1_sync(bcf1_t *line)
{
    // remembered to detect below whether the shared block moved in memory
    char *shared_ori = line->shared.s;
    size_t prev_len;
    kstring_t tmp = {0,0,0};
    if ( !line->shared.l )
    {
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
        // tmp takes over the existing buffer; it is stored back once filled
        tmp = line->shared;
        bcf1_sync_id(line, &tmp);
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
        bcf1_sync_alleles(line, &tmp);
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
        bcf1_sync_filter(line, &tmp);
        line->unpack_size[2] = tmp.l - prev_len;
        bcf1_sync_info(line, &tmp);
        line->shared = tmp;
    }
    else if ( line->d.shared_dirty )
    {
        // The line was edited, update the BCF data block, ptr_ori points
        // to the original unchanged BCF data.
        // Each section is either re-encoded from line->d or copied verbatim
        // from the old block; unpack_size[] holds the old section sizes on
        // entry and the new ones afterwards.
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
        assert( line->unpacked & BCF_UN_STR );
        // ID: single typed string
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
            bcf1_sync_id(line, &tmp);
        else
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
        ptr_ori += line->unpack_size[0];
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
        // REF+ALT: list of typed strings
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
            bcf1_sync_alleles(line, &tmp);
        else
        {
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
        }
        ptr_ori += line->unpack_size[1];
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
        if ( line->unpacked & BCF_UN_FLT )
        {
            // FILTER: typed vector of integers
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
                bcf1_sync_filter(line, &tmp);
            else if ( line->d.n_flt )
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
            else
                bcf_enc_vint(&tmp, 0, 0, -1);
            ptr_ori += line->unpack_size[2];
            line->unpack_size[2] = tmp.l - prev_len;
            if ( line->unpacked & BCF_UN_INFO )
            {
                // INFO: pairs of typed vectors
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
                {
                    bcf1_sync_info(line, &tmp);
                    // everything has been re-encoded: nothing is left to copy
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
                }
            }
        }
        // copy whatever remains of the old block after the sections handled above
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
        if ( size ) kputsn_(ptr_ori, size, &tmp);
        free(line->shared.s);
        line->shared = tmp;
    }
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
    {
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
        // INFO data starts after the ID, allele and FILTER sections
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
        int i;
        for (i=0; i<line->n_info; i++)
        {
            // remember a separately allocated buffer before vptr is overwritten
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
            if ( vptr_free )
            {
                free(vptr_free);
                line->d.info[i].vptr_free = 0;
            }
        }
    }
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
    {
        // The genotype fields changed or are not present
        // start from an empty buffer; the old block is freed once the new one is complete
        tmp.l = tmp.m = 0; tmp.s = NULL;
        int i, irm = -1;
        for (i=0; i<line->n_fmt; i++)
        {
            bcf_fmt_t *fmt = &line->d.fmt[i];
            if ( !fmt->p )
            {
                // marked for removal
                if ( irm < 0 ) irm = i;
                continue;
            }
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
            if ( irm >=0 )
            {
                // move the live entry into the first removed slot, then
                // advance irm to the next removed slot
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
            }
        }
        // irm is now the number of live FORMAT fields
        if ( irm>=0 ) line->n_fmt = irm;
        free(line->indiv.s);
        line->indiv = tmp;
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
        size_t off_new = 0;
        for (i=0; i<line->n_fmt; i++)
        {
            // remember a separately allocated buffer before p is overwritten
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
            if ( p_free )
            {
                free(p_free);
                line->d.fmt[i].p_free = 0;
            }
        }
    }
    // without samples there can be no FORMAT fields
    if ( !line->n_sample ) line->n_fmt = 0;
    line->d.shared_dirty = line->d.indiv_dirty = 0;
    return 0;
}
  950. bcf1_t *bcf_dup(bcf1_t *src)
  951. {
  952. bcf1_sync(src);
  953. bcf1_t *out = bcf_init1();
  954. out->rid = src->rid;
  955. out->pos = src->pos;
  956. out->rlen = src->rlen;
  957. out->qual = src->qual;
  958. out->n_info = src->n_info; out->n_allele = src->n_allele;
  959. out->n_fmt = src->n_fmt; out->n_sample = src->n_sample;
  960. out->shared.m = out->shared.l = src->shared.l;
  961. out->shared.s = (char*) malloc(out->shared.l);
  962. memcpy(out->shared.s,src->shared.s,out->shared.l);
  963. out->indiv.m = out->indiv.l = src->indiv.l;
  964. out->indiv.s = (char*) malloc(out->indiv.l);
  965. memcpy(out->indiv.s,src->indiv.s,out->indiv.l);
  966. return out;
  967. }
  968. int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v)
  969. {
  970. if ( bcf_hdr_nsamples(h)!=v->n_sample )
  971. {
  972. fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
  973. __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
  974. return -1;
  975. }
  976. if ( !hfp->is_bin ) return vcf_write(hfp,h,v);
  977. if ( v->errcode )
  978. {
  979. // vcf_parse1() encountered a new contig or tag, undeclared in the
  980. // header. At this point, the header must have been printed,
  981. // proceeding would lead to a broken BCF file. Errors must be checked
  982. // and cleared by the caller before we can proceed.
  983. fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode);
  984. exit(1);
  985. }
  986. bcf1_sync(v); // check if the BCF record was modified
  987. BGZF *fp = hfp->fp.bgzf;
  988. uint32_t x[8];
  989. x[0] = v->shared.l + 24; // to include six 32-bit integers
  990. x[1] = v->indiv.l;
  991. memcpy(x + 2, v, 16);
  992. x[6] = (uint32_t)v->n_allele<<16 | v->n_info;
  993. x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample;
  994. if ( bgzf_write(fp, x, 32) != 32 ) return -1;
  995. if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
  996. if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
  997. return 0;
  998. }
  999. /**********************
  1000. *** VCF header I/O ***
  1001. **********************/
  1002. bcf_hdr_t *vcf_hdr_read(htsFile *fp)
  1003. {
  1004. kstring_t txt, *s = &fp->line;
  1005. bcf_hdr_t *h;
  1006. h = bcf_hdr_init("r");
  1007. txt.l = txt.m = 0; txt.s = 0;
  1008. while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
  1009. if (s->l == 0) continue;
  1010. if (s->s[0] != '#') {
  1011. if (hts_verbose >= 2)
  1012. fprintf(stderr, "[E::%s] no sample line\n", __func__);
  1013. free(txt.s);
  1014. bcf_hdr_destroy(h);
  1015. return 0;
  1016. }
  1017. if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
  1018. int dret;
  1019. gzFile f;
  1020. kstream_t *ks;
  1021. kstring_t tmp;
  1022. tmp.l = tmp.m = 0; tmp.s = 0;
  1023. f = gzopen(fp->fn_aux, "r");
  1024. ks = ks_init(f);
  1025. while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
  1026. int c;
  1027. kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
  1028. ks_getuntil(ks, 0, &tmp, &dret);
  1029. kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
  1030. kputsn(">\n", 2, &txt);
  1031. if (dret != '\n')
  1032. while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
  1033. }
  1034. free(tmp.s);
  1035. ks_destroy(ks);
  1036. gzclose(f);
  1037. }
  1038. kputsn(s->s, s->l, &txt);
  1039. kputc('\n', &txt);
  1040. if (s->s[1] != '#') break;
  1041. }
  1042. if ( !txt.s )
  1043. {
  1044. fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__);
  1045. return NULL;
  1046. }
  1047. bcf_hdr_parse(h, txt.s);
  1048. // check tabix index, are all contigs listed in the header? add the missing ones
  1049. tbx_t *idx = tbx_index_load(fp->fn);
  1050. if ( idx )
  1051. {
  1052. int i, n, need_sync = 0;
  1053. const char **names = tbx_seqnames(idx, &n);
  1054. for (i=0; i<n; i++)
  1055. {
  1056. bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
  1057. if ( hrec ) continue;
  1058. hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
  1059. hrec->key = strdup("contig");
  1060. bcf_hrec_add_key(hrec, "ID", strlen("ID"));
  1061. bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
  1062. bcf_hrec_add_key(hrec, "length", strlen("length"));
  1063. bcf_hrec_set_val(hrec, hrec->nkeys-1, "2147483647", strlen("2147483647"), 0);
  1064. bcf_hdr_add_hrec(h, hrec);
  1065. need_sync = 1;
  1066. }
  1067. free(names);
  1068. tbx_destroy(idx);
  1069. if ( need_sync )
  1070. bcf_hdr_sync(h);
  1071. }
  1072. free(txt.s);
  1073. return h;
  1074. }
  1075. int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
  1076. {
  1077. int i, n;
  1078. char **lines = hts_readlines(fname, &n);
  1079. if ( !lines ) return 1;
  1080. for (i=0; i<n-1; i++)
  1081. {
  1082. int k;
  1083. bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
  1084. bcf_hdr_add_hrec(hdr, hrec);
  1085. free(lines[i]);
  1086. }
  1087. bcf_hdr_parse_sample_line(hdr,lines[n-1]);
  1088. free(lines[n-1]);
  1089. free(lines);
  1090. bcf_hdr_sync(hdr);
  1091. return 0;
  1092. }
  1093. static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
  1094. {
  1095. if ( !hrec->value )
  1096. {
  1097. int j, nout = 0;
  1098. ksprintf(str, "##%s=<", hrec->key);
  1099. for (j=0; j<hrec->nkeys; j++)
  1100. {
  1101. // do not output IDX if output is VCF
  1102. if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
  1103. if ( nout ) kputc(',',str);
  1104. ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
  1105. nout++;
  1106. }
  1107. ksprintf(str,">\n");
  1108. }
  1109. else
  1110. ksprintf(str,"##%s=%s\n", hrec->key,hrec->value);
  1111. }
  1112. void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
  1113. {
  1114. _bcf_hrec_format(hrec,0,str);
  1115. }
  1116. char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
  1117. {
  1118. int i;
  1119. kstring_t txt = {0,0,0};
  1120. for (i=0; i<hdr->nhrec; i++)
  1121. _bcf_hrec_format(hdr->hrec[i], is_bcf, &txt);
  1122. ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
  1123. if ( bcf_hdr_nsamples(hdr) )
  1124. {
  1125. ksprintf(&txt,"\tFORMAT");
  1126. for (i=0; i<bcf_hdr_nsamples(hdr); i++)
  1127. ksprintf(&txt,"\t%s", hdr->samples[i]);
  1128. }
  1129. ksprintf(&txt,"\n");
  1130. if ( len ) *len = txt.l;
  1131. return txt.s;
  1132. }
  1133. const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
  1134. {
  1135. vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
  1136. int tid, m = kh_size(d);
  1137. const char **names = (const char**) calloc(m,sizeof(const char*));
  1138. khint_t k;
  1139. for (k=kh_begin(d); k<kh_end(d); k++)
  1140. {
  1141. if ( !kh_exist(d,k) ) continue;
  1142. tid = kh_val(d,k).id;
  1143. assert( tid<m );
  1144. names[tid] = kh_key(d,k);
  1145. }
  1146. // sanity check: there should be no gaps
  1147. for (tid=0; tid<m; tid++)
  1148. assert(names[tid]);
  1149. *n = m;
  1150. return names;
  1151. }
  1152. int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
  1153. {
  1154. int hlen;
  1155. char *htxt = bcf_hdr_fmt_text(h, 0, &hlen);
  1156. while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros
  1157. int ret;
  1158. if ( fp->is_compressed==1 )
  1159. ret = bgzf_write(fp->fp.bgzf, htxt, hlen);
  1160. else
  1161. ret = hwrite(fp->fp.hfile, htxt, hlen);
  1162. free(htxt);
  1163. return ret<0 ? -1 : 0;
  1164. }
  1165. /***********************
  1166. *** Typed value I/O ***
  1167. ***********************/
  1168. void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
  1169. {
  1170. int32_t max = INT32_MIN + 1, min = INT32_MAX;
  1171. int i;
  1172. if (n == 0) bcf_enc_size(s, 0, BCF_BT_NULL);
  1173. else if (n == 1) bcf_enc_int1(s, a[0]);
  1174. else {
  1175. if (wsize <= 0) wsize = n;
  1176. for (i = 0; i < n; ++i) {
  1177. if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
  1178. if (max < a[i]) max = a[i];
  1179. if (min > a[i]) min = a[i];
  1180. }
  1181. if (max <= INT8_MAX && min > bcf_int8_vector_end) {
  1182. bcf_enc_size(s, wsize, BCF_BT_INT8);
  1183. for (i = 0; i < n; ++i)
  1184. if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
  1185. else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
  1186. else kputc(a[i], s);
  1187. } else if (max <= INT16_MAX && min > bcf_int16_vector_end) {
  1188. bcf_enc_size(s, wsize, BCF_BT_INT16);
  1189. for (i = 0; i < n; ++i)
  1190. {
  1191. int16_t x;
  1192. if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
  1193. else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
  1194. else x = a[i];
  1195. kputsn((char*)&x, 2, s);
  1196. }
  1197. } else {
  1198. bcf_enc_size(s, wsize, BCF_BT_INT32);
  1199. for (i = 0; i < n; ++i) {
  1200. int32_t x = a[i];
  1201. kputsn((char*)&x, 4, s);
  1202. }
  1203. }
  1204. }
  1205. }
  1206. void bcf_enc_vfloat(kstring_t *s, int n, float *a)
  1207. {
  1208. bcf_enc_size(s, n, BCF_BT_FLOAT);
  1209. kputsn((char*)a, n << 2, s);
  1210. }
  1211. void bcf_enc_vchar(kstring_t *s, int l, const char *a)
  1212. {
  1213. bcf_enc_size(s, l, BCF_BT_CHAR);
  1214. kputsn(a, l, s);
  1215. }
  1216. void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
  1217. {
  1218. int j = 0;
  1219. if (n == 0) {
  1220. kputc('.', s);
  1221. return;
  1222. }
  1223. if (type == BCF_BT_CHAR)
  1224. {
  1225. char *p = (char*)data;
  1226. for (j = 0; j < n && *p; ++j, ++p)
  1227. {
  1228. if ( *p==bcf_str_missing ) kputc('.', s);
  1229. else kputc(*p, s);
  1230. }
  1231. }
  1232. else
  1233. {
  1234. #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
  1235. type_t *p = (type_t *) data; \
  1236. for (j=0; j<n; j++) \
  1237. { \
  1238. if ( is_vector_end ) break; \
  1239. if ( j ) kputc(',', s); \
  1240. if ( is_missing ) kputc('.', s); \
  1241. else kprint; \
  1242. } \
  1243. }
  1244. switch (type) {
  1245. case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, kputw(p[j], s)); break;
  1246. case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, kputw(p[j], s)); break;
  1247. case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, kputw(p[j], s)); break;
  1248. case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), ksprintf(s, "%g", p[j])); break;
  1249. default: fprintf(stderr,"todo: type %d\n", type); exit(1); break;
  1250. }
  1251. #undef BRANCH
  1252. }
  1253. }
  1254. uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
  1255. {
  1256. int x, type;
  1257. x = bcf_dec_size(ptr, &ptr, &type);
  1258. bcf_fmt_array(s, x, type, ptr);
  1259. return ptr + (x << bcf_type_shift[type]);
  1260. }
  1261. /********************
  1262. *** VCF site I/O ***
  1263. ********************/
  /* Per-FORMAT-field scratch data used while parsing the sample columns. */
  typedef struct {
      int key;            // numeric id of the tag in the header dictionary
      int max_m;          // largest number of comma-separated values seen
      int size;           // bytes reserved per sample
      int offset;         // start of this field's block in the scratch buffer
      uint32_t is_gt:1;   // set when the tag is "GT"
      uint32_t max_g:15;  // largest number of alleles seen in a genotype
      uint32_t max_l:16;  // longest field text seen
      uint32_t y;         // packed header info word for the tag
      uint8_t *buf;       // address of this field's block in the scratch buffer
  } fmt_aux_t;
  1270. static inline void align_mem(kstring_t *s)
  1271. {
  1272. if (s->l&7) {
  1273. uint64_t zero = 0;
  1274. int l = ((s->l + 7)>>3<<3) - s->l;
  1275. kputsn((char*)&zero, l, s);
  1276. }
  1277. }
  1278. // p,q is the start and the end of the FORMAT field
  1279. int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
  1280. {
  1281. if ( !bcf_hdr_nsamples(h) ) return 0;
  1282. char *r, *t;
  1283. int j, l, m, g;
  1284. khint_t k;
  1285. ks_tokaux_t aux1;
  1286. vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
  1287. kstring_t *mem = (kstring_t*)&h->mem;
  1288. mem->l = 0;
  1289. // count the number of format fields
  1290. for (r = p, v->n_fmt = 1; *r; ++r)
  1291. if (*r == ':') ++v->n_fmt;
  1292. char *end = s->s + s->l;
  1293. if ( q>=end )
  1294. {
  1295. fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1);
  1296. return -1;
  1297. }
  1298. fmt_aux_t *fmt = (fmt_aux_t*)alloca(v->n_fmt * sizeof(fmt_aux_t));
  1299. // get format information from the dictionary
  1300. for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
  1301. *(char*)aux1.p = 0;
  1302. k = kh_get(vdict, d, t);
  1303. if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
  1304. fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t);
  1305. kstring_t tmp = {0,0,0};
  1306. int l;
  1307. ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
  1308. bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
  1309. free(tmp.s);
  1310. if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
  1311. k = kh_get(vdict, d, t);
  1312. v->errcode = BCF_ERR_TAG_UNDEF;
  1313. }
  1314. fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
  1315. fmt[j].key = kh_val(d, k).id;
  1316. fmt[j].is_gt = !strcmp(t, "GT");
  1317. fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
  1318. }
  1319. // compute max
  1320. int n_sample_ori = -1;
  1321. r = q + 1; // r: position in the format string
  1322. m = l = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
  1323. while ( r<end )
  1324. {
  1325. // can we skip some samples?
  1326. if ( h->keep_samples )
  1327. {
  1328. n_sample_ori++;
  1329. if ( !bit_array_test(h->keep_samples,n_sample_ori) )
  1330. {
  1331. while ( *r!='\t' && r<end ) r++;
  1332. if ( *r=='\t' ) { *r = 0; r++; }
  1333. continue;
  1334. }
  1335. }
  1336. // collect fmt stats: max vector size, length, number of alleles
  1337. j = 0; // j-th format field
  1338. for (;;)
  1339. {
  1340. if ( *r == '\t' ) *r = 0;
  1341. if ( *r == ':' || !*r ) // end of field or end of sample
  1342. {
  1343. if (fmt[j].max_m < m) fmt[j].max_m = m;
  1344. if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1;
  1345. if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
  1346. l = 0, m = g = 1;
  1347. if ( *r==':' ) j++;
  1348. else break;
  1349. }
  1350. else if ( *r== ',' ) m++;
  1351. else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++;
  1352. if ( r>=end ) break;
  1353. r++; l++;
  1354. }
  1355. v->n_sample++;
  1356. if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
  1357. r++;
  1358. }
  1359. // allocate memory for arrays
  1360. for (j = 0; j < v->n_fmt; ++j) {
  1361. fmt_aux_t *f = &fmt[j];
  1362. if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
  1363. if ((f->y>>4&0xf) == BCF_HT_STR) {
  1364. f->size = f->is_gt? f->max_g << 2 : f->max_l;
  1365. } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
  1366. f->size = f->max_m << 2;
  1367. } else
  1368. {
  1369. fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf);
  1370. abort(); // I do not know how to do with Flag in the genotype fields
  1371. }
  1372. align_mem(mem);
  1373. f->offset = mem->l;
  1374. ks_resize(mem, mem->l + v->n_sample * f->size);
  1375. mem->l += v->n_sample * f->size;
  1376. }
  1377. for (j = 0; j < v->n_fmt; ++j)
  1378. fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
  1379. // fill the sample fields; at beginning of the loop, t points to the first char of a format
  1380. n_sample_ori = -1;
  1381. t = q + 1; m = 0; // m: sample id
  1382. while ( t<end )
  1383. {
  1384. // can we skip some samples?
  1385. if ( h->keep_samples )
  1386. {
  1387. n_sample_ori++;
  1388. if ( !bit_array_test(h->keep_samples,n_sample_ori) )
  1389. {
  1390. while ( *t && t<end ) t++;
  1391. t++;
  1392. continue;
  1393. }
  1394. }
  1395. if ( m == bcf_hdr_nsamples(h) ) break;
  1396. j = 0; // j-th format field, m-th sample
  1397. while ( *t )
  1398. {
  1399. fmt_aux_t *z = &fmt[j];
  1400. if ((z->y>>4&0xf) == BCF_HT_STR) {
  1401. if (z->is_gt) { // genotypes
  1402. int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
  1403. for (l = 0;; ++t) {
  1404. if (*t == '.') ++t, x[l++] = is_phased;
  1405. else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased;
  1406. #if THOROUGH_SANITY_CHECKS
  1407. assert( 0 ); // success of strtol,strtod not checked
  1408. #endif
  1409. is_phased = (*t == '|');
  1410. if (*t == ':' || *t == 0) break;
  1411. }
  1412. if ( !l ) x[l++] = 0; // An empty field, insert missing value
  1413. for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
  1414. } else {
  1415. char *x = (char*)z->buf + z->size * m;
  1416. for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
  1417. for (; l < z->size; ++l) x[l] = 0;
  1418. }
  1419. } else if ((z->y>>4&0xf) == BCF_HT_INT) {
  1420. int32_t *x = (int32_t*)(z->buf + z->size * m);
  1421. for (l = 0;; ++t) {
  1422. if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
  1423. else x[l++] = strtol(t, &t, 10);
  1424. if (*t == ':' || *t == 0) break;
  1425. }
  1426. if ( !l ) x[l++] = bcf_int32_missing;
  1427. for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
  1428. } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
  1429. float *x = (float*)(z->buf + z->size * m);
  1430. for (l = 0;; ++t) {
  1431. if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
  1432. else x[l++] = strtod(t, &t);
  1433. if (*t == ':' || *t == 0) break;
  1434. }
  1435. if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
  1436. for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
  1437. } else abort();
  1438. if (*t == 0) {
  1439. for (++j; j < v->n_fmt; ++j) { // fill end-of-vector values
  1440. z = &fmt[j];
  1441. if ((z->y>>4&0xf) == BCF_HT_STR) {
  1442. if (z->is_gt) {
  1443. int32_t *x = (int32_t*)(z->buf + z->size * m);
  1444. x[0] = bcf_int32_missing;
  1445. for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
  1446. } else {
  1447. char *x = (char*)z->buf + z->size * m;
  1448. if ( z->size ) x[0] = '.';
  1449. for (l = 1; l < z->size; ++l) x[l] = 0;
  1450. }
  1451. } else if ((z->y>>4&0xf) == BCF_HT_INT) {
  1452. int32_t *x = (int32_t*)(z->buf + z->size * m);
  1453. x[0] = bcf_int32_missing;
  1454. for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
  1455. } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
  1456. float *x = (float*)(z->buf + z->size * m);
  1457. bcf_float_set_missing(x[0]);
  1458. for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
  1459. }
  1460. }
  1461. break;
  1462. }
  1463. else
  1464. {
  1465. if (*t == ':') ++j;
  1466. t++;
  1467. }
  1468. }
  1469. m++; t++;
  1470. }
  1471. // write individual genotype information
  1472. kstring_t *str = &v->indiv;
  1473. int i;
  1474. if (v->n_sample > 0) {
  1475. for (i = 0; i < v->n_fmt; ++i) {
  1476. fmt_aux_t *z = &fmt[i];
  1477. bcf_enc_int1(str, z->key);
  1478. if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
  1479. bcf_enc_size(str, z->size, BCF_BT_CHAR);
  1480. kputsn((char*)z->buf, z->size * v->n_sample, str);
  1481. } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
  1482. bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
  1483. } else {
  1484. bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
  1485. kputsn((char*)z->buf, z->size * v->n_sample, str);
  1486. }
  1487. }
  1488. }
  1489. if ( v->n_sample!=bcf_hdr_nsamples(h) )
  1490. {
  1491. fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
  1492. __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
  1493. v->errcode |= BCF_ERR_NCOLS;
  1494. return -1;
  1495. }
  1496. return 0;
  1497. }
  1498. int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
  1499. {
  1500. int i = 0;
  1501. char *p, *q, *r, *t;
  1502. kstring_t *str;
  1503. khint_t k;
  1504. ks_tokaux_t aux;
  1505. bcf_clear1(v);
  1506. str = &v->shared;
  1507. memset(&aux, 0, sizeof(ks_tokaux_t));
  1508. for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
  1509. q = (char*)aux.p;
  1510. *q = 0;
  1511. if (i == 0) { // CHROM
  1512. vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
  1513. k = kh_get(vdict, d, p);
  1514. if (k == kh_end(d))
  1515. {
  1516. // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
  1517. // been already printed, but will enable tools like vcfcheck to proceed.
  1518. fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
  1519. kstring_t tmp = {0,0,0};
  1520. int l;
  1521. ksprintf(&tmp, "##contig=<ID=%s,length=2147483647>", p);
  1522. bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
  1523. free(tmp.s);
  1524. if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
  1525. k = kh_get(vdict, d, p);
  1526. v->errcode = BCF_ERR_CTG_UNDEF;
  1527. }
  1528. v->rid = kh_val(d, k).id;
  1529. } else if (i == 1) { // POS
  1530. v->pos = atoi(p) - 1;
  1531. } else if (i == 2) { // ID
  1532. if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
  1533. else bcf_enc_size(str, 0, BCF_BT_CHAR);
  1534. } else if (i == 3) { // REF
  1535. bcf_enc_vchar(str, q - p, p);
  1536. v->n_allele = 1, v->rlen = q - p;
  1537. } else if (i == 4) { // ALT
  1538. if (strcmp(p, ".")) {
  1539. for (r = t = p;; ++r) {
  1540. if (*r == ',' || *r == 0) {
  1541. bcf_enc_vchar(str, r - t, t);
  1542. t = r + 1;
  1543. ++v->n_allele;
  1544. }
  1545. if (r == q) break;
  1546. }
  1547. }
  1548. } else if (i == 5) { // QUAL
  1549. if (strcmp(p, ".")) v->qual = atof(p);
  1550. else memcpy(&v->qual, &bcf_float_missing, 4);
  1551. if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR
  1552. } else if (i == 6) { // FILTER
  1553. if (strcmp(p, ".")) {
  1554. int32_t *a;
  1555. int n_flt = 1, i;
  1556. ks_tokaux_t aux1;
  1557. vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
  1558. // count the number of filters
  1559. if (*(q-1) == ';') *(q-1) = 0;
  1560. for (r = p; *r; ++r)
  1561. if (*r == ';') ++n_flt;
  1562. a = (int32_t*)alloca(n_flt * 4);
  1563. // add filters
  1564. for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
  1565. *(char*)aux1.p = 0;
  1566. k = kh_get(vdict, d, t);
  1567. if (k == kh_end(d))
  1568. {
  1569. // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
  1570. // been already printed, but will enable tools like vcfcheck to proceed.
  1571. fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t);
  1572. kstring_t tmp = {0,0,0};
  1573. int l;
  1574. ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
  1575. bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
  1576. free(tmp.s);
  1577. if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
  1578. k = kh_get(vdict, d, t);
  1579. v->errcode = BCF_ERR_TAG_UNDEF;
  1580. }
  1581. a[i++] = kh_val(d, k).id;
  1582. }
  1583. n_flt = i;
  1584. bcf_enc_vint(str, n_flt, a, -1);
  1585. } else bcf_enc_vint(str, 0, 0, -1);
  1586. if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT
  1587. } else if (i == 7) { // INFO
  1588. char *key;
  1589. vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
  1590. v->n_info = 0;
  1591. if (strcmp(p, ".")) {
  1592. if (*(q-1) == ';') *(q-1) = 0;
  1593. for (r = key = p;; ++r) {
  1594. int c;
  1595. char *val, *end;
  1596. if (*r != ';' && *r != '=' && *r != 0) continue;
  1597. val = end = 0;
  1598. c = *r; *r = 0;
  1599. if (c == '=') {
  1600. val = r + 1;
  1601. for (end = val; *end != ';' && *end != 0; ++end);
  1602. c = *end; *end = 0;
  1603. } else end = r;
  1604. k = kh_get(vdict, d, key);
  1605. if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
  1606. {
  1607. fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key);
  1608. kstring_t tmp = {0,0,0};
  1609. int l;
  1610. ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
  1611. bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
  1612. free(tmp.s);
  1613. if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
  1614. k = kh_get(vdict, d, key);
  1615. v->errcode = BCF_ERR_TAG_UNDEF;
  1616. }
  1617. uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
  1618. ++v->n_info;
  1619. bcf_enc_int1(str, kh_val(d, k).id);
  1620. if (val == 0) {
  1621. bcf_enc_size(str, 0, BCF_BT_NULL);
  1622. } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
  1623. bcf_enc_vchar(str, end - val, val);
  1624. } else { // int/float value/array
  1625. int i, n_val;
  1626. char *t, *te;
  1627. for (t = val, n_val = 1; *t; ++t) // count the number of values
  1628. if (*t == ',') ++n_val;
  1629. if ((y>>4&0xf) == BCF_HT_INT) {
  1630. int32_t *z;
  1631. z = (int32_t*)alloca(n_val<<2);
  1632. for (i = 0, t = val; i < n_val; ++i, ++t)
  1633. {
  1634. z[i] = strtol(t, &te, 10);
  1635. if ( te==t ) // conversion failed
  1636. {
  1637. z[i] = bcf_int32_missing;
  1638. while ( *te && *te!=',' ) te++;
  1639. }
  1640. t = te;
  1641. }
  1642. bcf_enc_vint(str, n_val, z, -1);
  1643. if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos;
  1644. } else if ((y>>4&0xf) == BCF_HT_REAL) {
  1645. float *z;
  1646. z = (float*)alloca(n_val<<2);
  1647. for (i = 0, t = val; i < n_val; ++i, ++t)
  1648. {
  1649. z[i] = strtod(t, &te);
  1650. if ( te==t ) // conversion failed
  1651. {
  1652. bcf_float_set_missing(z[i]);
  1653. while ( *te && *te!=',' ) te++;
  1654. }
  1655. t = te;
  1656. }
  1657. bcf_enc_vfloat(str, n_val, z);
  1658. }
  1659. }
  1660. if (c == 0) break;
  1661. r = end;
  1662. key = r + 1;
  1663. }
  1664. }
  1665. if ( v->max_unpack && !(v->max_unpack>>3) ) return 0;
  1666. } else if (i == 8) // FORMAT
  1667. return _vcf_parse_format(s, h, v, p, q);
  1668. }
  1669. return 0;
  1670. }
  1671. int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1672. {
  1673. int ret;
  1674. ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
  1675. if (ret < 0) return -1;
  1676. return vcf_parse1(&fp->line, h, v);
  1677. }
  1678. static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
  1679. {
  1680. uint8_t *ptr_start = ptr;
  1681. fmt->id = bcf_dec_typed_int1(ptr, &ptr);
  1682. fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
  1683. fmt->size = fmt->n << bcf_type_shift[fmt->type];
  1684. fmt->p = ptr;
  1685. fmt->p_off = ptr - ptr_start;
  1686. fmt->p_free = 0;
  1687. ptr += n_sample * fmt->size;
  1688. fmt->p_len = ptr - fmt->p;
  1689. return ptr;
  1690. }
  1691. static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
  1692. {
  1693. uint8_t *ptr_start = ptr;
  1694. info->key = bcf_dec_typed_int1(ptr, &ptr);
  1695. info->len = bcf_dec_size(ptr, &ptr, &info->type);
  1696. info->vptr = ptr;
  1697. info->vptr_off = ptr - ptr_start;
  1698. info->vptr_free = 0;
  1699. info->v1.i = 0;
  1700. if (info->len == 1) {
  1701. if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
  1702. else if (info->type == BCF_BT_INT32) info->v1.i = *(int32_t*)ptr;
  1703. else if (info->type == BCF_BT_FLOAT) info->v1.f = *(float*)ptr;
  1704. else if (info->type == BCF_BT_INT16) info->v1.i = *(int16_t*)ptr;
  1705. }
  1706. ptr += info->len << bcf_type_shift[info->type];
  1707. info->vptr_len = ptr - info->vptr;
  1708. return ptr;
  1709. }
  1710. int bcf_unpack(bcf1_t *b, int which)
  1711. {
  1712. if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
  1713. uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
  1714. int *offset, i;
  1715. bcf_dec_t *d = &b->d;
  1716. if (which & BCF_UN_FLT) which |= BCF_UN_STR;
  1717. if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
  1718. if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
  1719. {
  1720. kstring_t tmp;
  1721. // ID
  1722. tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
  1723. ptr_ori = ptr;
  1724. ptr = bcf_fmt_sized_array(&tmp, ptr);
  1725. b->unpack_size[0] = ptr - ptr_ori;
  1726. kputc('\0', &tmp);
  1727. d->id = tmp.s; d->m_id = tmp.m;
  1728. // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
  1729. tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
  1730. offset = (int*)alloca(b->n_allele * sizeof(int));
  1731. ptr_ori = ptr;
  1732. for (i = 0; i < b->n_allele; ++i) {
  1733. offset[i] = tmp.l;
  1734. ptr = bcf_fmt_sized_array(&tmp, ptr);
  1735. kputc('\0', &tmp);
  1736. }
  1737. b->unpack_size[1] = ptr - ptr_ori;
  1738. d->als = tmp.s; d->m_als = tmp.m;
  1739. hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
  1740. for (i = 0; i < b->n_allele; ++i)
  1741. d->allele[i] = d->als + offset[i];
  1742. b->unpacked |= BCF_UN_STR;
  1743. }
  1744. if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
  1745. ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
  1746. ptr_ori = ptr;
  1747. if (*ptr>>4) {
  1748. int type;
  1749. d->n_flt = bcf_dec_size(ptr, &ptr, &type);
  1750. hts_expand(int, d->n_flt, d->m_flt, d->flt);
  1751. for (i = 0; i < d->n_flt; ++i)
  1752. d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
  1753. } else ++ptr, d->n_flt = 0;
  1754. b->unpack_size[2] = ptr - ptr_ori;
  1755. b->unpacked |= BCF_UN_FLT;
  1756. }
  1757. if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
  1758. ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
  1759. hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
  1760. for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
  1761. for (i = 0; i < b->n_info; ++i)
  1762. ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
  1763. b->unpacked |= BCF_UN_INFO;
  1764. }
  1765. if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
  1766. ptr = (uint8_t*)b->indiv.s;
  1767. hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
  1768. for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
  1769. for (i = 0; i < b->n_fmt; ++i)
  1770. ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
  1771. b->unpacked |= BCF_UN_FMT;
  1772. }
  1773. return 0;
  1774. }
  1775. int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
  1776. {
  1777. int i;
  1778. bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
  1779. kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
  1780. kputc('\t', s); kputw(v->pos + 1, s); // POS
  1781. kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
  1782. kputc('\t', s); // REF
  1783. if (v->n_allele > 0) kputs(v->d.allele[0], s);
  1784. else kputc('.', s);
  1785. kputc('\t', s); // ALT
  1786. if (v->n_allele > 1) {
  1787. for (i = 1; i < v->n_allele; ++i) {
  1788. if (i > 1) kputc(',', s);
  1789. kputs(v->d.allele[i], s);
  1790. }
  1791. } else kputc('.', s);
  1792. kputc('\t', s); // QUAL
  1793. if (memcmp(&v->qual, &bcf_float_missing, 4) == 0) kputc('.', s); // QUAL
  1794. else ksprintf(s, "%g", v->qual);
  1795. kputc('\t', s); // FILTER
  1796. if (v->d.n_flt) {
  1797. for (i = 0; i < v->d.n_flt; ++i) {
  1798. if (i) kputc(';', s);
  1799. kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s);
  1800. }
  1801. } else kputc('.', s);
  1802. kputc('\t', s); // INFO
  1803. if (v->n_info) {
  1804. int first = 1;
  1805. for (i = 0; i < v->n_info; ++i) {
  1806. bcf_info_t *z = &v->d.info[i];
  1807. if ( !z->vptr ) continue;
  1808. if ( !first ) kputc(';', s); first = 0;
  1809. kputs(h->id[BCF_DT_ID][z->key].key, s);
  1810. if (z->len <= 0) continue;
  1811. kputc('=', s);
  1812. if (z->len == 1) {
  1813. if (z->type == BCF_BT_FLOAT) ksprintf(s, "%g", z->v1.f);
  1814. else if (z->type != BCF_BT_CHAR) kputw(z->v1.i, s);
  1815. else kputc(z->v1.i, s);
  1816. } else bcf_fmt_array(s, z->len, z->type, z->vptr);
  1817. }
  1818. if ( first ) kputc('.', s);
  1819. } else kputc('.', s);
  1820. // FORMAT and individual information
  1821. if (v->n_sample)
  1822. {
  1823. int i,j;
  1824. if ( v->n_fmt)
  1825. {
  1826. int gt_i = -1;
  1827. bcf_fmt_t *fmt = v->d.fmt;
  1828. int first = 1;
  1829. for (i = 0; i < (int)v->n_fmt; ++i) {
  1830. if ( !fmt[i].p ) continue;
  1831. kputc(!first ? ':' : '\t', s); first = 0;
  1832. if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
  1833. {
  1834. fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id);
  1835. abort();
  1836. }
  1837. kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
  1838. if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
  1839. }
  1840. if ( first ) kputs("\t.", s);
  1841. for (j = 0; j < v->n_sample; ++j) {
  1842. kputc('\t', s);
  1843. first = 1;
  1844. for (i = 0; i < (int)v->n_fmt; ++i) {
  1845. bcf_fmt_t *f = &fmt[i];
  1846. if ( !f->p ) continue;
  1847. if (!first) kputc(':', s); first = 0;
  1848. if (gt_i == i)
  1849. bcf_format_gt(f,j,s);
  1850. else
  1851. bcf_fmt_array(s, f->n, f->type, f->p + j * f->size);
  1852. }
  1853. if ( first ) kputc('.', s);
  1854. }
  1855. }
  1856. else
  1857. for (j=0; j<=v->n_sample; j++)
  1858. kputs("\t.", s);
  1859. }
  1860. kputc('\n', s);
  1861. return 0;
  1862. }
  1863. int vcf_write_line(htsFile *fp, kstring_t *line)
  1864. {
  1865. int ret;
  1866. if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
  1867. if ( fp->is_compressed==1 )
  1868. ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
  1869. else
  1870. ret = hwrite(fp->fp.hfile, line->s, line->l);
  1871. return ret==line->l ? 0 : -1;
  1872. }
  1873. int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1874. {
  1875. int ret;
  1876. fp->line.l = 0;
  1877. vcf_format1(h, v, &fp->line);
  1878. if ( fp->is_compressed==1 )
  1879. ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
  1880. else
  1881. ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
  1882. return ret==fp->line.l ? 0 : -1;
  1883. }
  1884. /************************
  1885. * Data access routines *
  1886. ************************/
  1887. int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
  1888. {
  1889. khint_t k;
  1890. vdict_t *d = (vdict_t*)h->dict[which];
  1891. k = kh_get(vdict, d, id);
  1892. return k == kh_end(d)? -1 : kh_val(d, k).id;
  1893. }
  1894. /********************
  1895. *** BCF indexing ***
  1896. ********************/
  1897. hts_idx_t *bcf_index(htsFile *fp, int min_shift)
  1898. {
  1899. int n_lvls, i;
  1900. bcf1_t *b;
  1901. hts_idx_t *idx;
  1902. bcf_hdr_t *h;
  1903. int64_t max_len = 0, s;
  1904. h = bcf_hdr_read(fp);
  1905. if ( !h ) return NULL;
  1906. int nids = 0;
  1907. for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
  1908. {
  1909. if ( !h->id[BCF_DT_CTG][i].val ) continue;
  1910. if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0];
  1911. nids++;
  1912. }
  1913. if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken.
  1914. max_len += 256;
  1915. for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
  1916. idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
  1917. b = bcf_init1();
  1918. while (bcf_read1(fp,h, b) >= 0) {
  1919. int ret;
  1920. ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
  1921. if (ret < 0)
  1922. {
  1923. bcf_destroy1(b);
  1924. hts_idx_destroy(idx);
  1925. return NULL;
  1926. }
  1927. }
  1928. hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
  1929. bcf_destroy1(b);
  1930. bcf_hdr_destroy(h);
  1931. return idx;
  1932. }
  1933. int bcf_index_build(const char *fn, int min_shift)
  1934. {
  1935. htsFile *fp;
  1936. hts_idx_t *idx;
  1937. if ((fp = hts_open(fn, "rb")) == 0) return -1;
  1938. if ( !fp->fp.bgzf->is_compressed ) { hts_close(fp); return -1; }
  1939. idx = bcf_index(fp, min_shift);
  1940. hts_close(fp);
  1941. if ( !idx ) return -1;
  1942. hts_idx_save(idx, fn, HTS_FMT_CSI);
  1943. hts_idx_destroy(idx);
  1944. return 0;
  1945. }
  1946. /*****************
  1947. *** Utilities ***
  1948. *****************/
  1949. void bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
  1950. {
  1951. int i, ndst_ori = dst->nhrec, need_sync = 0;
  1952. for (i=0; i<src->nhrec; i++)
  1953. {
  1954. if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
  1955. {
  1956. int j;
  1957. for (j=0; j<ndst_ori; j++)
  1958. {
  1959. if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
  1960. if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) && !strcmp(src->hrec[i]->value,dst->hrec[j]->value) ) break;
  1961. }
  1962. if ( j>=ndst_ori )
  1963. need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
  1964. }
  1965. else
  1966. {
  1967. bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, src->hrec[i]->vals[0]);
  1968. if ( !rec )
  1969. need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
  1970. }
  1971. }
  1972. if ( need_sync ) bcf_hdr_sync(dst);
  1973. }
  1974. int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
  1975. {
  1976. int i;
  1977. if ( line->errcode )
  1978. {
  1979. fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode);
  1980. exit(1);
  1981. }
  1982. if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
  1983. if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
  1984. {
  1985. int dict;
  1986. for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
  1987. {
  1988. src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
  1989. for (i=0; i<src_hdr->n[dict]; i++)
  1990. {
  1991. if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) )
  1992. {
  1993. src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
  1994. src_hdr->ntransl++;
  1995. }
  1996. else
  1997. src_hdr->transl[dict][i] = -1;
  1998. }
  1999. }
  2000. if ( !src_hdr->ntransl )
  2001. {
  2002. free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
  2003. free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
  2004. src_hdr->ntransl = -1;
  2005. }
  2006. if ( src_hdr->ntransl==-1 ) return 0;
  2007. }
  2008. bcf_unpack(line,BCF_UN_ALL);
  2009. // CHROM
  2010. if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
  2011. // FILTER
  2012. for (i=0; i<line->d.n_flt; i++)
  2013. {
  2014. int src_id = line->d.flt[i];
  2015. if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
  2016. line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
  2017. line->d.shared_dirty |= BCF1_DIRTY_FLT;
  2018. }
  2019. // INFO
  2020. for (i=0; i<line->n_info; i++)
  2021. {
  2022. int src_id = line->d.info[i].key;
  2023. int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
  2024. if ( dst_id<0 ) continue;
  2025. int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
  2026. int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
  2027. if ( src_size==dst_size ) // can overwrite
  2028. {
  2029. line->d.info[i].key = dst_id;
  2030. uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
  2031. if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
  2032. else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
  2033. else { *(uint32_t*)vptr = (uint32_t)dst_id; }
  2034. }
  2035. else // must realloc
  2036. {
  2037. bcf_info_t *info = &line->d.info[i];
  2038. assert( !info->vptr_free );
  2039. kstring_t str = {0,0,0};
  2040. bcf_enc_int1(&str, dst_id);
  2041. bcf_enc_size(&str, info->len,info->type);
  2042. info->vptr_off = str.l;
  2043. kputsn((char*)info->vptr, info->vptr_len, &str);
  2044. info->vptr = (uint8_t*)str.s + info->vptr_off;
  2045. info->vptr_free = 1;
  2046. info->key = dst_id;
  2047. line->d.shared_dirty |= BCF1_DIRTY_INF;
  2048. }
  2049. }
  2050. // FORMAT
  2051. for (i=0; i<line->n_fmt; i++)
  2052. {
  2053. int src_id = line->d.fmt[i].id;
  2054. int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
  2055. if ( dst_id<0 ) continue;
  2056. int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
  2057. int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
  2058. if ( src_size==dst_size ) // can overwrite
  2059. {
  2060. line->d.fmt[i].id = dst_id;
  2061. uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
  2062. if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
  2063. else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; }
  2064. else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
  2065. }
  2066. else // must realloc
  2067. {
  2068. bcf_fmt_t *fmt = &line->d.fmt[i];
  2069. assert( !fmt->p_free );
  2070. kstring_t str = {0,0,0};
  2071. bcf_enc_int1(&str, dst_id);
  2072. bcf_enc_size(&str, fmt->n, fmt->type);
  2073. fmt->p_off = str.l;
  2074. kputsn((char*)fmt->p, fmt->p_len, &str);
  2075. fmt->p = (uint8_t*)str.s + fmt->p_off;
  2076. fmt->p_free = 1;
  2077. fmt->id = dst_id;
  2078. line->d.indiv_dirty = 1;
  2079. }
  2080. }
  2081. return 0;
  2082. }
  2083. bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
  2084. {
  2085. bcf_hdr_t *hout = bcf_hdr_init("r");
  2086. char *htxt = bcf_hdr_fmt_text(hdr, 1, NULL);
  2087. bcf_hdr_parse(hout, htxt);
  2088. free(htxt);
  2089. return hout;
  2090. }
  2091. bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
  2092. {
  2093. int hlen;
  2094. char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen);
  2095. kstring_t str;
  2096. bcf_hdr_t *h;
  2097. str.l = str.m = 0; str.s = 0;
  2098. h = bcf_hdr_init("w");
  2099. bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
  2100. int j;
  2101. for (j=0; j<n; j++) imap[j] = -1;
  2102. if ( bcf_hdr_nsamples(h0) > 0) {
  2103. char *p;
  2104. int i = 0, end = n? 8 : 7;
  2105. while ((p = strstr(htxt, "#CHROM\t")) != 0)
  2106. if (p > htxt && *(p-1) == '\n') break;
  2107. while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
  2108. if (i != end) {
  2109. free(h); free(str.s);
  2110. return 0; // malformated header
  2111. }
  2112. kputsn(htxt, p - htxt, &str);
  2113. for (i = 0; i < n; ++i) {
  2114. imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
  2115. if (imap[i] < 0) continue;
  2116. kputc('\t', &str);
  2117. kputs(samples[i], &str);
  2118. }
  2119. } else kputsn(htxt, hlen, &str);
  2120. while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
  2121. kputc('\n',&str);
  2122. bcf_hdr_parse(h, str.s);
  2123. free(str.s);
  2124. free(htxt);
  2125. return h;
  2126. }
  2127. int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
  2128. {
  2129. if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
  2130. hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
  2131. if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples
  2132. int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
  2133. hdr->keep_samples = (uint8_t*) calloc(narr,1);
  2134. if ( samples[0]=='^' )
  2135. for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
  2136. int idx, n, ret = 0;
  2137. char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
  2138. if ( !smpls ) return -1;
  2139. for (i=0; i<n; i++)
  2140. {
  2141. idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
  2142. if ( idx<0 )
  2143. {
  2144. if ( !ret ) ret = i+1;
  2145. continue;
  2146. }
  2147. assert( idx<bcf_hdr_nsamples(hdr) );
  2148. if ( samples[0]=='^' )
  2149. bit_array_clear(hdr->keep_samples, idx);
  2150. else
  2151. bit_array_set(hdr->keep_samples, idx);
  2152. }
  2153. for (i=0; i<n; i++) free(smpls[i]);
  2154. free(smpls);
  2155. bcf_hdr_nsamples(hdr) = 0;
  2156. for (i=0; i<hdr->nsamples_ori; i++)
  2157. if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
  2158. if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
  2159. else
  2160. {
  2161. char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
  2162. idx = 0;
  2163. for (i=0; i<hdr->nsamples_ori; i++)
  2164. if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]);
  2165. free(hdr->samples);
  2166. hdr->samples = samples;
  2167. // delete original samples from the dictionary
  2168. vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
  2169. int k;
  2170. for (k = kh_begin(d); k != kh_end(d); ++k)
  2171. if (kh_exist(d, k)) free((char*)kh_key(d, k));
  2172. kh_destroy(vdict, d);
  2173. // add the subset back
  2174. hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict);
  2175. for (i=0; i<bcf_hdr_nsamples(hdr); i++)
  2176. {
  2177. int ignore, k = kh_put(vdict, d, hdr->samples[i], &ignore);
  2178. kh_val(d, k) = bcf_idinfo_def;
  2179. kh_val(d, k).id = kh_size(d) - 1;
  2180. }
  2181. bcf_hdr_sync(hdr);
  2182. }
  2183. return ret;
  2184. }
  2185. int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
  2186. {
  2187. kstring_t ind;
  2188. ind.s = 0; ind.l = ind.m = 0;
  2189. if (n) {
  2190. bcf_fmt_t *fmt;
  2191. int i, j;
  2192. fmt = (bcf_fmt_t*)alloca(v->n_fmt * sizeof(bcf_fmt_t));
  2193. uint8_t *ptr = (uint8_t*)v->indiv.s;
  2194. for (i = 0; i < v->n_fmt; ++i)
  2195. ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
  2196. for (i = 0; i < (int)v->n_fmt; ++i) {
  2197. bcf_fmt_t *f = &fmt[i];
  2198. bcf_enc_int1(&ind, f->id);
  2199. bcf_enc_size(&ind, f->n, f->type);
  2200. for (j = 0; j < n; ++j)
  2201. if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
  2202. }
  2203. for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
  2204. v->n_sample = i;
  2205. } else v->n_sample = 0;
  2206. if ( !v->n_sample ) v->n_fmt = 0;
  2207. free(v->indiv.s);
  2208. v->indiv = ind;
  2209. v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
  2210. return 0;
  2211. }
  2212. int bcf_is_snp(bcf1_t *v)
  2213. {
  2214. int i;
  2215. bcf_unpack(v, BCF_UN_STR);
  2216. for (i = 0; i < v->n_allele; ++i)
  2217. if (strlen(v->d.allele[i]) != 1) break;
  2218. return i == v->n_allele;
  2219. }
  2220. static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var)
  2221. {
  2222. // The most frequent case
  2223. if ( !ref[1] && !alt[1] )
  2224. {
  2225. if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
  2226. if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
  2227. var->n = 1; var->type = VCF_SNP; return;
  2228. }
  2229. const char *r = ref, *a = alt;
  2230. while (*r && *a && *r==*a ) { r++; a++; }
  2231. if ( *a && !*r )
  2232. {
  2233. while ( *a ) a++;
  2234. var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
  2235. }
  2236. else if ( *r && !*a )
  2237. {
  2238. while ( *r ) r++;
  2239. var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
  2240. }
  2241. else if ( !*r && !*a )
  2242. {
  2243. var->n = 0; var->type = VCF_REF; return;
  2244. }
  2245. const char *re = r, *ae = a;
  2246. while ( re[1] ) re++;
  2247. while ( ae[1] ) ae++;
  2248. while ( *re==*ae && re>r && ae>a ) { re--; ae--; }
  2249. if ( ae==a )
  2250. {
  2251. if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
  2252. var->n = -(re-r);
  2253. if ( *re==*ae ) { var->type = VCF_INDEL; return; }
  2254. var->type = VCF_OTHER; return;
  2255. }
  2256. else if ( re==r )
  2257. {
  2258. var->n = ae-a;
  2259. if ( *re==*ae ) { var->type = VCF_INDEL; return; }
  2260. var->type = VCF_OTHER; return;
  2261. }
  2262. var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
  2263. var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
  2264. // should do also complex events, SVs, etc...
  2265. }
  2266. static void bcf_set_variant_types(bcf1_t *b)
  2267. {
  2268. if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
  2269. bcf_dec_t *d = &b->d;
  2270. if ( d->n_var < b->n_allele )
  2271. {
  2272. d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele);
  2273. d->n_var = b->n_allele;
  2274. }
  2275. int i;
  2276. b->d.var_type = 0;
  2277. for (i=1; i<b->n_allele; i++)
  2278. {
  2279. bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
  2280. b->d.var_type |= d->var[i].type;
  2281. //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
  2282. }
  2283. }
  2284. int bcf_get_variant_types(bcf1_t *rec)
  2285. {
  2286. if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
  2287. return rec->d.var_type;
  2288. }
  2289. int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
  2290. {
  2291. if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
  2292. return rec->d.var[ith_allele].type;
  2293. }
  2294. int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
  2295. {
  2296. // Is the field already present?
  2297. int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
  2298. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
  2299. if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
  2300. for (i=0; i<line->n_info; i++)
  2301. if ( inf_id==line->d.info[i].key ) break;
  2302. bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
  2303. if ( !n || (type==BCF_HT_STR && !values) )
  2304. {
  2305. if ( inf )
  2306. {
  2307. // Mark the tag for removal, free existing memory if necessary
  2308. if ( inf->vptr_free )
  2309. {
  2310. free(inf->vptr - inf->vptr_off);
  2311. inf->vptr_free = 0;
  2312. }
  2313. line->d.shared_dirty |= BCF1_DIRTY_INF;
  2314. inf->vptr = NULL;
  2315. }
  2316. return 0;
  2317. }
  2318. // Encode the values and determine the size required to accommodate the values
  2319. kstring_t str = {0,0,0};
  2320. bcf_enc_int1(&str, inf_id);
  2321. if ( type==BCF_HT_INT )
  2322. bcf_enc_vint(&str, n, (int32_t*)values, -1);
  2323. else if ( type==BCF_HT_REAL )
  2324. bcf_enc_vfloat(&str, n, (float*)values);
  2325. else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
  2326. {
  2327. if ( values==NULL )
  2328. bcf_enc_size(&str, 0, BCF_BT_NULL);
  2329. else
  2330. bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
  2331. }
  2332. else
  2333. {
  2334. fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
  2335. abort();
  2336. }
  2337. // Is the INFO tag already present
  2338. if ( inf )
  2339. {
  2340. // Is it big enough to accommodate new block?
  2341. if ( str.l <= inf->vptr_len + inf->vptr_off )
  2342. {
  2343. if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
  2344. uint8_t *ptr = inf->vptr - inf->vptr_off;
  2345. memcpy(ptr, str.s, str.l);
  2346. free(str.s);
  2347. int vptr_free = inf->vptr_free;
  2348. bcf_unpack_info_core1(ptr, inf);
  2349. inf->vptr_free = vptr_free;
  2350. }
  2351. else
  2352. {
  2353. assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before
  2354. bcf_unpack_info_core1((uint8_t*)str.s, inf);
  2355. inf->vptr_free = 1;
  2356. line->d.shared_dirty |= BCF1_DIRTY_INF;
  2357. }
  2358. }
  2359. else
  2360. {
  2361. // The tag is not present, create new one
  2362. line->n_info++;
  2363. hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
  2364. inf = &line->d.info[line->n_info-1];
  2365. bcf_unpack_info_core1((uint8_t*)str.s, inf);
  2366. inf->vptr_free = 1;
  2367. line->d.shared_dirty |= BCF1_DIRTY_INF;
  2368. }
  2369. line->unpacked |= BCF_UN_INFO;
  2370. return 0;
  2371. }
  2372. int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
  2373. {
  2374. if ( !n )
  2375. return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
  2376. int i, max_len = 0;
  2377. for (i=0; i<n; i++)
  2378. {
  2379. int len = strlen(values[i]);
  2380. if ( len > max_len ) max_len = len;
  2381. }
  2382. char *out = (char*) malloc(max_len*n);
  2383. if ( !out ) return -2;
  2384. for (i=0; i<n; i++)
  2385. {
  2386. char *dst = out+i*max_len;
  2387. const char *src = values[i];
  2388. int j = 0;
  2389. while ( src[j] ) { dst[j] = src[j]; j++; }
  2390. for (; j<max_len; j++) dst[j] = 0;
  2391. }
  2392. int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
  2393. free(out);
  2394. return ret;
  2395. }
  2396. int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
  2397. {
  2398. // Is the field already present?
  2399. int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
  2400. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
  2401. {
  2402. if ( !n ) return 0;
  2403. return -1; // the key not present in the header
  2404. }
  2405. if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
  2406. for (i=0; i<line->n_fmt; i++)
  2407. if ( line->d.fmt[i].id==fmt_id ) break;
  2408. bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
  2409. if ( !n )
  2410. {
  2411. if ( fmt )
  2412. {
  2413. // Mark the tag for removal, free existing memory if necessary
  2414. if ( fmt->p_free )
  2415. {
  2416. free(fmt->p - fmt->p_off);
  2417. fmt->p_free = 0;
  2418. }
  2419. line->d.indiv_dirty = 1;
  2420. fmt->p = NULL;
  2421. }
  2422. return 0;
  2423. }
  2424. line->n_sample = bcf_hdr_nsamples(hdr);
  2425. int nps = n / line->n_sample; // number of values per sample
  2426. assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
  2427. // Encode the values and determine the size required to accommodate the values
  2428. kstring_t str = {0,0,0};
  2429. bcf_enc_int1(&str, fmt_id);
  2430. if ( type==BCF_HT_INT )
  2431. bcf_enc_vint(&str, n, (int32_t*)values, nps);
  2432. else if ( type==BCF_HT_REAL )
  2433. {
  2434. bcf_enc_size(&str, nps, BCF_BT_FLOAT);
  2435. kputsn((char*)values, nps*line->n_sample*sizeof(float), &str);
  2436. }
  2437. else if ( type==BCF_HT_STR )
  2438. {
  2439. bcf_enc_size(&str, nps, BCF_BT_CHAR);
  2440. kputsn((char*)values, nps*line->n_sample, &str);
  2441. }
  2442. else
  2443. {
  2444. fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
  2445. abort();
  2446. }
  2447. if ( !fmt )
  2448. {
  2449. // Not present, new format field
  2450. line->n_fmt++;
  2451. hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
  2452. // Special case: VCF specification requires that GT is always first
  2453. if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
  2454. {
  2455. for (i=line->n_fmt-1; i>0; i--)
  2456. line->d.fmt[i] = line->d.fmt[i-1];
  2457. fmt = &line->d.fmt[0];
  2458. }
  2459. else
  2460. fmt = &line->d.fmt[line->n_fmt-1];
  2461. bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
  2462. line->d.indiv_dirty = 1;
  2463. fmt->p_free = 1;
  2464. }
  2465. else
  2466. {
  2467. // The tag is already present, check if it is big enough to accomodate the new block
  2468. if ( str.l <= fmt->p_len + fmt->p_off )
  2469. {
  2470. // good, the block is big enough
  2471. if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
  2472. uint8_t *ptr = fmt->p - fmt->p_off;
  2473. memcpy(ptr, str.s, str.l);
  2474. free(str.s);
  2475. int p_free = fmt->p_free;
  2476. bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
  2477. fmt->p_free = p_free;
  2478. }
  2479. else
  2480. {
  2481. assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before
  2482. bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
  2483. fmt->p_free = 1;
  2484. line->d.indiv_dirty = 1;
  2485. }
  2486. }
  2487. line->unpacked |= BCF_UN_FMT;
  2488. return 0;
  2489. }
  2490. int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
  2491. {
  2492. if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
  2493. line->d.shared_dirty |= BCF1_DIRTY_FLT;
  2494. line->d.n_flt = n;
  2495. if ( !n ) return 0;
  2496. hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
  2497. int i;
  2498. for (i=0; i<n; i++)
  2499. line->d.flt[i] = flt_ids[i];
  2500. return 0;
  2501. }
  2502. int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
  2503. {
  2504. if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
  2505. int i;
  2506. for (i=0; i<line->d.n_flt; i++)
  2507. if ( flt_id==line->d.flt[i] ) break;
  2508. if ( i<line->d.n_flt ) return 0; // this filter is already set
  2509. line->d.shared_dirty |= BCF1_DIRTY_FLT;
  2510. if ( flt_id==0 ) // set to PASS
  2511. line->d.n_flt = 1;
  2512. else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
  2513. line->d.n_flt = 1;
  2514. else
  2515. line->d.n_flt++;
  2516. hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
  2517. line->d.flt[line->d.n_flt-1] = flt_id;
  2518. return 1;
  2519. }
  2520. int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
  2521. {
  2522. if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
  2523. int i;
  2524. for (i=0; i<line->d.n_flt; i++)
  2525. if ( flt_id==line->d.flt[i] ) break;
  2526. if ( i==line->d.n_flt ) return 0; // the filter is not present
  2527. line->d.shared_dirty |= BCF1_DIRTY_FLT;
  2528. if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,line->d.n_flt-i);
  2529. line->d.n_flt--;
  2530. if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
  2531. return 0;
  2532. }
  2533. int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
  2534. {
  2535. if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
  2536. int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
  2537. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
  2538. if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
  2539. if ( id==0 && !line->d.n_flt) return 1; // PASS
  2540. int i;
  2541. for (i=0; i<line->d.n_flt; i++)
  2542. if ( line->d.flt[i]==id ) return 1;
  2543. return 0;
  2544. }
  2545. static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
  2546. {
  2547. line->d.shared_dirty |= BCF1_DIRTY_ALS;
  2548. line->n_allele = nals;
  2549. hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
  2550. char *als = line->d.als;
  2551. int n = 0;
  2552. while (n<nals)
  2553. {
  2554. line->d.allele[n] = als;
  2555. while ( *als ) als++;
  2556. als++;
  2557. n++;
  2558. }
  2559. return 0;
  2560. }
  2561. int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
  2562. {
  2563. kstring_t tmp = {0,0,0};
  2564. char *free_old = NULL;
  2565. // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
  2566. int i;
  2567. for (i=0; i<nals; i++)
  2568. if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
  2569. if ( i==nals )
  2570. {
  2571. // all alleles point elsewhere, reuse the existing block
  2572. tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
  2573. }
  2574. else
  2575. free_old = line->d.als;
  2576. for (i=0; i<nals; i++)
  2577. {
  2578. kputs(alleles[i], &tmp);
  2579. kputc(0, &tmp);
  2580. }
  2581. line->d.als = tmp.s; line->d.m_als = tmp.m;
  2582. free(free_old);
  2583. return _bcf1_sync_alleles(hdr,line,nals);
  2584. }
  2585. int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
  2586. {
  2587. kstring_t tmp;
  2588. tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
  2589. kputs(alleles_string, &tmp);
  2590. line->d.als = tmp.s; line->d.m_als = tmp.m;
  2591. int nals = 1;
  2592. char *t = line->d.als;
  2593. while (*t)
  2594. {
  2595. if ( *t==',' ) { *t = 0; nals++; }
  2596. t++;
  2597. }
  2598. return _bcf1_sync_alleles(hdr, line, nals);
  2599. }
  2600. int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
  2601. {
  2602. kstring_t tmp;
  2603. tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
  2604. if ( id )
  2605. kputs(id, &tmp);
  2606. else
  2607. kputs(".", &tmp);
  2608. line->d.id = tmp.s; line->d.m_id = tmp.m;
  2609. line->d.shared_dirty |= BCF1_DIRTY_ID;
  2610. return 0;
  2611. }
  2612. bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
  2613. {
  2614. int i, id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
  2615. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
  2616. if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
  2617. for (i=0; i<line->n_fmt; i++)
  2618. {
  2619. if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
  2620. }
  2621. return NULL;
  2622. }
  2623. bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
  2624. {
  2625. int i, id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
  2626. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
  2627. if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
  2628. for (i=0; i<line->n_info; i++)
  2629. {
  2630. if ( line->d.info[i].key==id ) return &line->d.info[i];
  2631. }
  2632. return NULL;
  2633. }
  2634. int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
  2635. {
  2636. int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
  2637. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
  2638. if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type
  2639. if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
  2640. for (i=0; i<line->n_info; i++)
  2641. if ( line->d.info[i].key==tag_id ) break;
  2642. if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
  2643. if ( type==BCF_HT_FLAG ) return 1;
  2644. bcf_info_t *info = &line->d.info[i];
  2645. if ( type==BCF_HT_STR )
  2646. {
  2647. if ( *ndst < info->len+1 )
  2648. {
  2649. *ndst = info->len + 1;
  2650. *dst = realloc(*dst, *ndst);
  2651. }
  2652. memcpy(*dst,info->vptr,info->len);
  2653. ((uint8_t*)*dst)[info->len] = 0;
  2654. return info->len;
  2655. }
  2656. // Make sure the buffer is big enough
  2657. int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
  2658. if ( *ndst < info->len )
  2659. {
  2660. *ndst = info->len;
  2661. *dst = realloc(*dst, *ndst * size1);
  2662. }
  2663. if ( info->len == 1 )
  2664. {
  2665. if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f;
  2666. else *((int32_t*)*dst) = info->v1.i;
  2667. return 1;
  2668. }
  2669. #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \
  2670. out_type_t *tmp = (out_type_t *) *dst; \
  2671. type_t *p = (type_t *) info->vptr; \
  2672. for (j=0; j<info->len; j++) \
  2673. { \
  2674. if ( is_vector_end ) return j; \
  2675. if ( is_missing ) set_missing; \
  2676. else *tmp = p[j]; \
  2677. tmp++; \
  2678. } \
  2679. return j; \
  2680. }
  2681. switch (info->type) {
  2682. case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, int32_t); break;
  2683. case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, int32_t); break;
  2684. case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, int32_t); break;
  2685. case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break;
  2686. default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
  2687. }
  2688. #undef BRANCH
  2689. return -4; // this can never happen
  2690. }
  2691. int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
  2692. {
  2693. int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
  2694. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
  2695. if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
  2696. if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
  2697. for (i=0; i<line->n_fmt; i++)
  2698. if ( line->d.fmt[i].id==tag_id ) break;
  2699. if ( i==line->n_fmt ) return -3; // the tag is not present in this record
  2700. bcf_fmt_t *fmt = &line->d.fmt[i];
  2701. int nsmpl = bcf_hdr_nsamples(hdr);
  2702. if ( !*dst )
  2703. {
  2704. *dst = (char**) malloc(sizeof(char*)*nsmpl);
  2705. if ( !*dst ) return -4; // could not alloc
  2706. (*dst)[0] = NULL;
  2707. }
  2708. int n = (fmt->n+1)*nsmpl;
  2709. if ( *ndst < n )
  2710. {
  2711. (*dst)[0] = realloc((*dst)[0], n);
  2712. if ( !(*dst)[0] ) return -4; // could not alloc
  2713. *ndst = n;
  2714. }
  2715. for (i=0; i<nsmpl; i++)
  2716. {
  2717. uint8_t *src = fmt->p + i*fmt->n;
  2718. uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
  2719. memcpy(tmp,src,fmt->n);
  2720. tmp[fmt->n] = 0;
  2721. (*dst)[i] = (char*) tmp;
  2722. }
  2723. return n;
  2724. }
  2725. int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
  2726. {
  2727. int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
  2728. if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
  2729. if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
  2730. {
  2731. // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
  2732. if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
  2733. }
  2734. else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
  2735. if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
  2736. for (i=0; i<line->n_fmt; i++)
  2737. if ( line->d.fmt[i].id==tag_id ) break;
  2738. if ( i==line->n_fmt ) return -3; // the tag is not present in this record
  2739. bcf_fmt_t *fmt = &line->d.fmt[i];
  2740. if ( type==BCF_HT_STR )
  2741. {
  2742. int n = fmt->n*bcf_hdr_nsamples(hdr);
  2743. if ( *ndst < n )
  2744. {
  2745. *dst = realloc(*dst, n);
  2746. if ( !*dst ) return -4; // could not alloc
  2747. *ndst = n;
  2748. }
  2749. memcpy(*dst,fmt->p,n);
  2750. return n;
  2751. }
  2752. // Make sure the buffer is big enough
  2753. int nsmpl = bcf_hdr_nsamples(hdr);
  2754. int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
  2755. if ( *ndst < fmt->n*nsmpl )
  2756. {
  2757. *ndst = fmt->n*nsmpl;
  2758. *dst = realloc(*dst, *ndst*size1);
  2759. if ( !dst ) return -4; // could not alloc
  2760. }
  2761. #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
  2762. out_type_t *tmp = (out_type_t *) *dst; \
  2763. type_t *p = (type_t*) fmt->p; \
  2764. for (i=0; i<nsmpl; i++) \
  2765. { \
  2766. for (j=0; j<fmt->n; j++) \
  2767. { \
  2768. if ( is_missing ) set_missing; \
  2769. else if ( is_vector_end ) { set_vector_end; break; } \
  2770. else *tmp = p[j]; \
  2771. tmp++; \
  2772. } \
  2773. for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
  2774. p = (type_t *)((char *)p + fmt->size); \
  2775. } \
  2776. }
  2777. switch (fmt->type) {
  2778. case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
  2779. case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
  2780. case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
  2781. case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
  2782. default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
  2783. }
  2784. #undef BRANCH
  2785. return nsmpl*fmt->n;
  2786. }