kstring.c 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. #include <stdarg.h>
  2. #include <stdio.h>
  3. #include <ctype.h>
  4. #include <string.h>
  5. #include <stdint.h>
  6. #include "htslib/kstring.h"
  7. int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
  8. {
  9. va_list args;
  10. int l;
  11. va_copy(args, ap);
  12. l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
  13. va_end(args);
  14. if (l + 1 > s->m - s->l) {
  15. s->m = s->l + l + 2;
  16. kroundup32(s->m);
  17. s->s = (char*)realloc(s->s, s->m);
  18. va_copy(args, ap);
  19. l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
  20. va_end(args);
  21. }
  22. s->l += l;
  23. return l;
  24. }
  25. int ksprintf(kstring_t *s, const char *fmt, ...)
  26. {
  27. va_list ap;
  28. int l;
  29. va_start(ap, fmt);
  30. l = kvsprintf(s, fmt, ap);
  31. va_end(ap);
  32. return l;
  33. }
  34. char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
  35. {
  36. const char *p, *start;
  37. if (sep) { // set up the table
  38. if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished
  39. aux->finished = 0;
  40. if (sep[1]) {
  41. aux->sep = -1;
  42. aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
  43. for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
  44. } else aux->sep = sep[0];
  45. }
  46. if (aux->finished) return 0;
  47. else if (str) aux->p = str - 1, aux->finished = 0;
  48. if (aux->sep < 0) {
  49. for (p = start = aux->p + 1; *p; ++p)
  50. if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
  51. } else {
  52. for (p = start = aux->p + 1; *p; ++p)
  53. if (*p == aux->sep) break;
  54. }
  55. aux->p = p; // end of token
  56. if (*p == 0) aux->finished = 1; // no more tokens
  57. return (char*)start;
  58. }
  59. // s MUST BE a null terminated string; l = strlen(s)
  60. int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
  61. {
  62. int i, n, max, last_char, last_start, *offsets, l;
  63. n = 0; max = *_max; offsets = *_offsets;
  64. l = strlen(s);
  65. #define __ksplit_aux do { \
  66. if (_offsets) { \
  67. s[i] = 0; \
  68. if (n == max) { \
  69. int *tmp; \
  70. max = max? max<<1 : 2; \
  71. if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) { \
  72. offsets = tmp; \
  73. } else { \
  74. free(offsets); \
  75. *_offsets = NULL; \
  76. return 0; \
  77. } \
  78. } \
  79. offsets[n++] = last_start; \
  80. } else ++n; \
  81. } while (0)
  82. for (i = 0, last_char = last_start = 0; i <= l; ++i) {
  83. if (delimiter == 0) {
  84. if (isspace(s[i]) || s[i] == 0) {
  85. if (isgraph(last_char)) __ksplit_aux; // the end of a field
  86. } else {
  87. if (isspace(last_char) || last_char == 0) last_start = i;
  88. }
  89. } else {
  90. if (s[i] == delimiter || s[i] == 0) {
  91. if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
  92. } else {
  93. if (last_char == delimiter || last_char == 0) last_start = i;
  94. }
  95. }
  96. last_char = s[i];
  97. }
  98. *_max = max; *_offsets = offsets;
  99. return n;
  100. }
  101. /**********************
  102. * Boyer-Moore search *
  103. **********************/
  104. typedef unsigned char ubyte_t;
  105. // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
  106. static int *ksBM_prep(const ubyte_t *pat, int m)
  107. {
  108. int i, *suff, *prep, *bmGs, *bmBc;
  109. prep = (int*)calloc(m + 256, sizeof(int));
  110. bmGs = prep; bmBc = prep + m;
  111. { // preBmBc()
  112. for (i = 0; i < 256; ++i) bmBc[i] = m;
  113. for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
  114. }
  115. suff = (int*)calloc(m, sizeof(int));
  116. { // suffixes()
  117. int f = 0, g;
  118. suff[m - 1] = m;
  119. g = m - 1;
  120. for (i = m - 2; i >= 0; --i) {
  121. if (i > g && suff[i + m - 1 - f] < i - g)
  122. suff[i] = suff[i + m - 1 - f];
  123. else {
  124. if (i < g) g = i;
  125. f = i;
  126. while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
  127. suff[i] = f - g;
  128. }
  129. }
  130. }
  131. { // preBmGs()
  132. int j = 0;
  133. for (i = 0; i < m; ++i) bmGs[i] = m;
  134. for (i = m - 1; i >= 0; --i)
  135. if (suff[i] == i + 1)
  136. for (; j < m - 1 - i; ++j)
  137. if (bmGs[j] == m)
  138. bmGs[j] = m - 1 - i;
  139. for (i = 0; i <= m - 2; ++i)
  140. bmGs[m - 1 - suff[i]] = m - 1 - i;
  141. }
  142. free(suff);
  143. return prep;
  144. }
  145. void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
  146. {
  147. int i, j, *prep = 0, *bmGs, *bmBc;
  148. const ubyte_t *str, *pat;
  149. str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
  150. prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
  151. if (_prep && *_prep == 0) *_prep = prep;
  152. bmGs = prep; bmBc = prep + m;
  153. j = 0;
  154. while (j <= n - m) {
  155. for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
  156. if (i >= 0) {
  157. int max = bmBc[str[i+j]] - m + 1 + i;
  158. if (max < bmGs[i]) max = bmGs[i];
  159. j += max;
  160. } else return (void*)(str + j);
  161. }
  162. if (_prep == 0) free(prep);
  163. return 0;
  164. }
  165. char *kstrstr(const char *str, const char *pat, int **_prep)
  166. {
  167. return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
  168. }
  169. char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
  170. {
  171. return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
  172. }
  173. /***********************
  174. * The main() function *
  175. ***********************/
  176. #ifdef KSTRING_MAIN
  177. #include <stdio.h>
  178. int main()
  179. {
  180. kstring_t *s;
  181. int *fields, n, i;
  182. ks_tokaux_t aux;
  183. char *p;
  184. s = (kstring_t*)calloc(1, sizeof(kstring_t));
  185. // test ksprintf()
  186. ksprintf(s, " abcdefg: %d ", 100);
  187. printf("'%s'\n", s->s);
  188. // test ksplit()
  189. fields = ksplit(s, 0, &n);
  190. for (i = 0; i < n; ++i)
  191. printf("field[%d] = '%s'\n", i, s->s + fields[i]);
  192. // test kstrtok()
  193. s->l = 0;
  194. for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
  195. kputsn(p, aux.p - p, s);
  196. kputc('\n', s);
  197. }
  198. printf("%s", s->s);
  199. // free
  200. free(s->s); free(s); free(fields);
  201. {
  202. static char *str = "abcdefgcdgcagtcakcdcd";
  203. static char *pat = "cd";
  204. char *ret, *s = str;
  205. int *prep = 0;
  206. while ((ret = kstrstr(s, pat, &prep)) != 0) {
  207. printf("match: %s\n", ret);
  208. s = ret + prep[0];
  209. }
  210. free(prep);
  211. }
  212. return 0;
  213. }
  214. #endif