mFILE.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634
  1. /*
  2. Copyright (c) 2005-2006, 2008-2009, 2013 Genome Research Ltd.
  3. Author: James Bonfield <jkb@sanger.ac.uk>
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are met:
  6. 1. Redistributions of source code must retain the above copyright notice,
  7. this list of conditions and the following disclaimer.
  8. 2. Redistributions in binary form must reproduce the above copyright notice,
  9. this list of conditions and the following disclaimer in the documentation
  10. and/or other materials provided with the distribution.
  11. 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
  12. Institute nor the names of its contributors may be used to endorse or promote
  13. products derived from this software without specific prior written permission.
  14. THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
  15. ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  16. WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  17. DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
  18. FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  20. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  21. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  22. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #ifdef HAVE_CONFIG_H
  26. #include "io_lib_config.h"
  27. #endif
  28. #include <stdio.h>
  29. #include <stdlib.h>
  30. #include <errno.h>
  31. #include <string.h>
  32. #include <sys/types.h>
  33. #include <sys/stat.h>
  34. #include <fcntl.h>
  35. #include <unistd.h>
  36. #include <stdarg.h>
  37. #include "cram/os.h"
  38. #include "cram/mFILE.h"
  39. #include "cram/vlen.h"
  40. /*
  41. * This file contains memory-based versions of the most commonly used
  42. * (by io_lib) stdio functions.
  43. *
  44. * Actual file IO takes place either on opening or closing an mFILE.
  45. *
  46. * Coupled to this are a bunch of rather scary macros which can be obtained
  47. * by including stdio_hack.h. It is recommended though that you use mFILE.h
  48. * instead and replace fopen with mfopen (etc). This is more or less
  49. * mandatory if you wish to use both FILE and mFILE structs in a single file.
  50. */
  51. static mFILE *m_channel[3]; /* stdin, stdout and stderr fakes */
  /*
   * Reads the remainder of fp into a freshly allocated buffer.
   *
   * If 'fn' is non-NULL and can be stat()ed with a non-zero size, that size
   * is used to allocate the buffer once and to stop reading as soon as that
   * many bytes have been loaded. Otherwise (no name, stat failure, or a
   * reported size of zero) the stream is read in 8192 byte steps until EOF.
   *
   * 'binary' is only used on Windows, where it selects binary or text mode
   * for the underlying descriptor before reading.
   *
   * Returns a malloced buffer on success, with the byte count in *size.
   *         The buffer is non-NULL even when zero bytes were read; the
   *         caller owns it and must free() it.
   *         NULL on failure (out of memory or a read error), in which case
   *         *size is set to 0.
   */
  static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) {
      struct stat sb;
      char *data = NULL;
      size_t allocated = 0, used = 0;
      size_t bufsize = 8192;
      size_t expected = 0;
      int sized = 0;

  #ifdef _WIN32
      if (binary)
          _setmode(_fileno(fp), _O_BINARY);
      else
          _setmode(_fileno(fp), _O_TEXT);
  #else
      (void)binary;
  #endif

      *size = 0;

      if (fn && -1 != stat(fn, &sb) && sb.st_size > 0) {
          /* Known size: one allocation and, normally, one read */
          expected = (size_t)sb.st_size;
          bufsize = expected;
          sized = 1;
      }

      do {
          size_t len;

          if (used + bufsize > allocated) {
              /* Keep the old pointer until realloc is known to have worked */
              char *grown = realloc(data, allocated + bufsize);
              if (NULL == grown) {
                  free(data);
                  return NULL;
              }
              data = grown;
              allocated += bufsize;
          }

          len = fread(data + used, 1, allocated - used, fp);
          used += len;

          /* A read error never sets EOF, so it must end the loop explicitly */
          if (ferror(fp)) {
              free(data);
              return NULL;
          }
      } while (!feof(fp) && (!sized || used < expected));

      *size = used;
      return data;
  }
  91. /*
  92. * Creates and returns m_channel[0].
  93. * We initialise this on the first attempted read, which then slurps in
  94. * all of stdin until EOF is met.
  95. */
  96. mFILE *mstdin(void) {
  97. if (m_channel[0])
  98. return m_channel[0];
  99. m_channel[0] = mfcreate(NULL, 0);
  100. if (NULL == m_channel[0]) return NULL;
  101. m_channel[0]->fp = stdin;
  102. return m_channel[0];
  103. }
  104. static void init_mstdin(void) {
  105. static int done_stdin = 0;
  106. if (done_stdin)
  107. return;
  108. m_channel[0]->data = mfload(stdin, NULL, &m_channel[0]->size, 1);
  109. m_channel[0]->mode = MF_READ;
  110. done_stdin = 1;
  111. }
  112. /*
  113. * Creates and returns m_channel[1]. This is the fake for stdout. It starts as
  114. * an empty buffer which is physically written out only when mfflush or
  115. * mfclose are called.
  116. */
  117. mFILE *mstdout(void) {
  118. if (m_channel[1])
  119. return m_channel[1];
  120. m_channel[1] = mfcreate(NULL, 0);
  121. if (NULL == m_channel[1]) return NULL;
  122. m_channel[1]->fp = stdout;
  123. m_channel[1]->mode = MF_WRITE;
  124. return m_channel[1];
  125. }
  126. /*
  127. * Stderr as an mFILE.
  128. * The code handles stderr by returning m_channel[2], but also checking
  129. * for stderr in fprintf (the common usage of it) to auto-flush.
  130. */
  131. mFILE *mstderr(void) {
  132. if (m_channel[2])
  133. return m_channel[2];
  134. m_channel[2] = mfcreate(NULL, 0);
  135. if (NULL == m_channel[2]) return NULL;
  136. m_channel[2]->fp = stderr;
  137. m_channel[2]->mode = MF_WRITE;
  138. return m_channel[2];
  139. }
  140. /*
  141. * For creating existing mFILE pointers directly from memory buffers.
  142. */
  143. mFILE *mfcreate(char *data, int size) {
  144. mFILE *mf = (mFILE *)malloc(sizeof(*mf));
  145. if (NULL == mf) return NULL;
  146. mf->fp = NULL;
  147. mf->data = data;
  148. mf->alloced = size;
  149. mf->size = size;
  150. mf->eof = 0;
  151. mf->offset = 0;
  152. mf->flush_pos = 0;
  153. mf->mode = MF_READ | MF_WRITE;
  154. return mf;
  155. }
  156. /*
  157. * Recreate an existing mFILE to house new data/size.
  158. * It also rewinds the file.
  159. */
  160. void mfrecreate(mFILE *mf, char *data, int size) {
  161. if (mf->data)
  162. free(mf->data);
  163. mf->data = data;
  164. mf->size = size;
  165. mf->alloced = size;
  166. mf->eof = 0;
  167. mf->offset = 0;
  168. mf->flush_pos = 0;
  169. }
  170. /*
  171. * Creates a new mFILE to contain the contents of the FILE pointer.
  172. * This mFILE is purely for in-memory operations and has no links to the
  173. * original FILE* it came from. It also doesn't close the FILE pointer.
  174. * Consider using mfreopen() is you need different behaviour.
  175. *
  176. * Returns mFILE * on success
  177. * NULL on failure.
  178. */
  179. mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp) {
  180. mFILE *mf;
  181. /* Open using mfreopen() */
  182. if (NULL == (mf = mfreopen(path, mode_str, fp)))
  183. return NULL;
  184. /* Disassociate from the input stream */
  185. mf->fp = NULL;
  186. return mf;
  187. }
  188. /*
  189. * Converts a FILE * to an mFILE *.
  190. * Use this for wrapper functions to turn external prototypes requring
  191. * FILE * as an argument into internal code using mFILE *.
  192. */
  193. mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) {
  194. mFILE *mf;
  195. int r = 0, w = 0, a = 0, b = 0, x = 0, mode = 0;
  196. /* Parse mode:
  197. * r = read file contents (if truncated => don't read)
  198. * w = write on close
  199. * a = position at end of buffer
  200. * x = position at same location as the original fp, don't seek on flush
  201. */
  202. if (strchr(mode_str, 'r'))
  203. r = 1, mode |= MF_READ;
  204. if (strchr(mode_str, 'w'))
  205. w = 1, mode |= MF_WRITE | MF_TRUNC;
  206. if (strchr(mode_str, 'a'))
  207. w = a = 1, mode |= MF_WRITE | MF_APPEND;
  208. if (strchr(mode_str, 'b'))
  209. b = 1, mode |= MF_BINARY;
  210. if (strchr(mode_str, 'x'))
  211. x = 1;
  212. if (strchr(mode_str, '+')) {
  213. w = 1, mode |= MF_READ | MF_WRITE;
  214. if (a)
  215. r = 1;
  216. }
  217. if (r) {
  218. mf = mfcreate(NULL, 0);
  219. if (NULL == mf) return NULL;
  220. if (!(mode & MF_TRUNC)) {
  221. mf->data = mfload(fp, path, &mf->size, b);
  222. mf->alloced = mf->size;
  223. if (!a)
  224. fseek(fp, 0, SEEK_SET);
  225. }
  226. } else if (w) {
  227. /* Write - initialise the data structures */
  228. mf = mfcreate(NULL, 0);
  229. if (NULL == mf) return NULL;
  230. } else {
  231. fprintf(stderr, "Must specify either r, w or a for mode\n");
  232. return NULL;
  233. }
  234. mf->fp = fp;
  235. mf->mode = mode;
  236. if (x) {
  237. mf->mode |= MF_MODEX;
  238. }
  239. if (a) {
  240. mf->flush_pos = mf->size;
  241. fseek(fp, 0, SEEK_END);
  242. }
  243. return mf;
  244. }
  245. /*
  246. * Opens a file. If we have read access (r or a+) then it loads the entire
  247. * file into memory. If We have write access then the pathname is stored.
  248. * We do not actually write until an mfclose, which then checks this pathname.
  249. */
  250. mFILE *mfopen(const char *path, const char *mode) {
  251. FILE *fp;
  252. if (NULL == (fp = fopen(path, mode)))
  253. return NULL;
  254. return mfreopen(path, mode, fp);
  255. }
  256. /*
  257. * Closes an mFILE. If the filename is known (implying write access) then this
  258. * also writes the data to disk.
  259. *
  260. * Stdout is handled by calling mfflush which writes to stdout if appropriate.
  261. */
  262. int mfclose(mFILE *mf) {
  263. if (!mf)
  264. return -1;
  265. mfflush(mf);
  266. if (mf->fp)
  267. fclose(mf->fp);
  268. mfdestroy(mf);
  269. return 0;
  270. }
  271. /*
  272. * Closes the file pointer contained within the mFILE without destroying
  273. * the in-memory data.
  274. */
  275. int mfdetach(mFILE *mf) {
  276. if (!mf)
  277. return -1;
  278. mfflush(mf);
  279. if (mf->fp) {
  280. fclose(mf->fp);
  281. mf->fp = NULL;
  282. }
  283. return 0;
  284. }
  285. /*
  286. * Destroys an mFILE structure but does not flush or close it
  287. */
  288. int mfdestroy(mFILE *mf) {
  289. if (!mf)
  290. return -1;
  291. if (mf->data)
  292. free(mf->data);
  293. free(mf);
  294. return 0;
  295. }
  296. /*
  297. * Steals that data out of an mFILE. The mFILE itself will be closed.
  298. * It is up to the caller to free the stolen buffer. If size_out is
  299. * not NULL, mf->size will be stored in it.
  300. * This is more-or-less the opposite of mfcreate().
  301. */
  302. void *mfsteal(mFILE *mf, size_t *size_out) {
  303. void *data;
  304. if (!mf) return NULL;
  305. data = mf->data;
  306. if (NULL != size_out) *size_out = mf->size;
  307. mfdetach(mf);
  308. mf->data = NULL;
  309. mfdestroy(mf);
  310. return data;
  311. }
  312. /*
  313. * Seek/tell functions. Nothing more than updating and reporting an
  314. * in-memory index. NB we can seek on stdin or stdout even provided we
  315. * haven't been flushing.
  316. */
  317. int mfseek(mFILE *mf, long offset, int whence) {
  318. switch (whence) {
  319. case SEEK_SET:
  320. mf->offset = offset;
  321. break;
  322. case SEEK_CUR:
  323. mf->offset += offset;
  324. break;
  325. case SEEK_END:
  326. mf->offset = mf->size + offset;
  327. break;
  328. default:
  329. errno = EINVAL;
  330. return -1;
  331. }
  332. mf->eof = 0;
  333. return 0;
  334. }
  335. long mftell(mFILE *mf) {
  336. return mf->offset;
  337. }
  338. void mrewind(mFILE *mf) {
  339. mf->offset = 0;
  340. mf->eof = 0;
  341. }
  342. /*
  343. * mftruncate is not directly a translation of ftruncate as the latter
  344. * takes a file descriptor instead of a FILE *. It performs the analogous
  345. * role though.
  346. *
  347. * If offset is -1 then the file is truncated to be the current file
  348. * offset.
  349. */
  350. void mftruncate(mFILE *mf, long offset) {
  351. mf->size = offset != -1 ? offset : mf->offset;
  352. if (mf->offset > mf->size)
  353. mf->offset = mf->size;
  354. }
  355. int mfeof(mFILE *mf) {
  356. return mf->eof;
  357. }
  358. /*
  359. * mFILE read/write functions. Basically these turn fread/fwrite syntax
  360. * into memcpy statements, with appropriate memory handling for writing.
  361. */
  362. size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
  363. size_t len;
  364. char *cptr = (char *)ptr;
  365. if (mf == m_channel[0]) init_mstdin();
  366. if (mf->size <= mf->offset)
  367. return 0;
  368. len = size * nmemb <= mf->size - mf->offset
  369. ? size * nmemb
  370. : mf->size - mf->offset;
  371. if (!size)
  372. return 0;
  373. memcpy(cptr, &mf->data[mf->offset], len);
  374. mf->offset += len;
  375. if (len != size * nmemb) {
  376. mf->eof = 1;
  377. }
  378. return len / size;
  379. }
  380. size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
  381. if (!(mf->mode & MF_WRITE))
  382. return 0;
  383. /* Append mode => forced all writes to end of file */
  384. if (mf->mode & MF_APPEND)
  385. mf->offset = mf->size;
  386. /* Make sure we have enough room */
  387. while (size * nmemb + mf->offset > mf->alloced) {
  388. size_t new_alloced = mf->alloced ? mf->alloced * 2 : 1024;
  389. void * new_data = realloc(mf->data, new_alloced);
  390. if (NULL == new_data) return 0;
  391. mf->alloced = new_alloced;
  392. mf->data = new_data;
  393. }
  394. /* Record where we need to reflush from */
  395. if (mf->offset < mf->flush_pos)
  396. mf->flush_pos = mf->offset;
  397. /* Copy the data over */
  398. memcpy(&mf->data[mf->offset], ptr, size * nmemb);
  399. mf->offset += size * nmemb;
  400. if (mf->size < mf->offset)
  401. mf->size = mf->offset;
  402. return nmemb;
  403. }
  404. int mfgetc(mFILE *mf) {
  405. if (mf == m_channel[0]) init_mstdin();
  406. if (mf->offset < mf->size) {
  407. return (unsigned char)mf->data[mf->offset++];
  408. }
  409. mf->eof = 1;
  410. return -1;
  411. }
  412. int mungetc(int c, mFILE *mf) {
  413. if (mf->offset > 0) {
  414. mf->data[--mf->offset] = c;
  415. return c;
  416. }
  417. mf->eof = 1;
  418. return -1;
  419. }
  420. char *mfgets(char *s, int size, mFILE *mf) {
  421. int i;
  422. if (mf == m_channel[0]) init_mstdin();
  423. *s = 0;
  424. for (i = 0; i < size-1;) {
  425. if (mf->offset < mf->size) {
  426. s[i] = mf->data[mf->offset++];
  427. if (s[i++] == '\n')
  428. break;
  429. } else {
  430. mf->eof = 1;
  431. break;
  432. }
  433. }
  434. s[i] = 0;
  435. return i ? s : NULL;
  436. }
  437. /*
  438. * Flushes an mFILE. If this is a real open of a file in write mode then
  439. * mFILE->fp will be set. We then write out any new data in mFILE since the
  440. * last flush. We cannot tell what may have been modified as we don't keep
  441. * track of that, so we typically rewrite out the entire file contents between
  442. * the last flush_pos and the end of file.
  443. *
  444. * For stderr/stdout we also reset the offsets so we cannot modify things
  445. * we've already output.
  446. */
  447. int mfflush(mFILE *mf) {
  448. if (!mf->fp)
  449. return 0;
  450. /* FIXME: only do this when opened in write mode */
  451. if (mf == m_channel[1] || mf == m_channel[2]) {
  452. if (mf->flush_pos < mf->size) {
  453. size_t bytes = mf->size - mf->flush_pos;
  454. if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
  455. return -1;
  456. if (0 != fflush(mf->fp))
  457. return -1;
  458. }
  459. /* Stdout & stderr are non-seekable streams so throw away the data */
  460. mf->offset = mf->size = mf->flush_pos = 0;
  461. }
  462. /* only flush when opened in write mode */
  463. if (mf->mode & MF_WRITE) {
  464. if (mf->flush_pos < mf->size) {
  465. size_t bytes = mf->size - mf->flush_pos;
  466. if (!(mf->mode & MF_MODEX)) {
  467. fseek(mf->fp, mf->flush_pos, SEEK_SET);
  468. }
  469. if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
  470. return -1;
  471. if (0 != fflush(mf->fp))
  472. return -1;
  473. }
  474. if (ftell(mf->fp) != -1 &&
  475. ftruncate(fileno(mf->fp), ftell(mf->fp)) == -1)
  476. return -1;
  477. mf->flush_pos = mf->size;
  478. }
  479. return 0;
  480. }
  481. /*
  482. * A wrapper around vsprintf() to write to an mFILE. This also uses vflen() to
  483. * estimate how many additional bytes of storage will be required for the
  484. * vsprintf to work.
  485. */
  486. int mfprintf(mFILE *mf, char *fmt, ...) {
  487. int ret;
  488. size_t est_length;
  489. va_list args;
  490. va_start(args, fmt);
  491. est_length = vflen(fmt, args);
  492. va_end(args);
  493. while (est_length + mf->offset > mf->alloced) {
  494. size_t new_alloced = mf->alloced ? mf->alloced * 2 : 1024;
  495. void * new_data = realloc(mf->data, new_alloced);
  496. if (NULL == new_data) return -1;
  497. mf->alloced = new_alloced;
  498. mf->data = new_data;
  499. }
  500. va_start(args, fmt);
  501. ret = vsprintf(&mf->data[mf->offset], fmt, args);
  502. va_end(args);
  503. if (ret > 0) {
  504. mf->offset += ret;
  505. if (mf->size < mf->offset)
  506. mf->size = mf->offset;
  507. }
  508. if (mf->fp == stderr) {
  509. /* Auto-flush for stderr */
  510. if (0 != mfflush(mf)) return -1;
  511. }
  512. return ret;
  513. }
  514. /*
  515. * Converts an mFILE from binary to ascii mode by replacing all
  516. * cr-nl with nl.
  517. *
  518. * Primarily used on windows when we've uncompressed a binary file which
  519. * happens to be a text file (eg Experiment File). Previously we would have
  520. * seeked back to the start and used _setmode(fileno(fp), _O_TEXT).
  521. *
  522. * Side effect: resets offset and flush_pos back to the start.
  523. */
  524. void mfascii(mFILE *mf) {
  525. size_t p1, p2;
  526. for (p1 = p2 = 1; p1 < mf->size; p1++, p2++) {
  527. if (mf->data[p1] == '\n' && mf->data[p1-1] == '\r') {
  528. p2--; /* delete the \r */
  529. }
  530. mf->data[p2] = mf->data[p1];
  531. }
  532. mf->size = p2;
  533. mf->offset = mf->flush_pos = 0;
  534. }