/* $NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $ */ /*- * Copyright (c) 2014 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Taylor R. Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __RCSID("$NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $"); #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "utils.h" #include "offtab.h" static void __printflike(1,2) __dead offtab_bug(const char *fmt, ...) { errx(1, "bug in offtab, please report"); } static void __printflike(1,2) __dead offtab_bugx(const char *fmt, ...) 
{ errx(1, "bug in offtab, please report"); } static uint32_t offtab_compute_window_size(struct offtab *offtab, uint32_t start) { assert(start < offtab->ot_n_offsets); return MIN(offtab->ot_window_size, (offtab->ot_n_offsets - start)); } static uint32_t offtab_current_window_size(struct offtab *offtab) { return offtab_compute_window_size(offtab, offtab->ot_window_start); } static uint32_t offtab_current_window_end(struct offtab *offtab) { assert(offtab->ot_window_start < offtab->ot_n_offsets); assert(offtab_current_window_size(offtab) <= (offtab->ot_n_offsets - offtab->ot_window_start)); return (offtab->ot_window_start + offtab_current_window_size(offtab)); } static void offtab_compute_window_position(struct offtab *offtab, uint32_t window_start, size_t *bytes, off_t *pos) { const uint32_t window_size = offtab_compute_window_size(offtab, window_start); __CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t))); *bytes = (window_size * sizeof(uint64_t)); assert(window_start <= offtab->ot_n_offsets); __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); const off_t window_offset = ((off_t)window_start * (off_t)sizeof(uint64_t)); assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS); __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, (off_t)MAX_N_OFFSETS*sizeof(uint64_t))); assert(ADD_OK(off_t, offtab->ot_fdpos, window_offset)); *pos = (offtab->ot_fdpos + window_offset); } #define OFFTAB_READ_SEEK 0x01 #define OFFTAB_READ_NOSEEK 0x00 static bool offtab_read_window(struct offtab *offtab, uint32_t blkno, int read_flags) { const uint32_t window_start = rounddown(blkno, offtab->ot_window_size); size_t window_bytes; off_t window_pos; assert(offtab->ot_mode == OFFTAB_MODE_READ); assert(ISSET(read_flags, OFFTAB_READ_SEEK) || (lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) || ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE))); offtab_compute_window_position(offtab, window_start, &window_bytes, &window_pos); const ssize_t n_read = (ISSET(read_flags, 
OFFTAB_READ_SEEK) ? pread_block(offtab->ot_fd, offtab->ot_window, window_bytes, window_pos) : read_block(offtab->ot_fd, offtab->ot_window, window_bytes)); if (n_read == -1) { (*offtab->ot_report)("read offset table at %"PRIuMAX, (uintmax_t)window_pos); return false; } assert(n_read >= 0); if ((size_t)n_read != window_bytes) { (*offtab->ot_reportx)("partial read of offset table" " at %"PRIuMAX": %zu != %zu", (uintmax_t)window_pos, (size_t)n_read, window_bytes); return false; } offtab->ot_window_start = window_start; return true; } static bool offtab_maybe_read_window(struct offtab *offtab, uint32_t blkno, int read_flags) { /* Don't bother if blkno is already in the window. */ if ((offtab->ot_window_start <= blkno) && (blkno < offtab_current_window_end(offtab))) return true; if (!offtab_read_window(offtab, blkno, read_flags)) return false; return true; } static void offtab_write_window(struct offtab *offtab) { size_t window_bytes; off_t window_pos; assert(offtab->ot_mode == OFFTAB_MODE_WRITE); offtab_compute_window_position(offtab, offtab->ot_window_start, &window_bytes, &window_pos); const ssize_t n_written = pwrite(offtab->ot_fd, offtab->ot_window, window_bytes, window_pos); if (n_written == -1) err_ss(1, "write initial offset table"); assert(n_written >= 0); if ((size_t)n_written != window_bytes) errx_ss(1, "partial write of initial offset bytes: %zu <= %zu", (size_t)n_written, window_bytes); } static void offtab_maybe_write_window(struct offtab *offtab, uint32_t start, uint32_t end) { /* Don't bother if [start, end) does not cover our window. */ if (end <= offtab->ot_window_start) return; if (offtab_current_window_end(offtab) < start) return; offtab_write_window(offtab); } /* * Initialize an offtab to support the specified number of offsets read * to or written from fd at byte position fdpos. 
*/ void offtab_init(struct offtab *offtab, uint32_t n_offsets, uint32_t window_size, int fd, off_t fdpos) { assert(offtab != NULL); assert(0 < n_offsets); assert(0 <= fd); assert(0 <= fdpos); assert(fdpos <= OFFTAB_MAX_FDPOS); offtab->ot_n_offsets = n_offsets; if ((window_size == 0) || (n_offsets < window_size)) offtab->ot_window_size = n_offsets; else offtab->ot_window_size = window_size; assert(offtab->ot_window_size <= offtab->ot_n_offsets); offtab->ot_window_start = (uint32_t)-1; __CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t))); offtab->ot_window = malloc(offtab->ot_window_size * sizeof(uint64_t)); if (offtab->ot_window == NULL) err(1, "malloc offset table"); offtab->ot_blkno = (uint32_t)-1; offtab->ot_fd = fd; offtab->ot_fdpos = fdpos; offtab->ot_report = &offtab_bug; offtab->ot_reportx = &offtab_bugx; offtab->ot_mode = OFFTAB_MODE_NONE; } /* * Destroy an offtab. */ void offtab_destroy(struct offtab *offtab) { free(offtab->ot_window); } /* * For an offtab that has been used to read data from disk, convert it * to an offtab that can be used to write subsequent data to disk. * blkno is the last valid blkno read from disk. */ bool offtab_transmogrify_read_to_write(struct offtab *offtab, uint32_t blkno) { assert(offtab->ot_mode == OFFTAB_MODE_READ); assert(0 < blkno); if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK)) return false; offtab->ot_mode = OFFTAB_MODE_WRITE; offtab->ot_blkno = blkno; return true; } /* * Reset an offtab for reading an offset table from the beginning. * Initializes in-memory state and may read data from offtab->ot_fd, * which must currently be at byte position offtab->ot_fdpos. Failure * will be reported by the report/reportx routines, which are called * like warn/warnx. May fail; returns true on success, false on * failure. * * This almost has copypasta of offtab_prepare_get, but this uses read, * rather than pread, so that it will work on nonseekable input if the * window is the whole offset table. 
*/ bool offtab_reset_read(struct offtab *offtab, void (*report)(const char *, ...) __printflike(1,2), void (*reportx)(const char *, ...) __printflike(1,2)) { assert((lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) || ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE))); offtab->ot_report = report; offtab->ot_reportx = reportx; offtab->ot_mode = OFFTAB_MODE_READ; offtab->ot_blkno = (uint32_t)-1; if (!offtab_read_window(offtab, 0, OFFTAB_READ_NOSEEK)) return false; if (offtab->ot_window_size < offtab->ot_n_offsets) { __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets * (off_t)sizeof(uint64_t)); assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS); __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, (off_t)MAX_N_OFFSETS*sizeof(uint64_t))); assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes)); const off_t first_offset = (offtab->ot_fdpos + offtab_bytes); if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) { (*offtab->ot_report)("lseek to first offset 0x%"PRIx64, first_offset); return false; } } return true; } /* * Do any I/O or bookkeeping necessary to fetch the offset for blkno in * preparation for a call to offtab_get. May fail; returns true on * success, false on failure. */ bool offtab_prepare_get(struct offtab *offtab, uint32_t blkno) { assert(offtab->ot_mode == OFFTAB_MODE_READ); assert(blkno < offtab->ot_n_offsets); if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK)) return false; assert(offtab->ot_window_start <= blkno); assert(blkno < offtab_current_window_end(offtab)); offtab->ot_blkno = blkno; return true; } /* * Return the offset for blkno. Caller must have called * offtab_prepare_get beforehand. 
*/ uint64_t offtab_get(struct offtab *offtab, uint32_t blkno) { assert(offtab->ot_mode == OFFTAB_MODE_READ); assert(blkno == offtab->ot_blkno); assert(offtab->ot_window_start <= blkno); assert(blkno < offtab_current_window_end(offtab)); return be64toh(offtab->ot_window[blkno - offtab->ot_window_start]); } /* * Reset offtab for writing a fresh offset table. Initializes * in-memory state and writes an empty offset table to offtab->ot_fd, * which must currently be at byte position offtab->ot_fdpos. May * fail; returns on success, aborts with err(3) on failure. */ void offtab_reset_write(struct offtab *offtab) { uint32_t i; assert(lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos); offtab->ot_mode = OFFTAB_MODE_WRITE; offtab->ot_blkno = (uint32_t)-1; /* * Initialize the offset table to all ones (except for the * fixed first offset) so that we can easily detect where we * were interrupted if we want to restart. */ __CTASSERT(MAX_N_OFFSETS <= UINT32_MAX); assert(offtab->ot_n_offsets > 0); /* Initialize window of all ones. */ for (i = 0; i < offtab->ot_window_size; i++) offtab->ot_window[i] = ~(uint64_t)0; /* Write the window to every position in the table. */ const uint32_t n_windows = howmany(offtab->ot_n_offsets, offtab->ot_window_size); for (i = 1; i < n_windows; i++) { /* Change the start but reuse the all-ones buffer. */ offtab->ot_window_start = (i * offtab->ot_window_size); offtab_write_window(offtab); } /* Compute the number of bytes in the offset table. */ __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets * sizeof(uint64_t)); /* Compute the offset of the first block. */ assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS); __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, MAX_N_OFFSETS*sizeof(uint64_t))); assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes)); const off_t first_offset = (offtab->ot_fdpos + offtab_bytes); /* Assert that it fits in 64 bits. 
*/ __CTASSERT(MUL_OK(uint64_t, MAX_N_OFFSETS, sizeof(uint64_t))); __CTASSERT(ADD_OK(uint64_t, OFFTAB_MAX_FDPOS, (uint64_t)MAX_N_OFFSETS*sizeof(uint64_t))); /* Write out the first window with the first offset. */ offtab->ot_window_start = 0; offtab->ot_window[0] = htobe64((uint64_t)first_offset); offtab_write_window(offtab); if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) err(1, "lseek to first offset failed"); } /* * Guarantee that the disk reflects block offsets [0, n_offsets). If * OFFTAB_CHECKPOINT_SYNC is set in flags, will also fsync the entire * offset table. May fail; returns on success, aborts with err(3) on * failure. Fsync failure is considered success but is reported with a * warning. * * This routine does not write state in memory, and does not read state * that is not signal-safe. The only state read is offtab->ot_window, * offtab->ot_window_start, and quantities that are static for the * signal-interruptable existence of the offset table. */ void offtab_checkpoint(struct offtab *offtab, uint32_t n_offsets, int flags) { assert(offtab->ot_mode == OFFTAB_MODE_WRITE); assert(n_offsets <= offtab->ot_n_offsets); /* * Write the window unless we just did that and were * interrupted before we could move the window. */ if (offtab->ot_window != NULL) offtab_maybe_write_window(offtab, 0, n_offsets); if (ISSET(flags, OFFTAB_CHECKPOINT_SYNC)) { __CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t))); const off_t sync_bytes = ((off_t)n_offsets * (off_t)sizeof(uint64_t)); __CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS, MAX_N_OFFSETS*sizeof(uint64_t))); assert(ADD_OK(off_t, offtab->ot_fdpos, sync_bytes)); if (fsync_range(offtab->ot_fd, (FFILESYNC | FDISKSYNC), offtab->ot_fdpos, (offtab->ot_fdpos + sync_bytes)) == -1) warn_ss("fsync of offset table failed"); } } /* * Do any I/O or bookkeeping necessary to set an offset for blkno. May * fail; returns on success, aborts with err(3) on failure. 
*/
void
offtab_prepare_put(struct offtab *offtab, uint32_t blkno)
{
	uint32_t i;

	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
	assert(blkno < offtab->ot_n_offsets);

	/*
	 * Assume, for convenience, that we write blocks in order.
	 * Thus we need not do another read -- we can just clear the
	 * window.
	 */
	assert((offtab->ot_blkno == (uint32_t)-1) ||
	    ((offtab->ot_blkno + 1) == blkno));

	/* If it's already in our window, we're good to go.  */
	if ((offtab->ot_window_start <= blkno) &&
	    (blkno < offtab_current_window_end(offtab)))
		goto win;

	/* Otherwise, write out the current window and choose a new one.  */
	offtab_write_window(offtab);

	/*
	 * Because blocks arrive in order, blkno must be the first
	 * index of the window immediately after the current one.
	 */
	assert(offtab->ot_window_size <= blkno);
	assert(offtab->ot_window_start == (blkno - offtab->ot_window_size));
	assert((offtab->ot_window_start + offtab->ot_window_size) ==
	    rounddown(blkno, offtab->ot_window_size));

    {
	uint64_t *window;
	sigset_t sigmask;

	/*
	 * Mark the window as being updated so nobody tries to write it
	 * (since we just wrote it) while we fill it with ones.
	 */
	block_signals(&sigmask);
	window = offtab->ot_window;
	offtab->ot_window = NULL;
	restore_sigmask(&sigmask);

	/*
	 * Fill the window with ones.  ot_window_start still names the
	 * old window here, so the count is the old window's size.
	 */
	for (i = 0; i < offtab_current_window_size(offtab); i++)
		window[i] = ~(uint64_t)0;

	/*
	 * Restore the window as ready again.  The buffer pointer and
	 * the new window start are published together with signals
	 * blocked.
	 */
	block_signals(&sigmask);
	offtab->ot_window = window;
	offtab->ot_window_start = rounddown(blkno, offtab->ot_window_size);
	restore_sigmask(&sigmask);
    }

win:	assert(offtab->ot_window_start <= blkno);
	assert(blkno < offtab_current_window_end(offtab));

	/* Remember which block was prepared; offtab_put checks it.  */
	offtab->ot_blkno = blkno;
}

/*
 * Actually set the offset for blkno.  The caller must have called
 * offtab_prepare_put for this blkno beforehand (asserted via
 * ot_blkno); only the in-memory window is updated here.
 */
void
offtab_put(struct offtab *offtab, uint32_t blkno, uint64_t offset)
{

	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
	assert(blkno == offtab->ot_blkno);
	assert(offtab->ot_window_start <= blkno);
	assert(blkno < offtab_current_window_end(offtab));

	/* Store converted with htobe64; offtab_get undoes it with be64toh. */
	offtab->ot_window[blkno - offtab->ot_window_start] = htobe64(offset);
}