First some background. A typical snapshot setup uses four devices: Visible device-mapper devices: + origin (ORIGIN) + snapshot (SNAP) Hidden devices: + original device (DEV) + copy-on-write device (COW) The device-mapper snapshot target constructs the virtual devices ORIGIN and SNAP out of the real devices DEV and COW. SNAP is an image of DEV at the instant the snapshot was created. When you write to each part of ORIGIN for the first time, a copy of the data you're about to change is first stashed away in COW so that SNAP can reconstruct it later. Now for the first race in the existing implementation. If you attempt to read from SNAP data that has not yet changed, the request gets passed through mapped to DEV. The snapshot code does not track it beyond that point so it has no way of knowing whether or not the I/O to DEV is complete nor even whether it has been submitted yet. If, subsequently, a write is issued to ORIGIN for the same blocks, the code does not know about the outstanding read and blithely goes ahead and copies the original data across to COW and then allows the write to change DEV. It does not wait for the read to complete first, so if the read from SNAP got delayed for any reason it could return changed data from DEV! Fortunately this is very rare because reading from DEV is simple and quick but writing to ORIGIN involves a series of slow steps. This patch addresses the problem by adding code to track reads to SNAP. The pending_exception structure (previously only used for ORIGIN and SNAP writes) is now also allocated for SNAP reads that get mapped to DEV. The sibling_count variable is renamed to ref_count and now keeps track of the number of outstanding reads against SNAP and the number of in chunks in the process of being copied to COW. Device-mapper offers us private per-bio storage in map_context->ptr, so we store the pending_exception structure there when mapping the bio so we can retrieve it in our end_io function. pending_complete() now checks for any outstanding SNAP reads. If it finds any, it no longer writes to DEV and no longer destroys the pending exception, but rather leaves this until all SNAP reads have been completed. It still updates the exception tables though so that all further I/O will see the real exception instead of the pending one. When the SNAP read is complete, snapshot_end_io will be called. If there are no more SNAP reads outstanding, and the chunk is not in the process of being copied to any COW device, then it triggers a flush of all queued ORIGIN writes and destroys the pending exception. The scope of the pe_lock spinlock is expanded to protect ref_count and calls to lookup_exception(), insert_exception() and remove_exception() for pending_exceptions. Index: linux-2.6.16.i686/drivers/md/dm-snap.c =================================================================== --- linux-2.6.16.i686.orig/drivers/md/dm-snap.c 2006-04-25 08:58:17.000000000 +0100 +++ linux-2.6.16.i686/drivers/md/dm-snap.c 2006-04-25 18:08:26.000000000 +0100 @@ -57,21 +57,22 @@ struct pending_exception { struct list_head list; /* - * The primary pending_exception is the one that holds - * the sibling_count and the list of origin_bios for a - * group of pending_exceptions. It is always last to get freed. + * The primary pending_exception is the one that holds the + * ref_count and the list of origin_bios for a group of + * pending_exceptions. It is always last to get freed. * These fields get set up when writing to the origin. */ struct pending_exception *primary_pe; /* - * Number of pending_exceptions processing this chunk. - * When this drops to zero we must complete the origin bios. + * Number of exception copies or snapshot reads pending on + * this chunk. When this drops to zero we must complete the + * origin bios. * If incrementing or decrementing this, hold pe->snap->lock for * the sibling concerned and not pe->primary_pe->snap->lock unless * they are the same. */ - atomic_t sibling_count; + atomic_t ref_count; /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -655,12 +656,47 @@ static void __invalidate_snapshot(struct dm_table_event(s->table); } +static inline void ref_pending_exception(struct pending_exception *pe) +{ + atomic_inc(&pe->ref_count); + if (pe->primary_pe && pe->primary_pe != pe) + atomic_inc(&pe->primary_pe->ref_count); +} + +static struct bio *unref_pending_exception(struct pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + struct pending_exception *primary_pe; + struct bio *origin_flush = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->pe_lock, flags); + + primary_pe = pe->primary_pe; + + if ((!primary_pe || primary_pe != pe) && + atomic_dec_and_test(&pe->ref_count)) { + remove_exception(&pe->e); + free_pending_exception(pe); + } + + if (primary_pe && atomic_dec_and_test(&primary_pe->ref_count)) { + origin_flush = bio_list_get(&primary_pe->origin_bios); + remove_exception(&primary_pe->e); + free_pending_exception(primary_pe); + } + + spin_unlock_irqrestore(&s->pe_lock, flags); + + return origin_flush; +} + static void pending_complete(struct pending_exception *pe, int success) { struct exception *e; - struct pending_exception *primary_pe; struct dm_snapshot *s = pe->snap; - struct bio *flush = NULL; + struct bio *snapshot_flush = NULL; + struct bio *origin_flush = NULL; int error = 0; if (!success) { @@ -692,43 +728,20 @@ static void pending_complete(struct pend * in-flight exception from the list. */ insert_exception(&s->complete, e); - remove_exception(&pe->e); out: - primary_pe = pe->primary_pe; + snapshot_flush = bio_list_get(&pe->snapshot_bios); - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. - */ - if (primary_pe && - atomic_dec_and_test(&primary_pe->sibling_count)) - flush = bio_list_get(&primary_pe->origin_bios); + origin_flush = unref_pending_exception(pe); up_write(&s->lock); - /* Submit any pending write bios */ if (!error) - flush_bios(bio_list_get(&pe->snapshot_bios)); + flush_bios(snapshot_flush); else - error_bios(bio_list_get(&pe->snapshot_bios)); - - /* - * Free the pe if it's not linked to an origin write or if - * it's not itself a primary pe. - */ - if (!primary_pe || primary_pe != pe) - free_pending_exception(pe); - - /* - * Free the primary pe if nothing references it. - */ - if (primary_pe && !atomic_read(&primary_pe->sibling_count)) - free_pending_exception(primary_pe); + error_bios(snapshot_flush); - if (flush) - flush_bios(flush); + flush_bios(origin_flush); } static void commit_callback(void *context, int success) @@ -794,6 +807,9 @@ __find_pending_exception(struct dm_snaps struct exception *e; struct pending_exception *pe; chunk_t chunk = sector_to_chunk(s, bio->bi_sector); + unsigned long flags; + + spin_lock_irqsave(&s->pe_lock, flags); /* * Is there a pending exception for this already ? @@ -802,45 +818,55 @@ __find_pending_exception(struct dm_snaps if (e) { /* cast the exception to a pending exception */ pe = container_of(e, struct pending_exception, e); - goto out; + goto ref_and_out; } /* * Create a new pending exception, we don't want * to hold the lock while we do this. */ + spin_unlock_irqrestore(&s->pe_lock, flags); up_write(&s->lock); pe = alloc_pending_exception(); down_write(&s->lock); + spin_lock_irqsave(&s->pe_lock, flags); if (!s->valid) { free_pending_exception(pe); - return NULL; + pe = NULL; + goto out; } e = lookup_exception(&s->pending, chunk); if (e) { free_pending_exception(pe); pe = container_of(e, struct pending_exception, e); - goto out; + goto ref_and_out; } pe->e.old_chunk = chunk; bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->primary_pe = NULL; - atomic_set(&pe->sibling_count, 1); + atomic_set(&pe->ref_count, 0); pe->snap = s; pe->started = 0; - if (s->store.prepare_exception(&s->store, &pe->e)) { - free_pending_exception(pe); - return NULL; - } - insert_exception(&s->pending, &pe->e); + ref_and_out: + if (bio_rw(bio) != WRITE) + ref_pending_exception(pe); + else if (!pe->started) { + if (s->store.prepare_exception(&s->store, &pe->e)) { + free_pending_exception(pe); + pe = NULL; + goto out; + } + ref_pending_exception(pe); + } out: + spin_unlock_irqrestore(&s->pe_lock, flags); return pe; } @@ -887,39 +913,28 @@ static int snapshot_map(struct dm_target goto out_unlock; } - /* - * Write to snapshot - higher level takes care of RW/RO - * flags so we should only get this if we are - * writeable. - */ - if (bio_rw(bio) == WRITE) { - pe = __find_pending_exception(s, bio); - if (!pe) { - __invalidate_snapshot(s, pe, -ENOMEM); - r = -EIO; - goto out_unlock; - } + pe = __find_pending_exception(s, bio); + if (!pe) { + __invalidate_snapshot(s, pe, -ENOMEM); + r = -EIO; + goto out_unlock; + } + if (bio_rw(bio) == WRITE) { remap_exception(s, &pe->e, bio); bio_list_add(&pe->snapshot_bios, bio); r = 0; if (!pe->started) { - /* this is protected by snap->lock */ pe->started = 1; up_write(&s->lock); start_copy(pe); goto out; } } else { - /* - * FIXME: this read path scares me because we - * always use the origin when we have a pending - * exception. However I can't think of a - * situation where this is wrong - ejt. - */ bio->bi_bdev = s->origin->bdev; + map_context->ptr = pe; } out_unlock: @@ -928,6 +943,38 @@ static int snapshot_map(struct dm_target return r; } +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + struct pending_exception *pe = + (struct pending_exception *) map_context->ptr; + struct bio *flush = NULL; + struct bio *next; + unsigned long flags; + + if (bio_rw(bio) == WRITE || pe == NULL) + return error; + + flush = unref_pending_exception(pe); + + if (flush) { + spin_lock_irqsave(&s->pe_lock, flags); + + while (flush) { + next = flush->bi_next; + bio_list_add(&s->queued_bios, flush); + flush = next; + } + + queue_work(ksnapd, &s->queued_bios_work); + + spin_unlock_irqrestore(&s->pe_lock, flags); + } + + return error; +} + static void snapshot_resume(struct dm_target *ti) { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; @@ -982,11 +1029,12 @@ static int snapshot_status(struct dm_tar *---------------------------------------------------------------*/ static int __origin_write(struct list_head *snapshots, struct bio *bio) { - int r = 1, first = 0; + int r = 1; struct dm_snapshot *snap; struct exception *e; struct pending_exception *pe, *next_pe, *primary_pe = NULL; chunk_t chunk; + unsigned long flags; LIST_HEAD(pe_queue); /* Do all the snapshots on this origin */ @@ -1013,8 +1061,6 @@ static int __origin_write(struct list_he * is already remapped in this snapshot * and trigger an exception if not. * - * sibling_count is initialised to 1 so pending_complete() - * won't destroy the primary_pe while we're inside this loop. */ e = lookup_exception(&snap->complete, chunk); if (e) @@ -1026,6 +1072,8 @@ static int __origin_write(struct list_he goto next_snapshot; } + spin_lock_irqsave(&snap->pe_lock, flags); + if (!primary_pe) { /* * Either every pe here has same @@ -1033,10 +1081,15 @@ static int __origin_write(struct list_he */ if (pe->primary_pe) primary_pe = pe->primary_pe; - else { + else primary_pe = pe; - first = 1; - } + + /* + * we take a ref here so primary_pe won't + * be destroyed while we're inside this + * loop. + */ + ref_pending_exception(primary_pe); bio_list_add(&primary_pe->origin_bios, bio); @@ -1044,10 +1097,14 @@ static int __origin_write(struct list_he } if (!pe->primary_pe) { - atomic_inc(&primary_pe->sibling_count); pe->primary_pe = primary_pe; + if (primary_pe != pe) + atomic_add(atomic_read(&pe->ref_count), + &primary_pe->ref_count); } + spin_unlock_irqrestore(&snap->pe_lock, flags); + if (!pe->started) { pe->started = 1; list_add_tail(&pe->list, &pe_queue); @@ -1058,21 +1115,14 @@ static int __origin_write(struct list_he } if (!primary_pe) - goto out; + return r; /* - * If this is the first time we're processing this chunk and - * sibling_count is now 1 it means all the pending exceptions + * If ref_count is now 1 it means all the pending exceptions * got completed while we were in the loop above, so it falls to * us here to remove the primary_pe and submit any origin_bios. */ - - if (first && atomic_dec_and_test(&primary_pe->sibling_count)) { - flush_bios(bio_list_get(&primary_pe->origin_bios)); - free_pending_exception(primary_pe); - /* If we got here, pe_queue is necessarily empty. */ - goto out; - } + flush_bios(unref_pending_exception(primary_pe)); /* * Now that we have a complete pe list we can start the copying. @@ -1080,7 +1130,6 @@ static int __origin_write(struct list_he list_for_each_entry_safe(pe, next_pe, &pe_queue, list) start_copy(pe); - out: return r; } @@ -1209,6 +1258,7 @@ static struct target_type snapshot_targe .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, + .end_io = snapshot_end_io, .resume = snapshot_resume, .status = snapshot_status, };