
1/*
2 * blkfront.c
3 *
4 * XenLinux virtual block device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license:
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35 * IN THE SOFTWARE.
36 */
37
38#include <linux/interrupt.h>
39#include <linux/blkdev.h>
40#include <linux/hdreg.h>
41#include <linux/cdrom.h>
42#include <linux/module.h>
43#include <linux/slab.h>
44#include <linux/mutex.h>
45#include <linux/scatterlist.h>
46#include <linux/bitmap.h>
47#include <linux/list.h>
48
49#include <xen/xen.h>
50#include <xen/xenbus.h>
51#include <xen/grant_table.h>
52#include <xen/events.h>
53#include <xen/page.h>
54#include <xen/platform_pci.h>
55
56#include <xen/interface/grant_table.h>
57#include <xen/interface/io/blkif.h>
58#include <xen/interface/io/protocols.h>
59
60#include <asm/xen/hypervisor.h>
61
62enum blkif_state {
63	BLKIF_STATE_DISCONNECTED,
64	BLKIF_STATE_CONNECTED,
65	BLKIF_STATE_SUSPENDED,
66};
67
68struct grant {
69	grant_ref_t gref;
70	unsigned long pfn;
71	struct list_head node;
72};
73
74struct blk_shadow {
75	struct blkif_request req;
76	struct request *request;
77	struct grant **grants_used;
78	struct grant **indirect_grants;
79	struct scatterlist *sg;
80};
81
82struct split_bio {
83	struct bio *bio;
84	atomic_t pending;
85	int err;
86};
87
88static DEFINE_MUTEX(blkfront_mutex);
89static const struct block_device_operations xlvbd_block_fops;
90
91/*
92 * Maximum number of segments in indirect requests. The actual value used by
93 * the frontend driver is the minimum of this value and the value provided
94 * by the backend driver.
95 */
96
97static unsigned int xen_blkif_max_segments = 32;
98module_param_named(max, xen_blkif_max_segments, uint, S_IRUGO);
99MODULE_PARM_DESC(max, "Maximum number of segments in indirect requests (default is 32)");
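/*
 * The limit can be changed at load time, e.g. "modprobe xen-blkfront max=64",
 * or via "xen_blkfront.max=64" on the kernel command line when built in
 * (module name assumed to be xen-blkfront here).
 */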
100
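/*
 * Number of request slots in the shared ring: __CONST_RING_SIZE rounds the
 * count of entries that fit in one PAGE_SIZE ring page down to a power of
 * two (typically 32 slots with 4 KiB pages).
 */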
101#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
102
103/*
104 * We have one of these per vbd, whether ide, scsi or 'other'.  They
105 * hang in private_data off the gendisk structure. We may end up
106 * putting all kinds of interesting stuff here :-)
107 */
108struct blkfront_info
109{
110	spinlock_t io_lock;
111	struct mutex mutex;
112	struct xenbus_device *xbdev;
113	struct gendisk *gd;
114	int vdevice;
115	blkif_vdev_t handle;
116	enum blkif_state connected;
117	int ring_ref;
118	struct blkif_front_ring ring;
119	unsigned int evtchn, irq;
120	struct request_queue *rq;
121	struct work_struct work;
122	struct gnttab_free_callback callback;
123	struct blk_shadow shadow[BLK_RING_SIZE];
124	struct list_head grants;
125	struct list_head indirect_pages;
126	unsigned int persistent_gnts_c;
127	unsigned long shadow_free;
128	unsigned int feature_flush;
129	unsigned int flush_op;
130	unsigned int feature_discard:1;
131	unsigned int feature_secdiscard:1;
132	unsigned int discard_granularity;
133	unsigned int discard_alignment;
134	unsigned int feature_persistent:1;
135	unsigned int max_indirect_segments;
136	int is_ready;
137};
138
139static unsigned int nr_minors;
140static unsigned long *minors;
141static DEFINE_SPINLOCK(minor_lock);
142
143#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
144	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
145#define GRANT_INVALID_REF	0
146
147#define PARTS_PER_DISK		16
148#define PARTS_PER_EXT_DISK      256
149
150#define BLKIF_MAJOR(dev) ((dev)>>8)
151#define BLKIF_MINOR(dev) ((dev) & 0xff)
152
153#define EXT_SHIFT 28
154#define EXTENDED (1<<EXT_SHIFT)
155#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
156#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
157#define EMULATED_HD_DISK_MINOR_OFFSET (0)
158#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
159#define EMULATED_SD_DISK_MINOR_OFFSET (0)
160#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
161
162#define DEV_NAME	"xvd"	/* name in /dev */
163
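/*
 * Indirect descriptors: each indirect frame is a page full of
 * blkif_request_segment entries, so SEGS_PER_INDIRECT_FRAME is the number of
 * data segments one indirect grant page can describe, and INDIRECT_GREFS()
 * is how many indirect grant pages a request of _segs segments needs.
 */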
164#define SEGS_PER_INDIRECT_FRAME \
165	(PAGE_SIZE/sizeof(struct blkif_request_segment))
166#define INDIRECT_GREFS(_segs) \
167	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
168
169static int blkfront_setup_indirect(struct blkfront_info *info);
170
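/*
 * The shadow free list is threaded through the unused shadow entries
 * themselves: each free entry's req.u.rw.id holds the index of the next
 * free entry, and info->shadow_free points at the head of the chain.
 */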
171static int get_id_from_freelist(struct blkfront_info *info)
172{
173	unsigned long free = info->shadow_free;
174	BUG_ON(free >= BLK_RING_SIZE);
175	info->shadow_free = info->shadow[free].req.u.rw.id;
176	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
177	return free;
178}
179
180static int add_id_to_freelist(struct blkfront_info *info,
181			       unsigned long id)
182{
183	if (info->shadow[id].req.u.rw.id != id)
184		return -EINVAL;
185	if (info->shadow[id].request == NULL)
186		return -EINVAL;
187	info->shadow[id].req.u.rw.id  = info->shadow_free;
188	info->shadow[id].request = NULL;
189	info->shadow_free = id;
190	return 0;
191}
192
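/*
 * Pre-allocate 'num' struct grant entries (plus a backing page each when
 * persistent grants are used) and queue them on info->grants, so the I/O
 * path never has to allocate memory while building a request.
 */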
193static int fill_grant_buffer(struct blkfront_info *info, int num)
194{
195	struct page *granted_page;
196	struct grant *gnt_list_entry, *n;
197	int i = 0;
198
199	while (i < num) {
200		gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
201		if (!gnt_list_entry)
202			goto out_of_memory;
203
204		if (info->feature_persistent) {
205			granted_page = alloc_page(GFP_NOIO);
206			if (!granted_page) {
207				kfree(gnt_list_entry);
208				goto out_of_memory;
209			}
210			gnt_list_entry->pfn = page_to_pfn(granted_page);
211		}
212
213		gnt_list_entry->gref = GRANT_INVALID_REF;
214		list_add(&gnt_list_entry->node, &info->grants);
215		i++;
216	}
217
218	return 0;
219
220out_of_memory:
221	list_for_each_entry_safe(gnt_list_entry, n,
222	                         &info->grants, node) {
223		list_del(&gnt_list_entry->node);
224		if (info->feature_persistent)
225			__free_page(pfn_to_page(gnt_list_entry->pfn));
226		kfree(gnt_list_entry);
227		i--;
228	}
229	BUG_ON(i != 0);
230	return -ENOMEM;
231}
232
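/*
 * Take the first grant from info->grants.  If it still holds a valid gref it
 * is a persistent grant being reused and needs no new mapping; otherwise
 * claim a fresh grant reference and grant the backend access to the page.
 */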
233static struct grant *get_grant(grant_ref_t *gref_head,
234                               unsigned long pfn,
235                               struct blkfront_info *info)
236{
237	struct grant *gnt_list_entry;
238	unsigned long buffer_mfn;
239
240	BUG_ON(list_empty(&info->grants));
241	gnt_list_entry = list_first_entry(&info->grants, struct grant,
242	                                  node);
243	list_del(&gnt_list_entry->node);
244
245	if (gnt_list_entry->gref != GRANT_INVALID_REF) {
246		info->persistent_gnts_c--;
247		return gnt_list_entry;
248	}
249
250	/* Assign a gref to this page */
251	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
252	BUG_ON(gnt_list_entry->gref == -ENOSPC);
253	if (!info->feature_persistent) {
254		BUG_ON(!pfn);
255		gnt_list_entry->pfn = pfn;
256	}
257	buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
258	gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
259	                                info->xbdev->otherend_id,
260	                                buffer_mfn, 0);
261	return gnt_list_entry;
262}
263
264static const char *op_name(int op)
265{
266	static const char *const names[] = {
267		[BLKIF_OP_READ] = "read",
268		[BLKIF_OP_WRITE] = "write",
269		[BLKIF_OP_WRITE_BARRIER] = "barrier",
270		[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
271		[BLKIF_OP_DISCARD] = "discard" };
272
273	if (op < 0 || op >= ARRAY_SIZE(names))
274		return "unknown";
275
276	if (!names[op])
277		return "reserved";
278
279	return names[op];
280}
281static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
282{
283	unsigned int end = minor + nr;
284	int rc;
285
286	if (end > nr_minors) {
287		unsigned long *bitmap, *old;
288
289		bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
290				 GFP_KERNEL);
291		if (bitmap == NULL)
292			return -ENOMEM;
293
294		spin_lock(&minor_lock);
295		if (end > nr_minors) {
296			old = minors;
297			memcpy(bitmap, minors,
298			       BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
299			minors = bitmap;
300			nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
301		} else
302			old = bitmap;
303		spin_unlock(&minor_lock);
304		kfree(old);
305	}
306
307	spin_lock(&minor_lock);
308	if (find_next_bit(minors, end, minor) >= end) {
309		bitmap_set(minors, minor, nr);
310		rc = 0;
311	} else
312		rc = -EBUSY;
313	spin_unlock(&minor_lock);
314
315	return rc;
316}
317
318static void xlbd_release_minors(unsigned int minor, unsigned int nr)
319{
320	unsigned int end = minor + nr;
321
322	BUG_ON(end > nr_minors);
323	spin_lock(&minor_lock);
324	bitmap_clear(minors,  minor, nr);
325	spin_unlock(&minor_lock);
326}
327
328static void blkif_restart_queue_callback(void *arg)
329{
330	struct blkfront_info *info = (struct blkfront_info *)arg;
331	schedule_work(&info->work);
332}
333
334static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
335{
336	/* We don't have real geometry info, but let's at least return
337	   values consistent with the size of the device */
338	sector_t nsect = get_capacity(bd->bd_disk);
339	sector_t cylinders = nsect;
340
341	hg->heads = 0xff;
342	hg->sectors = 0x3f;
343	sector_div(cylinders, hg->heads * hg->sectors);
344	hg->cylinders = cylinders;
345	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
346		hg->cylinders = 0xffff;
347	return 0;
348}
349
350static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
351		       unsigned command, unsigned long argument)
352{
353	struct blkfront_info *info = bdev->bd_disk->private_data;
354	int i;
355
356	dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
357		command, (long)argument);
358
359	switch (command) {
360	case CDROMMULTISESSION:
361		dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
362		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
363			if (put_user(0, (char __user *)(argument + i)))
364				return -EFAULT;
365		return 0;
366
367	case CDROM_GET_CAPABILITY: {
368		struct gendisk *gd = info->gd;
369		if (gd->flags & GENHD_FL_CD)
370			return 0;
371		return -EINVAL;
372	}
373
374	default:
375		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
376		  command);*/
377		return -EINVAL; /* same return as native Linux */
378	}
379
380	return 0;
381}
382
383/*
384 * Generate a Xen blkfront IO request from a blk layer request.  Reads
385 * and writes are handled as expected.
386 *
387 * @req: a request struct
388 */
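/*
 * Depending on the request and the negotiated features this builds one of
 * three ring layouts: a discard request, a plain read/write carrying up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST inline segments, or an indirect read/write
 * whose segment lists live in extra granted pages.
 */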
389static int blkif_queue_request(struct request *req)
390{
391	struct blkfront_info *info = req->rq_disk->private_data;
392	struct blkif_request *ring_req;
393	unsigned long id;
394	unsigned int fsect, lsect;
395	int i, ref, n;
396	struct blkif_request_segment *segments = NULL;
397
398	/*
399	 * Used to record whether we are able to queue the request by just using
400	 * existing persistent grants, or if we have to get new grants,
401	 * as there are not sufficiently many free.
402	 */
403	bool new_persistent_gnts;
404	grant_ref_t gref_head;
405	struct grant *gnt_list_entry = NULL;
406	struct scatterlist *sg;
407	int nseg, max_grefs;
408
409	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
410		return 1;
411
412	max_grefs = req->nr_phys_segments;
413	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
414		/*
415		 * If we are using indirect segments we need to account
416		 * for the indirect grefs used in the request.
417		 */
418		max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
419
420	/* Check if we have enough grants to allocate a request */
421	if (info->persistent_gnts_c < max_grefs) {
422		new_persistent_gnts = 1;
423		if (gnttab_alloc_grant_references(
424		    max_grefs - info->persistent_gnts_c,
425		    &gref_head) < 0) {
426			gnttab_request_free_callback(
427				&info->callback,
428				blkif_restart_queue_callback,
429				info,
430				max_grefs);
431			return 1;
432		}
433	} else
434		new_persistent_gnts = 0;
435
436	/* Fill out a communications ring structure. */
437	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
438	id = get_id_from_freelist(info);
439	info->shadow[id].request = req;
440
441	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
442		ring_req->operation = BLKIF_OP_DISCARD;
443		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
444		ring_req->u.discard.id = id;
445		ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
446		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
447			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
448		else
449			ring_req->u.discard.flag = 0;
450	} else {
451		BUG_ON(info->max_indirect_segments == 0 &&
452		       req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
453		BUG_ON(info->max_indirect_segments &&
454		       req->nr_phys_segments > info->max_indirect_segments);
455		nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
456		ring_req->u.rw.id = id;
457		if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
458			/*
459			 * The indirect operation can only be a BLKIF_OP_READ or
460			 * BLKIF_OP_WRITE
461			 */
462			BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
463			ring_req->operation = BLKIF_OP_INDIRECT;
464			ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
465				BLKIF_OP_WRITE : BLKIF_OP_READ;
466			ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
467			ring_req->u.indirect.handle = info->handle;
468			ring_req->u.indirect.nr_segments = nseg;
469		} else {
470			ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
471			ring_req->u.rw.handle = info->handle;
472			ring_req->operation = rq_data_dir(req) ?
473				BLKIF_OP_WRITE : BLKIF_OP_READ;
474			if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
475				/*
476				 * Ideally we can do an unordered flush-to-disk. In case the
477				 * backend only supports barriers, use that. A barrier request
478				 * is a superset of FUA, so we can implement it the same
479				 * way.  (It's also a FLUSH+FUA, since it is
480				 * guaranteed ordered WRT previous writes.)
481				 */
482				ring_req->operation = info->flush_op;
483			}
484			ring_req->u.rw.nr_segments = nseg;
485		}
486		for_each_sg(info->shadow[id].sg, sg, nseg, i) {
487			fsect = sg->offset >> 9;
488			lsect = fsect + (sg->length >> 9) - 1;
489
490			if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
491			    (i % SEGS_PER_INDIRECT_FRAME == 0)) {
492				unsigned long uninitialized_var(pfn);
493
494				if (segments)
495					kunmap_atomic(segments);
496
497				n = i / SEGS_PER_INDIRECT_FRAME;
498				if (!info->feature_persistent) {
499					struct page *indirect_page;
500
501					/* Fetch a pre-allocated page to use for indirect grefs */
502					BUG_ON(list_empty(&info->indirect_pages));
503					indirect_page = list_first_entry(&info->indirect_pages,
504					                                 struct page, lru);
505					list_del(&indirect_page->lru);
506					pfn = page_to_pfn(indirect_page);
507				}
508				gnt_list_entry = get_grant(&gref_head, pfn, info);
509				info->shadow[id].indirect_grants[n] = gnt_list_entry;
510				segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
511				ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
512			}
513
514			gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info);
515			ref = gnt_list_entry->gref;
516
517			info->shadow[id].grants_used[i] = gnt_list_entry;
518
519			if (rq_data_dir(req) && info->feature_persistent) {
520				char *bvec_data;
521				void *shared_data;
522
523				BUG_ON(sg->offset + sg->length > PAGE_SIZE);
524
525				shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
526				bvec_data = kmap_atomic(sg_page(sg));
527
528				/*
529				 * this does not wipe data stored outside the
530				 * range sg->offset..sg->offset+sg->length.
531				 * Therefore, blkback *could* see data from
532				 * previous requests. This is OK as long as
533				 * persistent grants are shared with just one
534				 * domain. It may need refactoring if this
535				 * changes
536				 */
537				memcpy(shared_data + sg->offset,
538				       bvec_data   + sg->offset,
539				       sg->length);
540
541				kunmap_atomic(bvec_data);
542				kunmap_atomic(shared_data);
543			}
544			if (ring_req->operation != BLKIF_OP_INDIRECT) {
545				ring_req->u.rw.seg[i] =
546						(struct blkif_request_segment) {
547							.gref       = ref,
548							.first_sect = fsect,
549							.last_sect  = lsect };
550			} else {
551				n = i % SEGS_PER_INDIRECT_FRAME;
552				segments[n] =
553					(struct blkif_request_segment) {
554							.gref       = ref,
555							.first_sect = fsect,
556							.last_sect  = lsect };
557			}
558		}
559		if (segments)
560			kunmap_atomic(segments);
561	}
562
563	info->ring.req_prod_pvt++;
564
565	/* Keep a private copy so we can reissue requests when recovering. */
566	info->shadow[id].req = *ring_req;
567
568	if (new_persistent_gnts)
569		gnttab_free_grant_references(gref_head);
570
571	return 0;
572}
573
574
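/*
 * Push the privately queued requests onto the shared ring and, if the
 * backend requested notification, kick it over the event channel.
 */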
575static inline void flush_requests(struct blkfront_info *info)
576{
577	int notify;
578
579	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
580
581	if (notify)
582		notify_remote_via_irq(info->irq);
583}
584
585static inline bool blkif_request_flush_invalid(struct request *req,
586					       struct blkfront_info *info)
587{
588	return ((req->cmd_type != REQ_TYPE_FS) ||
589		((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
590		!info->flush_op));
591}
592
593/*
594 * do_blkif_request
595 *  read or write a block; request is in a request queue
596 */
597static void do_blkif_request(struct request_queue *rq)
598{
599	struct blkfront_info *info = NULL;
600	struct request *req;
601	int queued;
602
603	pr_debug("Entered do_blkif_request\n");
604
605	queued = 0;
606
607	while ((req = blk_peek_request(rq)) != NULL) {
608		info = req->rq_disk->private_data;
609
610		if (RING_FULL(&info->ring))
611			goto wait;
612
613		blk_start_request(req);
614
615		if (blkif_request_flush_invalid(req, info)) {
616			__blk_end_request_all(req, -EIO);
617			continue;
618		}
619
620		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
621			 "(%u/%u) [%s]\n",
622			 req, req->cmd, (unsigned long)blk_rq_pos(req),
623			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
624			 rq_data_dir(req) ? "write" : "read");
625
626		if (blkif_queue_request(req)) {
627			blk_requeue_request(rq, req);
628wait:
629			/* Avoid pointless unplugs. */
630			blk_stop_queue(rq);
631			break;
632		}
633
634		queued++;
635	}
636
637	if (queued != 0)
638		flush_requests(info);
639}
640
641static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
642				unsigned int physical_sector_size,
643				unsigned int segments)
644{
645	struct request_queue *rq;
646	struct blkfront_info *info = gd->private_data;
647
648	rq = blk_init_queue(do_blkif_request, &info->io_lock);
649	if (rq == NULL)
650		return -1;
651
652	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
653
654	if (info->feature_discard) {
655		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
656		blk_queue_max_discard_sectors(rq, get_capacity(gd));
657		rq->limits.discard_granularity = info->discard_granularity;
658		rq->limits.discard_alignment = info->discard_alignment;
659		if (info->feature_secdiscard)
660			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
661	}
662
663	/* Hard sector size and max sectors impersonate the equiv. hardware. */
664	blk_queue_logical_block_size(rq, sector_size);
665	blk_queue_physical_block_size(rq, physical_sector_size);
666	blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
667
668	/* Each segment in a request is up to an aligned page in size. */
669	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
670	blk_queue_max_segment_size(rq, PAGE_SIZE);
671
672	/* Ensure a merged request will fit in a single I/O ring slot. */
673	blk_queue_max_segments(rq, segments);
674
675	/* Make sure buffer addresses are sector-aligned. */
676	blk_queue_dma_alignment(rq, 511);
677
678	/* Make sure we don't use bounce buffers. */
679	blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
680
681	gd->queue = rq;
682
683	return 0;
684}
685
686
687static void xlvbd_flush(struct blkfront_info *info)
688{
689	blk_queue_flush(info->rq, info->feature_flush);
690	printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
691	       info->gd->disk_name,
692	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
693		"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
694		"flush diskcache" : "barrier or flush"),
695	       info->feature_flush ? "enabled;" : "disabled;",
696	       "persistent grants:",
697	       info->feature_persistent ? "enabled;" : "disabled;",
698	       "indirect descriptors:",
699	       info->max_indirect_segments ? "enabled;" : "disabled;");
700}
701
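/*
 * Translate a traditional (non-extended) virtual-device number, which
 * encodes an emulated IDE or SCSI major/minor, into the xvd minor space and
 * the index used to build the "xvd<letter>" disk name.
 */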
702static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
703{
704	int major;
705	major = BLKIF_MAJOR(vdevice);
706	*minor = BLKIF_MINOR(vdevice);
707	switch (major) {
708		case XEN_IDE0_MAJOR:
709			*offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
710			*minor = ((*minor / 64) * PARTS_PER_DISK) +
711				EMULATED_HD_DISK_MINOR_OFFSET;
712			break;
713		case XEN_IDE1_MAJOR:
714			*offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
715			*minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
716				EMULATED_HD_DISK_MINOR_OFFSET;
717			break;
718		case XEN_SCSI_DISK0_MAJOR:
719			*offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
720			*minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
721			break;
722		case XEN_SCSI_DISK1_MAJOR:
723		case XEN_SCSI_DISK2_MAJOR:
724		case XEN_SCSI_DISK3_MAJOR:
725		case XEN_SCSI_DISK4_MAJOR:
726		case XEN_SCSI_DISK5_MAJOR:
727		case XEN_SCSI_DISK6_MAJOR:
728		case XEN_SCSI_DISK7_MAJOR:
729			*offset = (*minor / PARTS_PER_DISK) +
730				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
731				EMULATED_SD_DISK_NAME_OFFSET;
732			*minor = *minor +
733				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
734				EMULATED_SD_DISK_MINOR_OFFSET;
735			break;
736		case XEN_SCSI_DISK8_MAJOR:
737		case XEN_SCSI_DISK9_MAJOR:
738		case XEN_SCSI_DISK10_MAJOR:
739		case XEN_SCSI_DISK11_MAJOR:
740		case XEN_SCSI_DISK12_MAJOR:
741		case XEN_SCSI_DISK13_MAJOR:
742		case XEN_SCSI_DISK14_MAJOR:
743		case XEN_SCSI_DISK15_MAJOR:
744			*offset = (*minor / PARTS_PER_DISK) +
745				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
746				EMULATED_SD_DISK_NAME_OFFSET;
747			*minor = *minor +
748				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
749				EMULATED_SD_DISK_MINOR_OFFSET;
750			break;
751		case XENVBD_MAJOR:
752			*offset = *minor / PARTS_PER_DISK;
753			break;
754		default:
755			printk(KERN_WARNING "blkfront: your disk configuration is "
756					"incorrect, please use an xvd device instead\n");
757			return -ENODEV;
758	}
759	return 0;
760}
761
762static char *encode_disk_name(char *ptr, unsigned int n)
763{
764	if (n >= 26)
765		ptr = encode_disk_name(ptr, n / 26 - 1);
766	*ptr = 'a' + n % 26;
767	return ptr + 1;
768}
769
770static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
771			       struct blkfront_info *info,
772			       u16 vdisk_info, u16 sector_size,
773			       unsigned int physical_sector_size)
774{
775	struct gendisk *gd;
776	int nr_minors = 1;
777	int err;
778	unsigned int offset;
779	int minor;
780	int nr_parts;
781	char *ptr;
782
783	BUG_ON(info->gd != NULL);
784	BUG_ON(info->rq != NULL);
785
786	if ((info->vdevice>>EXT_SHIFT) > 1) {
787		/* this is above the extended range; something is wrong */
788		printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
789		return -ENODEV;
790	}
791
792	if (!VDEV_IS_EXTENDED(info->vdevice)) {
793		err = xen_translate_vdev(info->vdevice, &minor, &offset);
794		if (err)
795			return err;
796		nr_parts = PARTS_PER_DISK;
797	} else {
798		minor = BLKIF_MINOR_EXT(info->vdevice);
799		nr_parts = PARTS_PER_EXT_DISK;
800		offset = minor / nr_parts;
801		if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
802			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
803					"emulated IDE disks,\n\t choose an xvd device name"
804					" from xvde on\n", info->vdevice);
805	}
806	if (minor >> MINORBITS) {
807		pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
808			info->vdevice, minor);
809		return -ENODEV;
810	}
811
812	if ((minor % nr_parts) == 0)
813		nr_minors = nr_parts;
814
815	err = xlbd_reserve_minors(minor, nr_minors);
816	if (err)
817		goto out;
818	err = -ENODEV;
819
820	gd = alloc_disk(nr_minors);
821	if (gd == NULL)
822		goto release;
823
824	strcpy(gd->disk_name, DEV_NAME);
825	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
826	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
827	if (nr_minors > 1)
828		*ptr = 0;
829	else
830		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
831			 "%d", minor & (nr_parts - 1));
832
833	gd->major = XENVBD_MAJOR;
834	gd->first_minor = minor;
835	gd->fops = &xlvbd_block_fops;
836	gd->private_data = info;
837	gd->driverfs_dev = &(info->xbdev->dev);
838	set_capacity(gd, capacity);
839
840	if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
841				 info->max_indirect_segments ? :
842				 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
843		del_gendisk(gd);
844		goto release;
845	}
846
847	info->rq = gd->queue;
848	info->gd = gd;
849
850	xlvbd_flush(info);
851
852	if (vdisk_info & VDISK_READONLY)
853		set_disk_ro(gd, 1);
854
855	if (vdisk_info & VDISK_REMOVABLE)
856		gd->flags |= GENHD_FL_REMOVABLE;
857
858	if (vdisk_info & VDISK_CDROM)
859		gd->flags |= GENHD_FL_CD;
860
861	return 0;
862
863 release:
864	xlbd_release_minors(minor, nr_minors);
865 out:
866	return err;
867}
868
869static void xlvbd_release_gendisk(struct blkfront_info *info)
870{
871	unsigned int minor, nr_minors;
872	unsigned long flags;
873
874	if (info->rq == NULL)
875		return;
876
877	spin_lock_irqsave(&info->io_lock, flags);
878
879	/* No more blkif_request(). */
880	blk_stop_queue(info->rq);
881
882	/* No more gnttab callback work. */
883	gnttab_cancel_free_callback(&info->callback);
884	spin_unlock_irqrestore(&info->io_lock, flags);
885
886	/* Flush gnttab callback work. Must be done with no locks held. */
887	flush_work(&info->work);
888
889	del_gendisk(info->gd);
890
891	minor = info->gd->first_minor;
892	nr_minors = info->gd->minors;
893	xlbd_release_minors(minor, nr_minors);
894
895	blk_cleanup_queue(info->rq);
896	info->rq = NULL;
897
898	put_disk(info->gd);
899	info->gd = NULL;
900}
901
902static void kick_pending_request_queues(struct blkfront_info *info)
903{
904	if (!RING_FULL(&info->ring)) {
905		/* Re-enable calldowns. */
906		blk_start_queue(info->rq);
907		/* Kick things off immediately. */
908		do_blkif_request(info->rq);
909	}
910}
911
912static void blkif_restart_queue(struct work_struct *work)
913{
914	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
915
916	spin_lock_irq(&info->io_lock);
917	if (info->connected == BLKIF_STATE_CONNECTED)
918		kick_pending_request_queues(info);
919	spin_unlock_irq(&info->io_lock);
920}
921
922static void blkif_free(struct blkfront_info *info, int suspend)
923{
924	struct grant *persistent_gnt;
925	struct grant *n;
926	int i, j, segs;
927
928	/* Prevent new requests being issued until we fix things up. */
929	spin_lock_irq(&info->io_lock);
930	info->connected = suspend ?
931		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
932	/* No more blkif_request(). */
933	if (info->rq)
934		blk_stop_queue(info->rq);
935
936	/* Remove all persistent grants */
937	if (!list_empty(&info->grants)) {
938		list_for_each_entry_safe(persistent_gnt, n,
939		                         &info->grants, node) {
940			list_del(&persistent_gnt->node);
941			if (persistent_gnt->gref != GRANT_INVALID_REF) {
942				gnttab_end_foreign_access(persistent_gnt->gref,
943				                          0, 0UL);
944				info->persistent_gnts_c--;
945			}
946			if (info->feature_persistent)
947				__free_page(pfn_to_page(persistent_gnt->pfn));
948			kfree(persistent_gnt);
949		}
950	}
951	BUG_ON(info->persistent_gnts_c != 0);
952
953	/*
954	 * Remove indirect pages; this only happens when using indirect
955	 * descriptors but not persistent grants
956	 */
957	if (!list_empty(&info->indirect_pages)) {
958		struct page *indirect_page, *n;
959
960		BUG_ON(info->feature_persistent);
961		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
962			list_del(&indirect_page->lru);
963			__free_page(indirect_page);
964		}
965	}
966
967	for (i = 0; i < BLK_RING_SIZE; i++) {
968		/*
969		 * Clear persistent grants present in requests already
970		 * on the shared ring
971		 */
972		if (!info->shadow[i].request)
973			goto free_shadow;
974
975		segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
976		       info->shadow[i].req.u.indirect.nr_segments :
977		       info->shadow[i].req.u.rw.nr_segments;
978		for (j = 0; j < segs; j++) {
979			persistent_gnt = info->shadow[i].grants_used[j];
980			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
981			if (info->feature_persistent)
982				__free_page(pfn_to_page(persistent_gnt->pfn));
983			kfree(persistent_gnt);
984		}
985
986		if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
987			/*
988			 * If this is not an indirect operation don't try to
989			 * free indirect segments
990			 */
991			goto free_shadow;
992
993		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
994			persistent_gnt = info->shadow[i].indirect_grants[j];
995			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
996			__free_page(pfn_to_page(persistent_gnt->pfn));
997			kfree(persistent_gnt);
998		}
999
1000free_shadow:
1001		kfree(info->shadow[i].grants_used);
1002		info->shadow[i].grants_used = NULL;
1003		kfree(info->shadow[i].indirect_grants);
1004		info->shadow[i].indirect_grants = NULL;
1005		kfree(info->shadow[i].sg);
1006		info->shadow[i].sg = NULL;
1007	}
1008
1009	/* No more gnttab callback work. */
1010	gnttab_cancel_free_callback(&info->callback);
1011	spin_unlock_irq(&info->io_lock);
1012
1013	/* Flush gnttab callback work. Must be done with no locks held. */
1014	flush_work(&info->work);
1015
1016	/* Free resources associated with old device channel. */
1017	if (info->ring_ref != GRANT_INVALID_REF) {
1018		gnttab_end_foreign_access(info->ring_ref, 0,
1019					  (unsigned long)info->ring.sring);
1020		info->ring_ref = GRANT_INVALID_REF;
1021		info->ring.sring = NULL;
1022	}
1023	if (info->irq)
1024		unbind_from_irqhandler(info->irq, info);
1025	info->evtchn = info->irq = 0;
1026
1027}
1028
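/*
 * Post-process a completed request: for reads using persistent grants, copy
 * the data from the shared pages back into the bio pages, then recycle every
 * grant the request used - grants the backend kept mapped go back to the
 * head of info->grants for reuse, unmapped ones are released and added to
 * the tail.
 */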
1029static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1030			     struct blkif_response *bret)
1031{
1032	int i = 0;
1033	struct scatterlist *sg;
1034	char *bvec_data;
1035	void *shared_data;
1036	int nseg;
1037
1038	nseg = s->req.operation == BLKIF_OP_INDIRECT ?
1039		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
1040
1041	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
1042		/*
1043		 * Copy the data received from the backend into the bvec.
1044		 * Since bv_offset can be different than 0, and bv_len different
1045		 * than PAGE_SIZE, we have to keep track of the current offset,
1046		 * to be sure we are copying the data from the right shared page.
1047		 */
1048		for_each_sg(s->sg, sg, nseg, i) {
1049			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
1050			shared_data = kmap_atomic(
1051				pfn_to_page(s->grants_used[i]->pfn));
1052			bvec_data = kmap_atomic(sg_page(sg));
1053			memcpy(bvec_data   + sg->offset,
1054			       shared_data + sg->offset,
1055			       sg->length);
1056			kunmap_atomic(bvec_data);
1057			kunmap_atomic(shared_data);
1058		}
1059	}
1060	/* Add the persistent grant into the list of free grants */
1061	for (i = 0; i < nseg; i++) {
1062		if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
1063			/*
1064			 * If the grant is still mapped by the backend (the
1065			 * backend has chosen to make this grant persistent)
1066			 * we add it at the head of the list, so it will be
1067			 * reused first.
1068			 */
1069			if (!info->feature_persistent)
1070				pr_alert_ratelimited("backend has not unmapped grant: %u\n",
1071						     s->grants_used[i]->gref);
1072			list_add(&s->grants_used[i]->node, &info->grants);
1073			info->persistent_gnts_c++;
1074		} else {
1075			/*
1076			 * If the grant is not mapped by the backend we end the
1077			 * foreign access and add it to the tail of the list,
1078			 * so it will not be picked again unless we run out of
1079			 * persistent grants.
1080			 */
1081			gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
1082			s->grants_used[i]->gref = GRANT_INVALID_REF;
1083			list_add_tail(&s->grants_used[i]->node, &info->grants);
1084		}
1085	}
1086	if (s->req.operation == BLKIF_OP_INDIRECT) {
1087		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
1088			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
1089				if (!info->feature_persistent)
1090					pr_alert_ratelimited("backend has not unmapped grant: %u\n",
1091							     s->indirect_grants[i]->gref);
1092				list_add(&s->indirect_grants[i]->node, &info->grants);
1093				info->persistent_gnts_c++;
1094			} else {
1095				struct page *indirect_page;
1096
1097				gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
1098				/*
1099				 * Add the used indirect page back to the list of
1100				 * available pages for indirect grefs.
1101				 */
1102				indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
1103				list_add(&indirect_page->lru, &info->indirect_pages);
1104				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
1105				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
1106			}
1107		}
1108	}
1109}
1110
1111static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1112{
1113	struct request *req;
1114	struct blkif_response *bret;
1115	RING_IDX i, rp;
1116	unsigned long flags;
1117	struct blkfront_info *info = (struct blkfront_info *)dev_id;
1118	int error;
1119
1120	spin_lock_irqsave(&info->io_lock, flags);
1121
1122	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1123		spin_unlock_irqrestore(&info->io_lock, flags);
1124		return IRQ_HANDLED;
1125	}
1126
1127 again:
1128	rp = info->ring.sring->rsp_prod;
1129	rmb(); /* Ensure we see queued responses up to 'rp'. */
1130
1131	for (i = info->ring.rsp_cons; i != rp; i++) {
1132		unsigned long id;
1133
1134		bret = RING_GET_RESPONSE(&info->ring, i);
1135		id   = bret->id;
1136		/*
1137		 * The backend has messed up and given us an id that we would
1138		 * never have given to it (we stamp it up to BLK_RING_SIZE -
1139		 * look in get_id_from_freelist).
1140		 */
1141		if (id >= BLK_RING_SIZE) {
1142			WARN(1, "%s: response to %s has incorrect id (%ld)\n",
1143			     info->gd->disk_name, op_name(bret->operation), id);
1144			/* We can't safely get the 'struct request' as
1145			 * the id is busted. */
1146			continue;
1147		}
1148		req  = info->shadow[id].request;
1149
1150		if (bret->operation != BLKIF_OP_DISCARD)
1151			blkif_completion(&info->shadow[id], info, bret);
1152
1153		if (add_id_to_freelist(info, id)) {
1154			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
1155			     info->gd->disk_name, op_name(bret->operation), id);
1156			continue;
1157		}
1158
1159		error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
1160		switch (bret->operation) {
1161		case BLKIF_OP_DISCARD:
1162			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
1163				struct request_queue *rq = info->rq;
1164				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
1165					   info->gd->disk_name, op_name(bret->operation));
1166				error = -EOPNOTSUPP;
1167				info->feature_discard = 0;
1168				info->feature_secdiscard = 0;
1169				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
1170				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
1171			}
1172			__blk_end_request_all(req, error);
1173			break;
1174		case BLKIF_OP_FLUSH_DISKCACHE:
1175		case BLKIF_OP_WRITE_BARRIER:
1176			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
1177				printk(KERN_WARNING "blkfront: %s: %s op failed\n",
1178				       info->gd->disk_name, op_name(bret->operation));
1179				error = -EOPNOTSUPP;
1180			}
1181			if (unlikely(bret->status == BLKIF_RSP_ERROR &&
1182				     info->shadow[id].req.u.rw.nr_segments == 0)) {
1183				printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
1184				       info->gd->disk_name, op_name(bret->operation));
1185				error = -EOPNOTSUPP;
1186			}
1187			if (unlikely(error)) {
1188				if (error == -EOPNOTSUPP)
1189					error = 0;
1190				info->feature_flush = 0;
1191				info->flush_op = 0;
1192				xlvbd_flush(info);
1193			}
1194			/* fall through */
1195		case BLKIF_OP_READ:
1196		case BLKIF_OP_WRITE:
1197			if (unlikely(bret->status != BLKIF_RSP_OKAY))
1198				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
1199					"request: %x\n", bret->status);
1200
1201			__blk_end_request_all(req, error);
1202			break;
1203		default:
1204			BUG();
1205		}
1206	}
1207
1208	info->ring.rsp_cons = i;
1209
1210	if (i != info->ring.req_prod_pvt) {
1211		int more_to_do;
1212		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
1213		if (more_to_do)
1214			goto again;
1215	} else
1216		info->ring.sring->rsp_event = i + 1;
1217
1218	kick_pending_request_queues(info);
1219
1220	spin_unlock_irqrestore(&info->io_lock, flags);
1221
1222	return IRQ_HANDLED;
1223}
1224
1225
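/*
 * Allocate the shared ring page, grant the backend access to it and bind an
 * event channel to blkif_interrupt(); on failure everything is torn down
 * again through blkif_free().
 */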
1226static int setup_blkring(struct xenbus_device *dev,
1227			 struct blkfront_info *info)
1228{
1229	struct blkif_sring *sring;
1230	int err;
1231
1232	info->ring_ref = GRANT_INVALID_REF;
1233
1234	sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
1235	if (!sring) {
1236		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
1237		return -ENOMEM;
1238	}
1239	SHARED_RING_INIT(sring);
1240	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
1241
1242	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
1243	if (err < 0) {
1244		free_page((unsigned long)sring);
1245		info->ring.sring = NULL;
1246		goto fail;
1247	}
1248	info->ring_ref = err;
1249
1250	err = xenbus_alloc_evtchn(dev, &info->evtchn);
1251	if (err)
1252		goto fail;
1253
1254	err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
1255					"blkif", info);
1256	if (err <= 0) {
1257		xenbus_dev_fatal(dev, err,
1258				 "bind_evtchn_to_irqhandler failed");
1259		goto fail;
1260	}
1261	info->irq = err;
1262
1263	return 0;
1264fail:
1265	blkif_free(info, 0);
1266	return err;
1267}
1268
1269
1270/* Common code used when first setting up, and when resuming. */
1271static int talk_to_blkback(struct xenbus_device *dev,
1272			   struct blkfront_info *info)
1273{
1274	const char *message = NULL;
1275	struct xenbus_transaction xbt;
1276	int err;
1277
1278	/* Create shared ring, alloc event channel. */
1279	err = setup_blkring(dev, info);
1280	if (err)
1281		goto out;
1282
1283again:
1284	err = xenbus_transaction_start(&xbt);
1285	if (err) {
1286		xenbus_dev_fatal(dev, err, "starting transaction");
1287		goto destroy_blkring;
1288	}
1289
1290	err = xenbus_printf(xbt, dev->nodename,
1291			    "ring-ref", "%u", info->ring_ref);
1292	if (err) {
1293		message = "writing ring-ref";
1294		goto abort_transaction;
1295	}
1296	err = xenbus_printf(xbt, dev->nodename,
1297			    "event-channel", "%u", info->evtchn);
1298	if (err) {
1299		message = "writing event-channel";
1300		goto abort_transaction;
1301	}
1302	err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1303			    XEN_IO_PROTO_ABI_NATIVE);
1304	if (err) {
1305		message = "writing protocol";
1306		goto abort_transaction;
1307	}
1308	err = xenbus_printf(xbt, dev->nodename,
1309			    "feature-persistent", "%u", 1);
1310	if (err)
1311		dev_warn(&dev->dev,
1312			 "writing persistent grants feature to xenbus");
1313
1314	err = xenbus_transaction_end(xbt, 0);
1315	if (err) {
1316		if (err == -EAGAIN)
1317			goto again;
1318		xenbus_dev_fatal(dev, err, "completing transaction");
1319		goto destroy_blkring;
1320	}
1321
1322	xenbus_switch_state(dev, XenbusStateInitialised);
1323
1324	return 0;
1325
1326 abort_transaction:
1327	xenbus_transaction_end(xbt, 1);
1328	if (message)
1329		xenbus_dev_fatal(dev, err, "%s", message);
1330 destroy_blkring:
1331	blkif_free(info, 0);
1332 out:
1333	return err;
1334}
1335
1336/**
1337 * Entry point to this code when a new device is created.  Allocate the basic
1338 * structures and the ring buffer for communication with the backend, and
1339 * inform the backend of the appropriate details for those.  Switch to
1340 * Initialised state.
1341 */
1342static int blkfront_probe(struct xenbus_device *dev,
1343			  const struct xenbus_device_id *id)
1344{
1345	int err, vdevice, i;
1346	struct blkfront_info *info;
1347
1348	/* FIXME: Use dynamic device id if this is not set. */
1349	err = xenbus_scanf(XBT_NIL, dev->nodename,
1350			   "virtual-device", "%i", &vdevice);
1351	if (err != 1) {
1352		/* go looking in the extended area instead */
1353		err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
1354				   "%i", &vdevice);
1355		if (err != 1) {
1356			xenbus_dev_fatal(dev, err, "reading virtual-device");
1357			return err;
1358		}
1359	}
1360
1361	if (xen_hvm_domain()) {
1362		char *type;
1363		int len;
1364		/* no unplug has been done: do not hook devices != xen vbds */
1365		if (xen_has_pv_and_legacy_disk_devices()) {
1366			int major;
1367
1368			if (!VDEV_IS_EXTENDED(vdevice))
1369				major = BLKIF_MAJOR(vdevice);
1370			else
1371				major = XENVBD_MAJOR;
1372
1373			if (major != XENVBD_MAJOR) {
1374				printk(KERN_INFO
1375						"%s: HVM does not support vbd %d as xen block device\n",
1376						__func__, vdevice);
1377				return -ENODEV;
1378			}
1379		}
1380		/* do not create a PV cdrom device if we are an HVM guest */
1381		type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
1382		if (IS_ERR(type))
1383			return -ENODEV;
1384		if (strncmp(type, "cdrom", 5) == 0) {
1385			kfree(type);
1386			return -ENODEV;
1387		}
1388		kfree(type);
1389	}
1390	info = kzalloc(sizeof(*info), GFP_KERNEL);
1391	if (!info) {
1392		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
1393		return -ENOMEM;
1394	}
1395
1396	mutex_init(&info->mutex);
1397	spin_lock_init(&info->io_lock);
1398	info->xbdev = dev;
1399	info->vdevice = vdevice;
1400	INIT_LIST_HEAD(&info->grants);
1401	INIT_LIST_HEAD(&info->indirect_pages);
1402	info->persistent_gnts_c = 0;
1403	info->connected = BLKIF_STATE_DISCONNECTED;
1404	INIT_WORK(&info->work, blkif_restart_queue);
1405
1406	for (i = 0; i < BLK_RING_SIZE; i++)
1407		info->shadow[i].req.u.rw.id = i+1;
1408	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1409
1410	/* Front end dir is a number, which is used as the id. */
1411	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
1412	dev_set_drvdata(&dev->dev, info);
1413
1414	err = talk_to_blkback(dev, info);
1415	if (err) {
1416		kfree(info);
1417		dev_set_drvdata(&dev->dev, NULL);
1418		return err;
1419	}
1420
1421	return 0;
1422}
1423
1424static void split_bio_end(struct bio *bio, int error)
1425{
1426	struct split_bio *split_bio = bio->bi_private;
1427
1428	if (error)
1429		split_bio->err = error;
1430
1431	if (atomic_dec_and_test(&split_bio->pending)) {
1432		split_bio->bio->bi_phys_segments = 0;
1433		bio_endio(split_bio->bio, split_bio->err);
1434		kfree(split_bio);
1435	}
1436	bio_put(bio);
1437}
1438
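/*
 * Rebuild state after a resume or backend reconnect: snapshot and reset the
 * shadow ring, renegotiate indirect descriptors, break pending requests back
 * into bios and resubmit them, splitting any bio that has more segments than
 * the newly negotiated limit (see struct split_bio).
 */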
1439static int blkif_recover(struct blkfront_info *info)
1440{
1441	int i;
1442	struct request *req, *n;
1443	struct blk_shadow *copy;
1444	int rc;
1445	struct bio *bio, *cloned_bio;
1446	struct bio_list bio_list, merge_bio;
1447	unsigned int segs, offset;
1448	int pending, size;
1449	struct split_bio *split_bio;
1450	struct list_head requests;
1451
1452	/* Stage 1: Make a safe copy of the shadow state. */
1453	copy = kmemdup(info->shadow, sizeof(info->shadow),
1454		       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
1455	if (!copy)
1456		return -ENOMEM;
1457
1458	/* Stage 2: Set up free list. */
1459	memset(&info->shadow, 0, sizeof(info->shadow));
1460	for (i = 0; i < BLK_RING_SIZE; i++)
1461		info->shadow[i].req.u.rw.id = i+1;
1462	info->shadow_free = info->ring.req_prod_pvt;
1463	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1464
1465	rc = blkfront_setup_indirect(info);
1466	if (rc) {
1467		kfree(copy);
1468		return rc;
1469	}
1470
1471	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1472	blk_queue_max_segments(info->rq, segs);
1473	bio_list_init(&bio_list);
1474	INIT_LIST_HEAD(&requests);
1475	for (i = 0; i < BLK_RING_SIZE; i++) {
1476		/* Not in use? */
1477		if (!copy[i].request)
1478			continue;
1479
1480		/*
1481		 * Get the bios in the request so we can re-queue them.
1482		 */
1483		if (copy[i].request->cmd_flags &
1484		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1485			/*
1486			 * Flush operations don't contain bios, so
1487			 * we need to requeue the whole request
1488			 */
1489			list_add(&copy[i].request->queuelist, &requests);
1490			continue;
1491		}
1492		merge_bio.head = copy[i].request->bio;
1493		merge_bio.tail = copy[i].request->biotail;
1494		bio_list_merge(&bio_list, &merge_bio);
1495		copy[i].request->bio = NULL;
1496		blk_put_request(copy[i].request);
1497	}
1498
1499	kfree(copy);
1500
1501	/*
1502	 * Empty the queue; this is important because we might have
1503	 * requests in the queue with more segments than what we
1504	 * can handle now.
1505	 */
1506	spin_lock_irq(&info->io_lock);
1507	while ((req = blk_fetch_request(info->rq)) != NULL) {
1508		if (req->cmd_flags &
1509		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1510			list_add(&req->queuelist, &requests);
1511			continue;
1512		}
1513		merge_bio.head = req->bio;
1514		merge_bio.tail = req->biotail;
1515		bio_list_merge(&bio_list, &merge_bio);
1516		req->bio = NULL;
1517		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
1518			pr_alert("diskcache flush request found!\n");
1519		__blk_put_request(info->rq, req);
1520	}
1521	spin_unlock_irq(&info->io_lock);
1522
1523	xenbus_switch_state(info->xbdev, XenbusStateConnected);
1524
1525	spin_lock_irq(&info->io_lock);
1526
1527	/* Now safe for us to use the shared ring */
1528	info->connected = BLKIF_STATE_CONNECTED;
1529
1530	/* Kick any other new requests queued since we resumed */
1531	kick_pending_request_queues(info);
1532
1533	list_for_each_entry_safe(req, n, &requests, queuelist) {
1534		/* Requeue pending requests (flush or discard) */
1535		list_del_init(&req->queuelist);
1536		BUG_ON(req->nr_phys_segments > segs);
1537		blk_requeue_request(info->rq, req);
1538	}
1539	spin_unlock_irq(&info->io_lock);
1540
1541	while ((bio = bio_list_pop(&bio_list)) != NULL) {
1542		/* Traverse the list of pending bios and re-queue them */
1543		if (bio_segments(bio) > segs) {
1544			/*
1545			 * This bio has more segments than what we can
1546			 * handle, we have to split it.
1547			 */
1548			pending = (bio_segments(bio) + segs - 1) / segs;
1549			split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
1550			BUG_ON(split_bio == NULL);
1551			atomic_set(&split_bio->pending, pending);
1552			split_bio->bio = bio;
1553			for (i = 0; i < pending; i++) {
1554				offset = (i * segs * PAGE_SIZE) >> 9;
1555				size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
1556					   (unsigned int)bio_sectors(bio) - offset);
1557				cloned_bio = bio_clone(bio, GFP_NOIO);
1558				BUG_ON(cloned_bio == NULL);
1559				bio_trim(cloned_bio, offset, size);
1560				cloned_bio->bi_private = split_bio;
1561				cloned_bio->bi_end_io = split_bio_end;
1562				submit_bio(cloned_bio->bi_rw, cloned_bio);
1563			}
1564			/*
1565			 * Now we have to wait for all those smaller bios to
1566			 * end, so we can also end the "parent" bio.
1567			 */
1568			continue;
1569		}
1570		/* We don't need to split this bio */
1571		submit_bio(bio->bi_rw, bio);
1572	}
1573
1574	return 0;
1575}
1576
1577/**
1578 * We are reconnecting to the backend, due to a suspend/resume, or a backend
1579 * driver restart.  We tear down our blkif structure and recreate it, but
1580 * leave the device-layer structures intact so that this is transparent to the
1581 * rest of the kernel.
1582 */
1583static int blkfront_resume(struct xenbus_device *dev)
1584{
1585	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1586	int err;
1587
1588	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
1589
1590	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
1591
1592	err = talk_to_blkback(dev, info);
1593
1594	/*
1595	 * We have to wait for the backend to switch to
1596	 * connected state, since we want to read which
1597	 * features it supports.
1598	 */
1599
1600	return err;
1601}
1602
1603static void
1604blkfront_closing(struct blkfront_info *info)
1605{
1606	struct xenbus_device *xbdev = info->xbdev;
1607	struct block_device *bdev = NULL;
1608
1609	mutex_lock(&info->mutex);
1610
1611	if (xbdev->state == XenbusStateClosing) {
1612		mutex_unlock(&info->mutex);
1613		return;
1614	}
1615
1616	if (info->gd)
1617		bdev = bdget_disk(info->gd, 0);
1618
1619	mutex_unlock(&info->mutex);
1620
1621	if (!bdev) {
1622		xenbus_frontend_closed(xbdev);
1623		return;
1624	}
1625
1626	mutex_lock(&bdev->bd_mutex);
1627
1628	if (bdev->bd_openers) {
1629		xenbus_dev_error(xbdev, -EBUSY,
1630				 "Device in use; refusing to close");
1631		xenbus_switch_state(xbdev, XenbusStateClosing);
1632	} else {
1633		xlvbd_release_gendisk(info);
1634		xenbus_frontend_closed(xbdev);
1635	}
1636
1637	mutex_unlock(&bdev->bd_mutex);
1638	bdput(bdev);
1639}
1640
1641static void blkfront_setup_discard(struct blkfront_info *info)
1642{
1643	int err;
1644	unsigned int discard_granularity;
1645	unsigned int discard_alignment;
1646	unsigned int discard_secure;
1647
1648	info->feature_discard = 1;
1649	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1650		"discard-granularity", "%u", &discard_granularity,
1651		"discard-alignment", "%u", &discard_alignment,
1652		NULL);
1653	if (!err) {
1654		info->discard_granularity = discard_granularity;
1655		info->discard_alignment = discard_alignment;
1656	}
1657	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1658		    "discard-secure", "%d", &discard_secure,
1659		    NULL);
1660	if (!err)
1661		info->feature_secdiscard = !!discard_secure;
1662}
1663
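/*
 * Negotiate indirect descriptors: read feature-max-indirect-segments from
 * the backend, clamp it to the module's 'max' parameter, and pre-allocate
 * the per-ring-slot grant, scatterlist and indirect-grant arrays sized for
 * the resulting segment count.
 */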
1664static int blkfront_setup_indirect(struct blkfront_info *info)
1665{
1666	unsigned int indirect_segments, segs;
1667	int err, i;
1668
1669	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1670			    "feature-max-indirect-segments", "%u", &indirect_segments,
1671			    NULL);
1672	if (err) {
1673		info->max_indirect_segments = 0;
1674		segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1675	} else {
1676		info->max_indirect_segments = min(indirect_segments,
1677						  xen_blkif_max_segments);
1678		segs = info->max_indirect_segments;
1679	}
1680
1681	err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
1682	if (err)
1683		goto out_of_memory;
1684
1685	if (!info->feature_persistent && info->max_indirect_segments) {
1686		/*
1687		 * We are using indirect descriptors but not persistent
1688		 * grants, we need to allocate a set of pages that can be
1689		 * used for mapping indirect grefs
1690		 */
1691		int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
1692
1693		BUG_ON(!list_empty(&info->indirect_pages));
1694		for (i = 0; i < num; i++) {
1695			struct page *indirect_page = alloc_page(GFP_NOIO);
1696			if (!indirect_page)
1697				goto out_of_memory;
1698			list_add(&indirect_page->lru, &info->indirect_pages);
1699		}
1700	}
1701
1702	for (i = 0; i < BLK_RING_SIZE; i++) {
1703		info->shadow[i].grants_used = kzalloc(
1704			sizeof(info->shadow[i].grants_used[0]) * segs,
1705			GFP_NOIO);
1706		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
1707		if (info->max_indirect_segments)
1708			info->shadow[i].indirect_grants = kzalloc(
1709				sizeof(info->shadow[i].indirect_grants[0]) *
1710				INDIRECT_GREFS(segs),
1711				GFP_NOIO);
1712		if ((info->shadow[i].grants_used == NULL) ||
1713			(info->shadow[i].sg == NULL) ||
1714		     (info->max_indirect_segments &&
1715		     (info->shadow[i].indirect_grants == NULL)))
1716			goto out_of_memory;
1717		sg_init_table(info->shadow[i].sg, segs);
1718	}
1719
1720
1721	return 0;
1722
1723out_of_memory:
1724	for (i = 0; i < BLK_RING_SIZE; i++) {
1725		kfree(info->shadow[i].grants_used);
1726		info->shadow[i].grants_used = NULL;
1727		kfree(info->shadow[i].sg);
1728		info->shadow[i].sg = NULL;
1729		kfree(info->shadow[i].indirect_grants);
1730		info->shadow[i].indirect_grants = NULL;
1731	}
1732	if (!list_empty(&info->indirect_pages)) {
1733		struct page *indirect_page, *n;
1734		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
1735			list_del(&indirect_page->lru);
1736			__free_page(indirect_page);
1737		}
1738	}
1739	return -ENOMEM;
1740}
1741
1742/*
1743 * Invoked when the backend is finally 'ready' (and has told us
1744 * the details about the physical device - #sectors, size, etc).
1745 */
1746static void blkfront_connect(struct blkfront_info *info)
1747{
1748	unsigned long long sectors;
1749	unsigned long sector_size;
1750	unsigned int physical_sector_size;
1751	unsigned int binfo;
1752	int err;
1753	int barrier, flush, discard, persistent;
1754
1755	switch (info->connected) {
1756	case BLKIF_STATE_CONNECTED:
1757		/*
1758		 * Potentially, the back-end may be signalling
1759		 * a capacity change; update the capacity.
1760		 */
1761		err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1762				   "sectors", "%Lu", &sectors);
1763		if (XENBUS_EXIST_ERR(err))
1764			return;
1765		printk(KERN_INFO "Setting capacity to %Lu\n",
1766		       sectors);
1767		set_capacity(info->gd, sectors);
1768		revalidate_disk(info->gd);
1769
1770		return;
1771	case BLKIF_STATE_SUSPENDED:
1772		/*
1773		 * If we are recovering from suspension, we need to wait
1774		 * for the backend to announce its features before
1775		 * reconnecting; at least we need to know if the backend
1776		 * supports indirect descriptors, and how many.
1777		 */
1778		blkif_recover(info);
1779		return;
1780
1781	default:
1782		break;
1783	}
1784
1785	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
1786		__func__, info->xbdev->otherend);
1787
1788	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1789			    "sectors", "%llu", &sectors,
1790			    "info", "%u", &binfo,
1791			    "sector-size", "%lu", &sector_size,
1792			    NULL);
1793	if (err) {
1794		xenbus_dev_fatal(info->xbdev, err,
1795				 "reading backend fields at %s",
1796				 info->xbdev->otherend);
1797		return;
1798	}
1799
1800	/*
1801	 * physical-sector-size is a newer field, so old backends may not
1802	 * provide this. Assume physical sector size to be the same as
1803	 * sector_size in that case.
1804	 */
1805	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1806			   "physical-sector-size", "%u", &physical_sector_size);
1807	if (err != 1)
1808		physical_sector_size = sector_size;
1809
1810	info->feature_flush = 0;
1811	info->flush_op = 0;
1812
1813	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1814			    "feature-barrier", "%d", &barrier,
1815			    NULL);
1816
1817	/*
1818	 * If there's no "feature-barrier" defined, then it means
1819	 * we're dealing with a very old backend which writes
1820	 * synchronously; nothing to do.
1821	 *
1822	 * If there are barriers, then we use flush.
1823	 */
1824	if (!err && barrier) {
1825		info->feature_flush = REQ_FLUSH | REQ_FUA;
1826		info->flush_op = BLKIF_OP_WRITE_BARRIER;
1827	}
1828	/*
1829	 * And if there is "feature-flush-cache" use that in preference
1830	 * to barriers.
1831	 */
1832	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1833			    "feature-flush-cache", "%d", &flush,
1834			    NULL);
1835
1836	if (!err && flush) {
1837		info->feature_flush = REQ_FLUSH;
1838		info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
1839	}
1840
1841	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1842			    "feature-discard", "%d", &discard,
1843			    NULL);
1844
1845	if (!err && discard)
1846		blkfront_setup_discard(info);
1847
1848	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1849			    "feature-persistent", "%u", &persistent,
1850			    NULL);
1851	if (err)
1852		info->feature_persistent = 0;
1853	else
1854		info->feature_persistent = persistent;
1855
1856	err = blkfront_setup_indirect(info);
1857	if (err) {
1858		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
1859				 info->xbdev->otherend);
1860		return;
1861	}
1862
1863	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
1864				  physical_sector_size);
1865	if (err) {
1866		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
1867				 info->xbdev->otherend);
1868		return;
1869	}
1870
1871	xenbus_switch_state(info->xbdev, XenbusStateConnected);
1872
1873	/* Kick pending requests. */
1874	spin_lock_irq(&info->io_lock);
1875	info->connected = BLKIF_STATE_CONNECTED;
1876	kick_pending_request_queues(info);
1877	spin_unlock_irq(&info->io_lock);
1878
1879	add_disk(info->gd);
1880
1881	info->is_ready = 1;
1882}
1883
1884/**
1885 * Callback received when the backend's state changes.
1886 */
1887static void blkback_changed(struct xenbus_device *dev,
1888			    enum xenbus_state backend_state)
1889{
1890	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1891
1892	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
1893
1894	switch (backend_state) {
1895	case XenbusStateInitialising:
1896	case XenbusStateInitWait:
1897	case XenbusStateInitialised:
1898	case XenbusStateReconfiguring:
1899	case XenbusStateReconfigured:
1900	case XenbusStateUnknown:
1901		break;
1902
1903	case XenbusStateConnected:
1904		blkfront_connect(info);
1905		break;
1906
1907	case XenbusStateClosed:
1908		if (dev->state == XenbusStateClosed)
1909			break;
1910		/* Missed the backend's Closing state -- fallthrough */
1911	case XenbusStateClosing:
1912		blkfront_closing(info);
1913		break;
1914	}
1915}
1916
1917static int blkfront_remove(struct xenbus_device *xbdev)
1918{
1919	struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
1920	struct block_device *bdev = NULL;
1921	struct gendisk *disk;
1922
1923	dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
1924
1925	blkif_free(info, 0);
1926
1927	mutex_lock(&info->mutex);
1928
1929	disk = info->gd;
1930	if (disk)
1931		bdev = bdget_disk(disk, 0);
1932
1933	info->xbdev = NULL;
1934	mutex_unlock(&info->mutex);
1935
1936	if (!bdev) {
1937		kfree(info);
1938		return 0;
1939	}
1940
1941	/*
1942	 * The xbdev was removed before we reached the Closed
1943	 * state. See if it's safe to remove the disk. If the bdev
1944	 * isn't closed yet, we let release take care of it.
1945	 */
1946
1947	mutex_lock(&bdev->bd_mutex);
1948	info = disk->private_data;
1949
1950	dev_warn(disk_to_dev(disk),
1951		 "%s was hot-unplugged, %d stale handles\n",
1952		 xbdev->nodename, bdev->bd_openers);
1953
1954	if (info && !bdev->bd_openers) {
1955		xlvbd_release_gendisk(info);
1956		disk->private_data = NULL;
1957		kfree(info);
1958	}
1959
1960	mutex_unlock(&bdev->bd_mutex);
1961	bdput(bdev);
1962
1963	return 0;
1964}
1965
1966static int blkfront_is_ready(struct xenbus_device *dev)
1967{
1968	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1969
1970	return info->is_ready && info->xbdev;
1971}
1972
1973static int blkif_open(struct block_device *bdev, fmode_t mode)
1974{
1975	struct gendisk *disk = bdev->bd_disk;
1976	struct blkfront_info *info;
1977	int err = 0;
1978
1979	mutex_lock(&blkfront_mutex);
1980
1981	info = disk->private_data;
1982	if (!info) {
1983		/* xbdev gone */
1984		err = -ERESTARTSYS;
1985		goto out;
1986	}
1987
1988	mutex_lock(&info->mutex);
1989
1990	if (!info->gd)
1991		/* xbdev is closed */
1992		err = -ERESTARTSYS;
1993
1994	mutex_unlock(&info->mutex);
1995
1996out:
1997	mutex_unlock(&blkfront_mutex);
1998	return err;
1999}
2000
2001static void blkif_release(struct gendisk *disk, fmode_t mode)
2002{
2003	struct blkfront_info *info = disk->private_data;
2004	struct block_device *bdev;
2005	struct xenbus_device *xbdev;
2006
2007	mutex_lock(&blkfront_mutex);
2008
2009	bdev = bdget_disk(disk, 0);
2010
2011	if (!bdev) {
2012		WARN(1, "Block device %s yanked out from us!\n", disk->disk_name);
2013		goto out_mutex;
2014	}
2015	if (bdev->bd_openers)
2016		goto out;
2017
2018	/*
2019	 * Check if we have been instructed to close. We will have
2020	 * deferred this request, because the bdev was still open.
2021	 */
2022
2023	mutex_lock(&info->mutex);
2024	xbdev = info->xbdev;
2025
2026	if (xbdev && xbdev->state == XenbusStateClosing) {
2027		/* pending switch to state closed */
2028		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
2029		xlvbd_release_gendisk(info);
2030		xenbus_frontend_closed(info->xbdev);
2031	}
2032
2033	mutex_unlock(&info->mutex);
2034
2035	if (!xbdev) {
2036		/* sudden device removal */
2037		dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
2038		xlvbd_release_gendisk(info);
2039		disk->private_data = NULL;
2040		kfree(info);
2041	}
2042
2043out:
2044	bdput(bdev);
2045out_mutex:
2046	mutex_unlock(&blkfront_mutex);
2047}
2048
2049static const struct block_device_operations xlvbd_block_fops =
2050{
2051	.owner = THIS_MODULE,
2052	.open = blkif_open,
2053	.release = blkif_release,
2054	.getgeo = blkif_getgeo,
2055	.ioctl = blkif_ioctl,
2056};
2057
2058
2059static const struct xenbus_device_id blkfront_ids[] = {
2060	{ "vbd" },
2061	{ "" }
2062};
2063
2064static struct xenbus_driver blkfront_driver = {
2065	.ids  = blkfront_ids,
2066	.probe = blkfront_probe,
2067	.remove = blkfront_remove,
2068	.resume = blkfront_resume,
2069	.otherend_changed = blkback_changed,
2070	.is_ready = blkfront_is_ready,
2071};
2072
2073static int __init xlblk_init(void)
2074{
2075	int ret;
2076
2077	if (!xen_domain())
2078		return -ENODEV;
2079
2080	if (!xen_has_pv_disk_devices())
2081		return -ENODEV;
2082
2083	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
2084		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
2085		       XENVBD_MAJOR, DEV_NAME);
2086		return -ENODEV;
2087	}
2088
2089	ret = xenbus_register_frontend(&blkfront_driver);
2090	if (ret) {
2091		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2092		return ret;
2093	}
2094
2095	return 0;
2096}
2097module_init(xlblk_init);
2098
2099
2100static void __exit xlblk_exit(void)
2101{
2102	xenbus_unregister_driver(&blkfront_driver);
2103	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2104	kfree(minors);
2105}
2106module_exit(xlblk_exit);
2107
2108MODULE_DESCRIPTION("Xen virtual block device frontend");
2109MODULE_LICENSE("GPL");
2110MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
2111MODULE_ALIAS("xen:vbd");
2112MODULE_ALIAS("xenblk");
2113