[go: nahoru, domu]

1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 */
36
37#define DEBUG_SUBSYSTEM S_LMV
38#include <linux/slab.h>
39#include <linux/module.h>
40#include <linux/init.h>
41#include <linux/pagemap.h>
42#include <linux/mm.h>
43#include <asm/div64.h>
44#include <linux/seq_file.h>
45#include <linux/namei.h>
46#include <asm/uaccess.h>
47
48#include "../include/lustre/lustre_idl.h"
49#include "../include/obd_support.h"
50#include "../include/lustre_lib.h"
51#include "../include/lustre_net.h"
52#include "../include/obd_class.h"
53#include "../include/lprocfs_status.h"
54#include "../include/lustre_lite.h"
55#include "../include/lustre_fid.h"
56#include "lmv_internal.h"
57
58static void lmv_activate_target(struct lmv_obd *lmv,
59				struct lmv_tgt_desc *tgt,
60				int activate)
61{
62	if (tgt->ltd_active == activate)
63		return;
64
65	tgt->ltd_active = activate;
66	lmv->desc.ld_active_tgt_count += (activate ? 1 : -1);
67}
68
69/**
70 * Error codes:
71 *
72 *  -EINVAL  : UUID can't be found in the LMV's target list
73 *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
74 *  -EBADF   : The UUID is found, but the OBD of the wrong type (!)
75 */
76static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
77			      int activate)
78{
79	struct lmv_tgt_desc    *uninitialized_var(tgt);
80	struct obd_device      *obd;
81	int		     i;
82	int		     rc = 0;
83
84	CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
85	       lmv, uuid->uuid, activate);
86
87	spin_lock(&lmv->lmv_lock);
88	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
89		tgt = lmv->tgts[i];
90		if (tgt == NULL || tgt->ltd_exp == NULL)
91			continue;
92
93		CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i,
94		       tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
95
96		if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
97			break;
98	}
99
100	if (i == lmv->desc.ld_tgt_count) {
101		rc = -EINVAL;
102		goto out_lmv_lock;
103	}
104
105	obd = class_exp2obd(tgt->ltd_exp);
106	if (obd == NULL) {
107		rc = -ENOTCONN;
108		goto out_lmv_lock;
109	}
110
111	CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
112	       obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
113	       obd->obd_type->typ_name, i);
114	LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
115
116	if (tgt->ltd_active == activate) {
117		CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
118		       activate ? "" : "in");
119		goto out_lmv_lock;
120	}
121
122	CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
123	       activate ? "" : "in");
124	lmv_activate_target(lmv, tgt, activate);
125
126 out_lmv_lock:
127	spin_unlock(&lmv->lmv_lock);
128	return rc;
129}
130
131struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
132{
133	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
134
135	return obd_get_uuid(lmv->tgts[0]->ltd_exp);
136}
137
138static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
139		      enum obd_notify_event ev, void *data)
140{
141	struct obd_connect_data *conn_data;
142	struct lmv_obd	  *lmv = &obd->u.lmv;
143	struct obd_uuid	 *uuid;
144	int		      rc = 0;
145
146	if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
147		CERROR("unexpected notification of %s %s!\n",
148		       watched->obd_type->typ_name,
149		       watched->obd_name);
150		return -EINVAL;
151	}
152
153	uuid = &watched->u.cli.cl_target_uuid;
154	if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
155		/*
156		 * Set MDC as active before notifying the observer, so the
157		 * observer can use the MDC normally.
158		 */
159		rc = lmv_set_mdc_active(lmv, uuid,
160					ev == OBD_NOTIFY_ACTIVE);
161		if (rc) {
162			CERROR("%sactivation of %s failed: %d\n",
163			       ev == OBD_NOTIFY_ACTIVE ? "" : "de",
164			       uuid->uuid, rc);
165			return rc;
166		}
167	} else if (ev == OBD_NOTIFY_OCD) {
168		conn_data = &watched->u.cli.cl_import->imp_connect_data;
169		/*
170		 * XXX: Make sure that ocd_connect_flags from all targets are
171		 * the same. Otherwise one of MDTs runs wrong version or
172		 * something like this.  --umka
173		 */
174		obd->obd_self_export->exp_connect_data = *conn_data;
175	}
176#if 0
177	else if (ev == OBD_NOTIFY_DISCON) {
178		/*
179		 * For disconnect event, flush fld cache for failout MDS case.
180		 */
181		fld_client_flush(&lmv->lmv_fld);
182	}
183#endif
184	/*
185	 * Pass the notification up the chain.
186	 */
187	if (obd->obd_observer)
188		rc = obd_notify(obd->obd_observer, watched, ev, data);
189
190	return rc;
191}
192
193/**
194 * This is fake connect function. Its purpose is to initialize lmv and say
195 * caller that everything is okay. Real connection will be performed later.
196 */
197static int lmv_connect(const struct lu_env *env,
198		       struct obd_export **exp, struct obd_device *obd,
199		       struct obd_uuid *cluuid, struct obd_connect_data *data,
200		       void *localdata)
201{
202	struct proc_dir_entry *lmv_proc_dir;
203	struct lmv_obd	*lmv = &obd->u.lmv;
204	struct lustre_handle  conn = { 0 };
205	int		    rc = 0;
206
207	/*
208	 * We don't want to actually do the underlying connections more than
209	 * once, so keep track.
210	 */
211	lmv->refcount++;
212	if (lmv->refcount > 1) {
213		*exp = NULL;
214		return 0;
215	}
216
217	rc = class_connect(&conn, obd, cluuid);
218	if (rc) {
219		CERROR("class_connection() returned %d\n", rc);
220		return rc;
221	}
222
223	*exp = class_conn2export(&conn);
224	class_export_get(*exp);
225
226	lmv->exp = *exp;
227	lmv->connected = 0;
228	lmv->cluuid = *cluuid;
229
230	if (data)
231		lmv->conn_data = *data;
232
233	if (obd->obd_proc_private != NULL) {
234		lmv_proc_dir = obd->obd_proc_private;
235	} else {
236		lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry,
237						NULL, NULL);
238		if (IS_ERR(lmv_proc_dir)) {
239			CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.",
240			       obd->obd_type->typ_name, obd->obd_name);
241			lmv_proc_dir = NULL;
242		}
243		obd->obd_proc_private = lmv_proc_dir;
244	}
245
246	/*
247	 * All real clients should perform actual connection right away, because
248	 * it is possible, that LMV will not have opportunity to connect targets
249	 * and MDC stuff will be called directly, for instance while reading
250	 * ../mdc/../kbytesfree procfs file, etc.
251	 */
252	if (data->ocd_connect_flags & OBD_CONNECT_REAL)
253		rc = lmv_check_connect(obd);
254
255	if (rc && lmv_proc_dir) {
256		lprocfs_remove(&lmv_proc_dir);
257		obd->obd_proc_private = NULL;
258	}
259
260	return rc;
261}
262
263static void lmv_set_timeouts(struct obd_device *obd)
264{
265	struct lmv_tgt_desc   *tgt;
266	struct lmv_obd	*lmv;
267	int		    i;
268
269	lmv = &obd->u.lmv;
270	if (lmv->server_timeout == 0)
271		return;
272
273	if (lmv->connected == 0)
274		return;
275
276	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
277		tgt = lmv->tgts[i];
278		if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0)
279			continue;
280
281		obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
282				   KEY_INTERMDS, 0, NULL, NULL);
283	}
284}
285
286static int lmv_init_ea_size(struct obd_export *exp, int easize,
287			    int def_easize, int cookiesize, int def_cookiesize)
288{
289	struct obd_device   *obd = exp->exp_obd;
290	struct lmv_obd      *lmv = &obd->u.lmv;
291	int		  i;
292	int		  rc = 0;
293	int		  change = 0;
294
295	if (lmv->max_easize < easize) {
296		lmv->max_easize = easize;
297		change = 1;
298	}
299	if (lmv->max_def_easize < def_easize) {
300		lmv->max_def_easize = def_easize;
301		change = 1;
302	}
303	if (lmv->max_cookiesize < cookiesize) {
304		lmv->max_cookiesize = cookiesize;
305		change = 1;
306	}
307	if (lmv->max_def_cookiesize < def_cookiesize) {
308		lmv->max_def_cookiesize = def_cookiesize;
309		change = 1;
310	}
311	if (change == 0)
312		return 0;
313
314	if (lmv->connected == 0)
315		return 0;
316
317	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
318		if (lmv->tgts[i] == NULL ||
319		    lmv->tgts[i]->ltd_exp == NULL ||
320		    lmv->tgts[i]->ltd_active == 0) {
321			CWARN("%s: NULL export for %d\n", obd->obd_name, i);
322			continue;
323		}
324
325		rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize,
326				     cookiesize, def_cookiesize);
327		if (rc) {
328			CERROR("%s: obd_init_ea_size() failed on MDT target %d:"
329			       " rc = %d.\n", obd->obd_name, i, rc);
330			break;
331		}
332	}
333	return rc;
334}
335
336#define MAX_STRING_SIZE 128
337
338int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
339{
340	struct proc_dir_entry   *lmv_proc_dir;
341	struct lmv_obd	  *lmv = &obd->u.lmv;
342	struct obd_uuid	 *cluuid = &lmv->cluuid;
343	struct obd_uuid	  lmv_mdc_uuid = { "LMV_MDC_UUID" };
344	struct obd_device       *mdc_obd;
345	struct obd_export       *mdc_exp;
346	struct lu_fld_target     target;
347	int		      rc;
348
349	mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
350					&obd->obd_uuid);
351	if (!mdc_obd) {
352		CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
353		return -EINVAL;
354	}
355
356	CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n",
357		mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
358		tgt->ltd_uuid.uuid, obd->obd_uuid.uuid,
359		cluuid->uuid);
360
361	if (!mdc_obd->obd_set_up) {
362		CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
363		return -EINVAL;
364	}
365
366	rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
367			 &lmv->conn_data, NULL);
368	if (rc) {
369		CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
370		return rc;
371	}
372
373	/*
374	 * Init fid sequence client for this mdc and add new fld target.
375	 */
376	rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA);
377	if (rc)
378		return rc;
379
380	target.ft_srv = NULL;
381	target.ft_exp = mdc_exp;
382	target.ft_idx = tgt->ltd_idx;
383
384	fld_client_add_target(&lmv->lmv_fld, &target);
385
386	rc = obd_register_observer(mdc_obd, obd);
387	if (rc) {
388		obd_disconnect(mdc_exp);
389		CERROR("target %s register_observer error %d\n",
390		       tgt->ltd_uuid.uuid, rc);
391		return rc;
392	}
393
394	if (obd->obd_observer) {
395		/*
396		 * Tell the observer about the new target.
397		 */
398		rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
399				OBD_NOTIFY_ACTIVE,
400				(void *)(tgt - lmv->tgts[0]));
401		if (rc) {
402			obd_disconnect(mdc_exp);
403			return rc;
404		}
405	}
406
407	tgt->ltd_active = 1;
408	tgt->ltd_exp = mdc_exp;
409	lmv->desc.ld_active_tgt_count++;
410
411	md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize,
412			lmv->max_cookiesize, lmv->max_def_cookiesize);
413
414	CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
415		mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
416		atomic_read(&obd->obd_refcount));
417
418	lmv_proc_dir = obd->obd_proc_private;
419	if (lmv_proc_dir) {
420		struct proc_dir_entry *mdc_symlink;
421
422		LASSERT(mdc_obd->obd_type != NULL);
423		LASSERT(mdc_obd->obd_type->typ_name != NULL);
424		mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name,
425						  lmv_proc_dir,
426						  "../../../%s/%s",
427						  mdc_obd->obd_type->typ_name,
428						  mdc_obd->obd_name);
429		if (mdc_symlink == NULL) {
430			CERROR("Could not register LMV target "
431			       "/proc/fs/lustre/%s/%s/target_obds/%s.",
432			       obd->obd_type->typ_name, obd->obd_name,
433			       mdc_obd->obd_name);
434			lprocfs_remove(&lmv_proc_dir);
435			obd->obd_proc_private = NULL;
436		}
437	}
438	return 0;
439}
440
441static void lmv_del_target(struct lmv_obd *lmv, int index)
442{
443	if (lmv->tgts[index] == NULL)
444		return;
445
446	OBD_FREE_PTR(lmv->tgts[index]);
447	lmv->tgts[index] = NULL;
448	return;
449}
450
451static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
452			   __u32 index, int gen)
453{
454	struct lmv_obd      *lmv = &obd->u.lmv;
455	struct lmv_tgt_desc *tgt;
456	int		  rc = 0;
457
458	CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
459
460	lmv_init_lock(lmv);
461
462	if (lmv->desc.ld_tgt_count == 0) {
463		struct obd_device *mdc_obd;
464
465		mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
466						&obd->obd_uuid);
467		if (!mdc_obd) {
468			lmv_init_unlock(lmv);
469			CERROR("%s: Target %s not attached: rc = %d\n",
470			       obd->obd_name, uuidp->uuid, -EINVAL);
471			return -EINVAL;
472		}
473	}
474
475	if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
476		tgt = lmv->tgts[index];
477		CERROR("%s: UUID %s already assigned at LOV target index %d:"
478		       " rc = %d\n", obd->obd_name,
479		       obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
480		lmv_init_unlock(lmv);
481		return -EEXIST;
482	}
483
484	if (index >= lmv->tgts_size) {
485		/* We need to reallocate the lmv target array. */
486		struct lmv_tgt_desc **newtgts, **old = NULL;
487		__u32 newsize = 1;
488		__u32 oldsize = 0;
489
490		while (newsize < index + 1)
491			newsize = newsize << 1;
492		OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
493		if (newtgts == NULL) {
494			lmv_init_unlock(lmv);
495			return -ENOMEM;
496		}
497
498		if (lmv->tgts_size) {
499			memcpy(newtgts, lmv->tgts,
500			       sizeof(*newtgts) * lmv->tgts_size);
501			old = lmv->tgts;
502			oldsize = lmv->tgts_size;
503		}
504
505		lmv->tgts = newtgts;
506		lmv->tgts_size = newsize;
507		smp_rmb();
508		if (old)
509			OBD_FREE(old, sizeof(*old) * oldsize);
510
511		CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts,
512		       lmv->tgts_size);
513	}
514
515	OBD_ALLOC_PTR(tgt);
516	if (!tgt) {
517		lmv_init_unlock(lmv);
518		return -ENOMEM;
519	}
520
521	mutex_init(&tgt->ltd_fid_mutex);
522	tgt->ltd_idx = index;
523	tgt->ltd_uuid = *uuidp;
524	tgt->ltd_active = 0;
525	lmv->tgts[index] = tgt;
526	if (index >= lmv->desc.ld_tgt_count)
527		lmv->desc.ld_tgt_count = index + 1;
528
529	if (lmv->connected) {
530		rc = lmv_connect_mdc(obd, tgt);
531		if (rc) {
532			spin_lock(&lmv->lmv_lock);
533			lmv->desc.ld_tgt_count--;
534			memset(tgt, 0, sizeof(*tgt));
535			spin_unlock(&lmv->lmv_lock);
536		} else {
537			int easize = sizeof(struct lmv_stripe_md) +
538				lmv->desc.ld_tgt_count * sizeof(struct lu_fid);
539			lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0);
540		}
541	}
542
543	lmv_init_unlock(lmv);
544	return rc;
545}
546
547int lmv_check_connect(struct obd_device *obd)
548{
549	struct lmv_obd       *lmv = &obd->u.lmv;
550	struct lmv_tgt_desc  *tgt;
551	int		   i;
552	int		   rc;
553	int		   easize;
554
555	if (lmv->connected)
556		return 0;
557
558	lmv_init_lock(lmv);
559	if (lmv->connected) {
560		lmv_init_unlock(lmv);
561		return 0;
562	}
563
564	if (lmv->desc.ld_tgt_count == 0) {
565		lmv_init_unlock(lmv);
566		CERROR("%s: no targets configured.\n", obd->obd_name);
567		return -EINVAL;
568	}
569
570	CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
571	       lmv->cluuid.uuid, obd->obd_name);
572
573	LASSERT(lmv->tgts != NULL);
574
575	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
576		tgt = lmv->tgts[i];
577		if (tgt == NULL)
578			continue;
579		rc = lmv_connect_mdc(obd, tgt);
580		if (rc)
581			goto out_disc;
582	}
583
584	lmv_set_timeouts(obd);
585	class_export_put(lmv->exp);
586	lmv->connected = 1;
587	easize = lmv_get_easize(lmv);
588	lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0);
589	lmv_init_unlock(lmv);
590	return 0;
591
592 out_disc:
593	while (i-- > 0) {
594		int rc2;
595		tgt = lmv->tgts[i];
596		if (tgt == NULL)
597			continue;
598		tgt->ltd_active = 0;
599		if (tgt->ltd_exp) {
600			--lmv->desc.ld_active_tgt_count;
601			rc2 = obd_disconnect(tgt->ltd_exp);
602			if (rc2) {
603				CERROR("LMV target %s disconnect on "
604				       "MDC idx %d: error %d\n",
605				       tgt->ltd_uuid.uuid, i, rc2);
606			}
607		}
608	}
609	class_disconnect(lmv->exp);
610	lmv_init_unlock(lmv);
611	return rc;
612}
613
614static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
615{
616	struct proc_dir_entry  *lmv_proc_dir;
617	struct lmv_obd	 *lmv = &obd->u.lmv;
618	struct obd_device      *mdc_obd;
619	int		     rc;
620
621	LASSERT(tgt != NULL);
622	LASSERT(obd != NULL);
623
624	mdc_obd = class_exp2obd(tgt->ltd_exp);
625
626	if (mdc_obd) {
627		mdc_obd->obd_force = obd->obd_force;
628		mdc_obd->obd_fail = obd->obd_fail;
629		mdc_obd->obd_no_recov = obd->obd_no_recov;
630	}
631
632	lmv_proc_dir = obd->obd_proc_private;
633	if (lmv_proc_dir)
634		lprocfs_remove_proc_entry(mdc_obd->obd_name, lmv_proc_dir);
635
636	rc = obd_fid_fini(tgt->ltd_exp->exp_obd);
637	if (rc)
638		CERROR("Can't finalize fids factory\n");
639
640	CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n",
641	       tgt->ltd_exp->exp_obd->obd_name,
642	       tgt->ltd_exp->exp_obd->obd_uuid.uuid);
643
644	obd_register_observer(tgt->ltd_exp->exp_obd, NULL);
645	rc = obd_disconnect(tgt->ltd_exp);
646	if (rc) {
647		if (tgt->ltd_active) {
648			CERROR("Target %s disconnect error %d\n",
649			       tgt->ltd_uuid.uuid, rc);
650		}
651	}
652
653	lmv_activate_target(lmv, tgt, 0);
654	tgt->ltd_exp = NULL;
655	return 0;
656}
657
658static int lmv_disconnect(struct obd_export *exp)
659{
660	struct obd_device     *obd = class_exp2obd(exp);
661	struct lmv_obd	*lmv = &obd->u.lmv;
662	int		    rc;
663	int		    i;
664
665	if (!lmv->tgts)
666		goto out_local;
667
668	/*
669	 * Only disconnect the underlying layers on the final disconnect.
670	 */
671	lmv->refcount--;
672	if (lmv->refcount != 0)
673		goto out_local;
674
675	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
676		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
677			continue;
678
679		lmv_disconnect_mdc(obd, lmv->tgts[i]);
680	}
681
682	if (obd->obd_proc_private)
683		lprocfs_remove((struct proc_dir_entry **)&obd->obd_proc_private);
684	else
685		CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n",
686		       obd->obd_type->typ_name, obd->obd_name);
687
688out_local:
689	/*
690	 * This is the case when no real connection is established by
691	 * lmv_check_connect().
692	 */
693	if (!lmv->connected)
694		class_export_put(exp);
695	rc = class_disconnect(exp);
696	if (lmv->refcount == 0)
697		lmv->connected = 0;
698	return rc;
699}
700
701static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg)
702{
703	struct obd_device	*obddev = class_exp2obd(exp);
704	struct lmv_obd		*lmv = &obddev->u.lmv;
705	struct getinfo_fid2path *gf;
706	struct lmv_tgt_desc     *tgt;
707	struct getinfo_fid2path *remote_gf = NULL;
708	int			remote_gf_size = 0;
709	int			rc;
710
711	gf = (struct getinfo_fid2path *)karg;
712	tgt = lmv_find_target(lmv, &gf->gf_fid);
713	if (IS_ERR(tgt))
714		return PTR_ERR(tgt);
715
716repeat_fid2path:
717	rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg);
718	if (rc != 0 && rc != -EREMOTE)
719		goto out_fid2path;
720
721	/* If remote_gf != NULL, it means just building the
722	 * path on the remote MDT, copy this path segment to gf */
723	if (remote_gf != NULL) {
724		struct getinfo_fid2path *ori_gf;
725		char *ptr;
726
727		ori_gf = (struct getinfo_fid2path *)karg;
728		if (strlen(ori_gf->gf_path) +
729		    strlen(gf->gf_path) > ori_gf->gf_pathlen) {
730			rc = -EOVERFLOW;
731			goto out_fid2path;
732		}
733
734		ptr = ori_gf->gf_path;
735
736		memmove(ptr + strlen(gf->gf_path) + 1, ptr,
737			strlen(ori_gf->gf_path));
738
739		strncpy(ptr, gf->gf_path, strlen(gf->gf_path));
740		ptr += strlen(gf->gf_path);
741		*ptr = '/';
742	}
743
744	CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n",
745	       tgt->ltd_exp->exp_obd->obd_name,
746	       gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno,
747	       gf->gf_linkno);
748
749	if (rc == 0)
750		goto out_fid2path;
751
752	/* sigh, has to go to another MDT to do path building further */
753	if (remote_gf == NULL) {
754		remote_gf_size = sizeof(*remote_gf) + PATH_MAX;
755		OBD_ALLOC(remote_gf, remote_gf_size);
756		if (remote_gf == NULL) {
757			rc = -ENOMEM;
758			goto out_fid2path;
759		}
760		remote_gf->gf_pathlen = PATH_MAX;
761	}
762
763	if (!fid_is_sane(&gf->gf_fid)) {
764		CERROR("%s: invalid FID "DFID": rc = %d\n",
765		       tgt->ltd_exp->exp_obd->obd_name,
766		       PFID(&gf->gf_fid), -EINVAL);
767		rc = -EINVAL;
768		goto out_fid2path;
769	}
770
771	tgt = lmv_find_target(lmv, &gf->gf_fid);
772	if (IS_ERR(tgt)) {
773		rc = -EINVAL;
774		goto out_fid2path;
775	}
776
777	remote_gf->gf_fid = gf->gf_fid;
778	remote_gf->gf_recno = -1;
779	remote_gf->gf_linkno = -1;
780	memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen);
781	gf = remote_gf;
782	goto repeat_fid2path;
783
784out_fid2path:
785	if (remote_gf != NULL)
786		OBD_FREE(remote_gf, remote_gf_size);
787	return rc;
788}
789
790static int lmv_hsm_req_count(struct lmv_obd *lmv,
791			     const struct hsm_user_request *hur,
792			     const struct lmv_tgt_desc *tgt_mds)
793{
794	int			i, nr = 0;
795	struct lmv_tgt_desc    *curr_tgt;
796
797	/* count how many requests must be sent to the given target */
798	for (i = 0; i < hur->hur_request.hr_itemcount; i++) {
799		curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid);
800		if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid))
801			nr++;
802	}
803	return nr;
804}
805
806static void lmv_hsm_req_build(struct lmv_obd *lmv,
807			      struct hsm_user_request *hur_in,
808			      const struct lmv_tgt_desc *tgt_mds,
809			      struct hsm_user_request *hur_out)
810{
811	int			i, nr_out;
812	struct lmv_tgt_desc    *curr_tgt;
813
814	/* build the hsm_user_request for the given target */
815	hur_out->hur_request = hur_in->hur_request;
816	nr_out = 0;
817	for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) {
818		curr_tgt = lmv_find_target(lmv,
819					&hur_in->hur_user_item[i].hui_fid);
820		if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) {
821			hur_out->hur_user_item[nr_out] =
822				hur_in->hur_user_item[i];
823			nr_out++;
824		}
825	}
826	hur_out->hur_request.hr_itemcount = nr_out;
827	memcpy(hur_data(hur_out), hur_data(hur_in),
828	       hur_in->hur_request.hr_data_len);
829}
830
831static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, unsigned int cmd, int len,
832				 struct lustre_kernelcomm *lk, void *uarg)
833{
834	int	i, rc = 0;
835
836	/* unregister request (call from llapi_hsm_copytool_fini) */
837	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
838		/* best effort: try to clean as much as possible
839		 * (continue on error) */
840		obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg);
841	}
842
843	/* Whatever the result, remove copytool from kuc groups.
844	 * Unreached coordinators will get EPIPE on next requests
845	 * and will unregister automatically.
846	 */
847	rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
848	return rc;
849}
850
851static int lmv_hsm_ct_register(struct lmv_obd *lmv, unsigned int cmd, int len,
852			       struct lustre_kernelcomm *lk, void *uarg)
853{
854	struct file	*filp;
855	int		 i, j, err;
856	int		 rc = 0;
857	bool		 any_set = false;
858
859	/* All or nothing: try to register to all MDS.
860	 * In case of failure, unregister from previous MDS,
861	 * except if it because of inactive target. */
862	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
863		err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp,
864				   len, lk, uarg);
865		if (err) {
866			if (lmv->tgts[i]->ltd_active) {
867				/* permanent error */
868				CERROR("error: iocontrol MDC %s on MDT"
869				       "idx %d cmd %x: err = %d\n",
870					lmv->tgts[i]->ltd_uuid.uuid,
871					i, cmd, err);
872				rc = err;
873				lk->lk_flags |= LK_FLG_STOP;
874				/* unregister from previous MDS */
875				for (j = 0; j < i; j++)
876					obd_iocontrol(cmd,
877						  lmv->tgts[j]->ltd_exp,
878						  len, lk, uarg);
879				return rc;
880			}
881			/* else: transient error.
882			 * kuc will register to the missing MDT
883			 * when it is back */
884		} else {
885			any_set = true;
886		}
887	}
888
889	if (!any_set)
890		/* no registration done: return error */
891		return -ENOTCONN;
892
893	/* at least one registration done, with no failure */
894	filp = fget(lk->lk_wfd);
895	if (filp == NULL) {
896		return -EBADF;
897	}
898	rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group, lk->lk_data);
899	if (rc != 0 && filp != NULL)
900		fput(filp);
901	return rc;
902}
903
904
905
906
907static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
908			 int len, void *karg, void *uarg)
909{
910	struct obd_device    *obddev = class_exp2obd(exp);
911	struct lmv_obd       *lmv = &obddev->u.lmv;
912	int		   i = 0;
913	int		   rc = 0;
914	int		   set = 0;
915	int		   count = lmv->desc.ld_tgt_count;
916
917	if (count == 0)
918		return -ENOTTY;
919
920	switch (cmd) {
921	case IOC_OBD_STATFS: {
922		struct obd_ioctl_data *data = karg;
923		struct obd_device *mdc_obd;
924		struct obd_statfs stat_buf = {0};
925		__u32 index;
926
927		memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
928		if ((index >= count))
929			return -ENODEV;
930
931		if (lmv->tgts[index] == NULL ||
932		    lmv->tgts[index]->ltd_active == 0)
933			return -ENODATA;
934
935		mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp);
936		if (!mdc_obd)
937			return -EINVAL;
938
939		/* copy UUID */
940		if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd),
941				     min((int) data->ioc_plen2,
942					 (int) sizeof(struct obd_uuid))))
943			return -EFAULT;
944
945		rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf,
946				cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
947				0);
948		if (rc)
949			return rc;
950		if (copy_to_user(data->ioc_pbuf1, &stat_buf,
951				     min((int) data->ioc_plen1,
952					 (int) sizeof(stat_buf))))
953			return -EFAULT;
954		break;
955	}
956	case OBD_IOC_QUOTACTL: {
957		struct if_quotactl *qctl = karg;
958		struct lmv_tgt_desc *tgt = NULL;
959		struct obd_quotactl *oqctl;
960
961		if (qctl->qc_valid == QC_MDTIDX) {
962			if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
963				return -EINVAL;
964
965			tgt = lmv->tgts[qctl->qc_idx];
966			if (tgt == NULL || tgt->ltd_exp == NULL)
967				return -EINVAL;
968		} else if (qctl->qc_valid == QC_UUID) {
969			for (i = 0; i < count; i++) {
970				tgt = lmv->tgts[i];
971				if (tgt == NULL)
972					continue;
973				if (!obd_uuid_equals(&tgt->ltd_uuid,
974						     &qctl->obd_uuid))
975					continue;
976
977				if (tgt->ltd_exp == NULL)
978					return -EINVAL;
979
980				break;
981			}
982		} else {
983			return -EINVAL;
984		}
985
986		if (i >= count)
987			return -EAGAIN;
988
989		LASSERT(tgt && tgt->ltd_exp);
990		OBD_ALLOC_PTR(oqctl);
991		if (!oqctl)
992			return -ENOMEM;
993
994		QCTL_COPY(oqctl, qctl);
995		rc = obd_quotactl(tgt->ltd_exp, oqctl);
996		if (rc == 0) {
997			QCTL_COPY(qctl, oqctl);
998			qctl->qc_valid = QC_MDTIDX;
999			qctl->obd_uuid = tgt->ltd_uuid;
1000		}
1001		OBD_FREE_PTR(oqctl);
1002		break;
1003	}
1004	case OBD_IOC_CHANGELOG_SEND:
1005	case OBD_IOC_CHANGELOG_CLEAR: {
1006		struct ioc_changelog *icc = karg;
1007
1008		if (icc->icc_mdtindex >= count)
1009			return -ENODEV;
1010
1011		if (lmv->tgts[icc->icc_mdtindex] == NULL ||
1012		    lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL ||
1013		    lmv->tgts[icc->icc_mdtindex]->ltd_active == 0)
1014			return -ENODEV;
1015		rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp,
1016				   sizeof(*icc), icc, NULL);
1017		break;
1018	}
1019	case LL_IOC_GET_CONNECT_FLAGS: {
1020		if (lmv->tgts[0] == NULL)
1021			return -ENODATA;
1022		rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg);
1023		break;
1024	}
1025	case OBD_IOC_FID2PATH: {
1026		rc = lmv_fid2path(exp, len, karg, uarg);
1027		break;
1028	}
1029	case LL_IOC_HSM_STATE_GET:
1030	case LL_IOC_HSM_STATE_SET:
1031	case LL_IOC_HSM_ACTION: {
1032		struct md_op_data	*op_data = karg;
1033		struct lmv_tgt_desc	*tgt;
1034
1035		tgt = lmv_find_target(lmv, &op_data->op_fid1);
1036		if (IS_ERR(tgt))
1037				return PTR_ERR(tgt);
1038
1039		if (tgt->ltd_exp == NULL)
1040				return -EINVAL;
1041
1042		rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1043		break;
1044	}
1045	case LL_IOC_HSM_PROGRESS: {
1046		const struct hsm_progress_kernel *hpk = karg;
1047		struct lmv_tgt_desc	*tgt;
1048
1049		tgt = lmv_find_target(lmv, &hpk->hpk_fid);
1050		if (IS_ERR(tgt))
1051			return PTR_ERR(tgt);
1052		rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1053		break;
1054	}
1055	case LL_IOC_HSM_REQUEST: {
1056		struct hsm_user_request *hur = karg;
1057		struct lmv_tgt_desc	*tgt;
1058		unsigned int reqcount = hur->hur_request.hr_itemcount;
1059
1060		if (reqcount == 0)
1061			return 0;
1062
1063		/* if the request is about a single fid
1064		 * or if there is a single MDS, no need to split
1065		 * the request. */
1066		if (reqcount == 1 || count == 1) {
1067			tgt = lmv_find_target(lmv,
1068					      &hur->hur_user_item[0].hui_fid);
1069			if (IS_ERR(tgt))
1070				return PTR_ERR(tgt);
1071			rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1072		} else {
1073			/* split fid list to their respective MDS */
1074			for (i = 0; i < count; i++) {
1075				unsigned int		nr, reqlen;
1076				int			rc1;
1077				struct hsm_user_request *req;
1078
1079				nr = lmv_hsm_req_count(lmv, hur, lmv->tgts[i]);
1080				if (nr == 0) /* nothing for this MDS */
1081					continue;
1082
1083				/* build a request with fids for this MDS */
1084				reqlen = offsetof(typeof(*hur),
1085						  hur_user_item[nr])
1086					 + hur->hur_request.hr_data_len;
1087				OBD_ALLOC_LARGE(req, reqlen);
1088				if (req == NULL)
1089					return -ENOMEM;
1090
1091				lmv_hsm_req_build(lmv, hur, lmv->tgts[i], req);
1092
1093				rc1 = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp,
1094						    reqlen, req, uarg);
1095				if (rc1 != 0 && rc == 0)
1096					rc = rc1;
1097				OBD_FREE_LARGE(req, reqlen);
1098			}
1099		}
1100		break;
1101	}
1102	case LL_IOC_LOV_SWAP_LAYOUTS: {
1103		struct md_op_data	*op_data = karg;
1104		struct lmv_tgt_desc	*tgt1, *tgt2;
1105
1106		tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
1107		if (IS_ERR(tgt1))
1108			return PTR_ERR(tgt1);
1109
1110		tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
1111		if (IS_ERR(tgt2))
1112			return PTR_ERR(tgt2);
1113
1114		if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL))
1115			return -EINVAL;
1116
1117		/* only files on same MDT can have their layouts swapped */
1118		if (tgt1->ltd_idx != tgt2->ltd_idx)
1119			return -EPERM;
1120
1121		rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
1122		break;
1123	}
1124	case LL_IOC_HSM_CT_START: {
1125		struct lustre_kernelcomm *lk = karg;
1126		if (lk->lk_flags & LK_FLG_STOP)
1127			rc = lmv_hsm_ct_unregister(lmv, cmd, len, lk, uarg);
1128		else
1129			rc = lmv_hsm_ct_register(lmv, cmd, len, lk, uarg);
1130		break;
1131	}
1132	default:
1133		for (i = 0; i < count; i++) {
1134			struct obd_device *mdc_obd;
1135			int err;
1136
1137			if (lmv->tgts[i] == NULL ||
1138			    lmv->tgts[i]->ltd_exp == NULL)
1139				continue;
1140			/* ll_umount_begin() sets force flag but for lmv, not
1141			 * mdc. Let's pass it through */
1142			mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp);
1143			mdc_obd->obd_force = obddev->obd_force;
1144			err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len,
1145					    karg, uarg);
1146			if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
1147				return err;
1148			} else if (err) {
1149				if (lmv->tgts[i]->ltd_active) {
1150					CERROR("error: iocontrol MDC %s on MDT"
1151					       "idx %d cmd %x: err = %d\n",
1152						lmv->tgts[i]->ltd_uuid.uuid,
1153						i, cmd, err);
1154					if (!rc)
1155						rc = err;
1156				}
1157			} else
1158				set = 1;
1159		}
1160		if (!set && !rc)
1161			rc = -EIO;
1162	}
1163	return rc;
1164}
1165
1166#if 0
/*
 * Name-based placement: add up the first @len bytes of @name and reduce
 * the sum modulo @count to obtain a target index.
 */
static int lmv_all_chars_policy(int count, const char *name,
				int len)
{
	unsigned int sum = 0;
	int remaining;

	/* Addition is order-independent, so walk from the tail like before. */
	for (remaining = len; remaining > 0; remaining--)
		sum += name[remaining - 1];

	return sum % count;
}
1177
1178static int lmv_nid_policy(struct lmv_obd *lmv)
1179{
1180	struct obd_import *imp;
1181	__u32	      id;
1182
1183	/*
1184	 * XXX: To get nid we assume that underlying obd device is mdc.
1185	 */
1186	imp = class_exp2cliimp(lmv->tgts[0].ltd_exp);
1187	id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32);
1188	return id % lmv->desc.ld_tgt_count;
1189}
1190
1191static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
1192			  enum placement_policy placement)
1193{
1194	switch (placement) {
1195	case PLACEMENT_CHAR_POLICY:
1196		return lmv_all_chars_policy(lmv->desc.ld_tgt_count,
1197					    op_data->op_name,
1198					    op_data->op_namelen);
1199	case PLACEMENT_NID_POLICY:
1200		return lmv_nid_policy(lmv);
1201
1202	default:
1203		break;
1204	}
1205
1206	CERROR("Unsupported placement policy %x\n", placement);
1207	return -EINVAL;
1208}
1209#endif
1210
1211/**
1212 * This is _inode_ placement policy function (not name).
1213 */
1214static int lmv_placement_policy(struct obd_device *obd,
1215				struct md_op_data *op_data, u32 *mds)
1216{
1217	struct lmv_obd	  *lmv = &obd->u.lmv;
1218
1219	LASSERT(mds != NULL);
1220
1221	if (lmv->desc.ld_tgt_count == 1) {
1222		*mds = 0;
1223		return 0;
1224	}
1225
1226	/**
1227	 * If stripe_offset is provided during setdirstripe
1228	 * (setdirstripe -i xx), xx MDS will be chosen.
1229	 */
1230	if (op_data->op_cli_flags & CLI_SET_MEA) {
1231		struct lmv_user_md *lum;
1232
1233		lum = (struct lmv_user_md *)op_data->op_data;
1234		if (lum->lum_type == LMV_STRIPE_TYPE &&
1235		    lum->lum_stripe_offset != -1) {
1236			if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) {
1237				CERROR("%s: Stripe_offset %d > MDT count %d:"
1238				       " rc = %d\n", obd->obd_name,
1239				       lum->lum_stripe_offset,
1240				       lmv->desc.ld_tgt_count, -ERANGE);
1241				return -ERANGE;
1242			}
1243			*mds = lum->lum_stripe_offset;
1244			return 0;
1245		}
1246	}
1247
1248	/* Allocate new fid on target according to operation type and parent
1249	 * home mds. */
1250	*mds = op_data->op_mds;
1251	return 0;
1252}
1253
1254int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds)
1255{
1256	struct lmv_tgt_desc	*tgt;
1257	int			 rc;
1258
1259	tgt = lmv_get_target(lmv, mds);
1260	if (IS_ERR(tgt))
1261		return PTR_ERR(tgt);
1262
1263	/*
1264	 * New seq alloc and FLD setup should be atomic. Otherwise we may find
1265	 * on server that seq in new allocated fid is not yet known.
1266	 */
1267	mutex_lock(&tgt->ltd_fid_mutex);
1268
1269	if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) {
1270		rc = -ENODEV;
1271		goto out;
1272	}
1273
1274	/*
1275	 * Asking underlaying tgt layer to allocate new fid.
1276	 */
1277	rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL);
1278	if (rc > 0) {
1279		LASSERT(fid_is_sane(fid));
1280		rc = 0;
1281	}
1282
1283out:
1284	mutex_unlock(&tgt->ltd_fid_mutex);
1285	return rc;
1286}
1287
1288int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
1289		  struct md_op_data *op_data)
1290{
1291	struct obd_device     *obd = class_exp2obd(exp);
1292	struct lmv_obd	*lmv = &obd->u.lmv;
1293	u32		       mds = 0;
1294	int		    rc;
1295
1296	LASSERT(op_data != NULL);
1297	LASSERT(fid != NULL);
1298
1299	rc = lmv_placement_policy(obd, op_data, &mds);
1300	if (rc) {
1301		CERROR("Can't get target for allocating fid, "
1302		       "rc %d\n", rc);
1303		return rc;
1304	}
1305
1306	rc = __lmv_fid_alloc(lmv, fid, mds);
1307	if (rc) {
1308		CERROR("Can't alloc new fid, rc %d\n", rc);
1309		return rc;
1310	}
1311
1312	return rc;
1313}
1314
1315static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
1316{
1317	struct lmv_obd	     *lmv = &obd->u.lmv;
1318	struct lprocfs_static_vars  lvars;
1319	struct lmv_desc	    *desc;
1320	int			 rc;
1321
1322	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
1323		CERROR("LMV setup requires a descriptor\n");
1324		return -EINVAL;
1325	}
1326
1327	desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1);
1328	if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
1329		CERROR("Lmv descriptor size wrong: %d > %d\n",
1330		       (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
1331		return -EINVAL;
1332	}
1333
1334	OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32);
1335	if (lmv->tgts == NULL)
1336		return -ENOMEM;
1337	lmv->tgts_size = 32;
1338
1339	obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
1340	lmv->desc.ld_tgt_count = 0;
1341	lmv->desc.ld_active_tgt_count = 0;
1342	lmv->max_cookiesize = 0;
1343	lmv->max_def_easize = 0;
1344	lmv->max_easize = 0;
1345	lmv->lmv_placement = PLACEMENT_CHAR_POLICY;
1346
1347	spin_lock_init(&lmv->lmv_lock);
1348	mutex_init(&lmv->init_mutex);
1349
1350	lprocfs_lmv_init_vars(&lvars);
1351
1352	lprocfs_obd_setup(obd, lvars.obd_vars);
1353#if defined (CONFIG_PROC_FS)
1354	{
1355		rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
1356					0444, &lmv_proc_target_fops, obd);
1357		if (rc)
1358			CWARN("%s: error adding LMV target_obd file: rc = %d\n",
1359			       obd->obd_name, rc);
1360       }
1361#endif
1362	rc = fld_client_init(&lmv->lmv_fld, obd->obd_name,
1363			     LUSTRE_CLI_FLD_HASH_DHT);
1364	if (rc) {
1365		CERROR("Can't init FLD, err %d\n", rc);
1366		goto out;
1367	}
1368
1369	return 0;
1370
1371out:
1372	return rc;
1373}
1374
1375static int lmv_cleanup(struct obd_device *obd)
1376{
1377	struct lmv_obd   *lmv = &obd->u.lmv;
1378
1379	fld_client_fini(&lmv->lmv_fld);
1380	if (lmv->tgts != NULL) {
1381		int i;
1382		for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1383			if (lmv->tgts[i] == NULL)
1384				continue;
1385			lmv_del_target(lmv, i);
1386		}
1387		OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size);
1388		lmv->tgts_size = 0;
1389	}
1390	return 0;
1391}
1392
1393static int lmv_process_config(struct obd_device *obd, u32 len, void *buf)
1394{
1395	struct lustre_cfg	*lcfg = buf;
1396	struct obd_uuid		obd_uuid;
1397	int			gen;
1398	__u32			index;
1399	int			rc;
1400
1401	switch (lcfg->lcfg_command) {
1402	case LCFG_ADD_MDC:
1403		/* modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID
1404		 * 2:0  3:1  4:lustre-MDT0000-mdc_UUID */
1405		if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) {
1406			rc = -EINVAL;
1407			goto out;
1408		}
1409
1410		obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
1411
1412		if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) {
1413			rc = -EINVAL;
1414			goto out;
1415		}
1416		if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) {
1417			rc = -EINVAL;
1418			goto out;
1419		}
1420		rc = lmv_add_target(obd, &obd_uuid, index, gen);
1421		goto out;
1422	default:
1423		CERROR("Unknown command: %d\n", lcfg->lcfg_command);
1424		rc = -EINVAL;
1425		goto out;
1426	}
1427out:
1428	return rc;
1429}
1430
1431static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
1432		      struct obd_statfs *osfs, __u64 max_age, __u32 flags)
1433{
1434	struct obd_device     *obd = class_exp2obd(exp);
1435	struct lmv_obd	*lmv = &obd->u.lmv;
1436	struct obd_statfs     *temp;
1437	int		    rc = 0;
1438	int		    i;
1439
1440	rc = lmv_check_connect(obd);
1441	if (rc)
1442		return rc;
1443
1444	OBD_ALLOC(temp, sizeof(*temp));
1445	if (temp == NULL)
1446		return -ENOMEM;
1447
1448	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1449		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
1450			continue;
1451
1452		rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp,
1453				max_age, flags);
1454		if (rc) {
1455			CERROR("can't stat MDS #%d (%s), error %d\n", i,
1456			       lmv->tgts[i]->ltd_exp->exp_obd->obd_name,
1457			       rc);
1458			goto out_free_temp;
1459		}
1460
1461		if (i == 0) {
1462			*osfs = *temp;
1463			/* If the statfs is from mount, it will needs
1464			 * retrieve necessary information from MDT0.
1465			 * i.e. mount does not need the merged osfs
1466			 * from all of MDT.
1467			 * And also clients can be mounted as long as
1468			 * MDT0 is in service*/
1469			if (flags & OBD_STATFS_FOR_MDT0)
1470				goto out_free_temp;
1471		} else {
1472			osfs->os_bavail += temp->os_bavail;
1473			osfs->os_blocks += temp->os_blocks;
1474			osfs->os_ffree += temp->os_ffree;
1475			osfs->os_files += temp->os_files;
1476		}
1477	}
1478
1479out_free_temp:
1480	OBD_FREE(temp, sizeof(*temp));
1481	return rc;
1482}
1483
1484static int lmv_getstatus(struct obd_export *exp,
1485			 struct lu_fid *fid,
1486			 struct obd_capa **pc)
1487{
1488	struct obd_device    *obd = exp->exp_obd;
1489	struct lmv_obd       *lmv = &obd->u.lmv;
1490	int		   rc;
1491
1492	rc = lmv_check_connect(obd);
1493	if (rc)
1494		return rc;
1495
1496	rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid, pc);
1497	return rc;
1498}
1499
1500static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
1501			struct obd_capa *oc, u64 valid, const char *name,
1502			const char *input, int input_size, int output_size,
1503			int flags, struct ptlrpc_request **request)
1504{
1505	struct obd_device      *obd = exp->exp_obd;
1506	struct lmv_obd	 *lmv = &obd->u.lmv;
1507	struct lmv_tgt_desc    *tgt;
1508	int		     rc;
1509
1510	rc = lmv_check_connect(obd);
1511	if (rc)
1512		return rc;
1513
1514	tgt = lmv_find_target(lmv, fid);
1515	if (IS_ERR(tgt))
1516		return PTR_ERR(tgt);
1517
1518	rc = md_getxattr(tgt->ltd_exp, fid, oc, valid, name, input,
1519			 input_size, output_size, flags, request);
1520
1521	return rc;
1522}
1523
1524static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
1525			struct obd_capa *oc, u64 valid, const char *name,
1526			const char *input, int input_size, int output_size,
1527			int flags, __u32 suppgid,
1528			struct ptlrpc_request **request)
1529{
1530	struct obd_device      *obd = exp->exp_obd;
1531	struct lmv_obd	 *lmv = &obd->u.lmv;
1532	struct lmv_tgt_desc    *tgt;
1533	int		     rc;
1534
1535	rc = lmv_check_connect(obd);
1536	if (rc)
1537		return rc;
1538
1539	tgt = lmv_find_target(lmv, fid);
1540	if (IS_ERR(tgt))
1541		return PTR_ERR(tgt);
1542
1543	rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input,
1544			 input_size, output_size, flags, suppgid,
1545			 request);
1546
1547	return rc;
1548}
1549
1550static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
1551		       struct ptlrpc_request **request)
1552{
1553	struct obd_device       *obd = exp->exp_obd;
1554	struct lmv_obd	  *lmv = &obd->u.lmv;
1555	struct lmv_tgt_desc     *tgt;
1556	int		      rc;
1557
1558	rc = lmv_check_connect(obd);
1559	if (rc)
1560		return rc;
1561
1562	tgt = lmv_find_target(lmv, &op_data->op_fid1);
1563	if (IS_ERR(tgt))
1564		return PTR_ERR(tgt);
1565
1566	if (op_data->op_flags & MF_GET_MDT_IDX) {
1567		op_data->op_mds = tgt->ltd_idx;
1568		return 0;
1569	}
1570
1571	rc = md_getattr(tgt->ltd_exp, op_data, request);
1572
1573	return rc;
1574}
1575
1576static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
1577{
1578	struct obd_device   *obd = exp->exp_obd;
1579	struct lmv_obd      *lmv = &obd->u.lmv;
1580	int		  i;
1581	int		  rc;
1582
1583	rc = lmv_check_connect(obd);
1584	if (rc)
1585		return rc;
1586
1587	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
1588
1589	/*
1590	 * With DNE every object can have two locks in different namespaces:
1591	 * lookup lock in space of MDT storing direntry and update/open lock in
1592	 * space of MDT storing inode.
1593	 */
1594	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1595		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
1596			continue;
1597		md_null_inode(lmv->tgts[i]->ltd_exp, fid);
1598	}
1599
1600	return 0;
1601}
1602
1603static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
1604			   ldlm_iterator_t it, void *data)
1605{
1606	struct obd_device   *obd = exp->exp_obd;
1607	struct lmv_obd      *lmv = &obd->u.lmv;
1608	int		  i;
1609	int		  rc;
1610
1611	rc = lmv_check_connect(obd);
1612	if (rc)
1613		return rc;
1614
1615	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
1616
1617	/*
1618	 * With DNE every object can have two locks in different namespaces:
1619	 * lookup lock in space of MDT storing direntry and update/open lock in
1620	 * space of MDT storing inode.
1621	 */
1622	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1623		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
1624			continue;
1625		rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data);
1626		if (rc)
1627			return rc;
1628	}
1629
1630	return rc;
1631}
1632
1633
1634static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
1635		     struct md_open_data *mod, struct ptlrpc_request **request)
1636{
1637	struct obd_device     *obd = exp->exp_obd;
1638	struct lmv_obd	*lmv = &obd->u.lmv;
1639	struct lmv_tgt_desc   *tgt;
1640	int		    rc;
1641
1642	rc = lmv_check_connect(obd);
1643	if (rc)
1644		return rc;
1645
1646	tgt = lmv_find_target(lmv, &op_data->op_fid1);
1647	if (IS_ERR(tgt))
1648		return PTR_ERR(tgt);
1649
1650	CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
1651	rc = md_close(tgt->ltd_exp, op_data, mod, request);
1652	return rc;
1653}
1654
1655struct lmv_tgt_desc
1656*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
1657		struct lu_fid *fid)
1658{
1659	struct lmv_tgt_desc *tgt;
1660
1661	tgt = lmv_find_target(lmv, fid);
1662	if (IS_ERR(tgt))
1663		return tgt;
1664
1665	op_data->op_mds = tgt->ltd_idx;
1666
1667	return tgt;
1668}
1669
1670int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
1671	       const void *data, int datalen, int mode, __u32 uid,
1672	       __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
1673	       struct ptlrpc_request **request)
1674{
1675	struct obd_device       *obd = exp->exp_obd;
1676	struct lmv_obd	  *lmv = &obd->u.lmv;
1677	struct lmv_tgt_desc     *tgt;
1678	int		      rc;
1679
1680	rc = lmv_check_connect(obd);
1681	if (rc)
1682		return rc;
1683
1684	if (!lmv->desc.ld_active_tgt_count)
1685		return -EIO;
1686
1687	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1688	if (IS_ERR(tgt))
1689		return PTR_ERR(tgt);
1690
1691	rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
1692	if (rc)
1693		return rc;
1694
1695	CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n",
1696	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
1697	       op_data->op_mds);
1698
1699	op_data->op_flags |= MF_MDC_CANCEL_FID1;
1700	rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
1701		       cap_effective, rdev, request);
1702
1703	if (rc == 0) {
1704		if (*request == NULL)
1705			return rc;
1706		CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2));
1707	}
1708	return rc;
1709}
1710
1711static int lmv_done_writing(struct obd_export *exp,
1712			    struct md_op_data *op_data,
1713			    struct md_open_data *mod)
1714{
1715	struct obd_device     *obd = exp->exp_obd;
1716	struct lmv_obd	*lmv = &obd->u.lmv;
1717	struct lmv_tgt_desc   *tgt;
1718	int		    rc;
1719
1720	rc = lmv_check_connect(obd);
1721	if (rc)
1722		return rc;
1723
1724	tgt = lmv_find_target(lmv, &op_data->op_fid1);
1725	if (IS_ERR(tgt))
1726		return PTR_ERR(tgt);
1727
1728	rc = md_done_writing(tgt->ltd_exp, op_data, mod);
1729	return rc;
1730}
1731
1732static int
1733lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
1734		   struct lookup_intent *it, struct md_op_data *op_data,
1735		   struct lustre_handle *lockh, void *lmm, int lmmsize,
1736		   __u64 extra_lock_flags)
1737{
1738	struct ptlrpc_request      *req = it->d.lustre.it_data;
1739	struct obd_device	  *obd = exp->exp_obd;
1740	struct lmv_obd	     *lmv = &obd->u.lmv;
1741	struct lustre_handle	plock;
1742	struct lmv_tgt_desc	*tgt;
1743	struct md_op_data	  *rdata;
1744	struct lu_fid	       fid1;
1745	struct mdt_body	    *body;
1746	int			 rc = 0;
1747	int			 pmode;
1748
1749	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1750	LASSERT(body != NULL);
1751
1752	if (!(body->valid & OBD_MD_MDS))
1753		return 0;
1754
1755	CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
1756	       LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
1757
1758	/*
1759	 * We got LOOKUP lock, but we really need attrs.
1760	 */
1761	pmode = it->d.lustre.it_lock_mode;
1762	LASSERT(pmode != 0);
1763	memcpy(&plock, lockh, sizeof(plock));
1764	it->d.lustre.it_lock_mode = 0;
1765	it->d.lustre.it_data = NULL;
1766	fid1 = body->fid1;
1767
1768	ptlrpc_req_finished(req);
1769
1770	tgt = lmv_find_target(lmv, &fid1);
1771	if (IS_ERR(tgt)) {
1772		rc = PTR_ERR(tgt);
1773		goto out;
1774	}
1775
1776	OBD_ALLOC_PTR(rdata);
1777	if (rdata == NULL) {
1778		rc = -ENOMEM;
1779		goto out;
1780	}
1781
1782	rdata->op_fid1 = fid1;
1783	rdata->op_bias = MDS_CROSS_REF;
1784
1785	rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh,
1786			lmm, lmmsize, NULL, extra_lock_flags);
1787	OBD_FREE_PTR(rdata);
1788out:
1789	ldlm_lock_decref(&plock, pmode);
1790	return rc;
1791}
1792
1793static int
1794lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
1795	    struct lookup_intent *it, struct md_op_data *op_data,
1796	    struct lustre_handle *lockh, void *lmm, int lmmsize,
1797	    struct ptlrpc_request **req, __u64 extra_lock_flags)
1798{
1799	struct obd_device	*obd = exp->exp_obd;
1800	struct lmv_obd	   *lmv = &obd->u.lmv;
1801	struct lmv_tgt_desc      *tgt;
1802	int		       rc;
1803
1804	rc = lmv_check_connect(obd);
1805	if (rc)
1806		return rc;
1807
1808	CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n",
1809	       LL_IT2STR(it), PFID(&op_data->op_fid1));
1810
1811	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1812	if (IS_ERR(tgt))
1813		return PTR_ERR(tgt);
1814
1815	CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
1816	       LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx);
1817
1818	rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh,
1819			lmm, lmmsize, req, extra_lock_flags);
1820
1821	if (rc == 0 && it && it->it_op == IT_OPEN) {
1822		rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh,
1823					lmm, lmmsize, extra_lock_flags);
1824	}
1825	return rc;
1826}
1827
1828static int
1829lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
1830		 struct ptlrpc_request **request)
1831{
1832	struct ptlrpc_request   *req = NULL;
1833	struct obd_device       *obd = exp->exp_obd;
1834	struct lmv_obd	  *lmv = &obd->u.lmv;
1835	struct lmv_tgt_desc     *tgt;
1836	struct mdt_body	 *body;
1837	int		      rc;
1838
1839	rc = lmv_check_connect(obd);
1840	if (rc)
1841		return rc;
1842
1843	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1844	if (IS_ERR(tgt))
1845		return PTR_ERR(tgt);
1846
1847	CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
1848	       op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
1849	       tgt->ltd_idx);
1850
1851	rc = md_getattr_name(tgt->ltd_exp, op_data, request);
1852	if (rc != 0)
1853		return rc;
1854
1855	body = req_capsule_server_get(&(*request)->rq_pill,
1856				      &RMF_MDT_BODY);
1857	LASSERT(body != NULL);
1858
1859	if (body->valid & OBD_MD_MDS) {
1860		struct lu_fid rid = body->fid1;
1861		CDEBUG(D_INODE, "Request attrs for "DFID"\n",
1862		       PFID(&rid));
1863
1864		tgt = lmv_find_target(lmv, &rid);
1865		if (IS_ERR(tgt)) {
1866			ptlrpc_req_finished(*request);
1867			return PTR_ERR(tgt);
1868		}
1869
1870		op_data->op_fid1 = rid;
1871		op_data->op_valid |= OBD_MD_FLCROSSREF;
1872		op_data->op_namelen = 0;
1873		op_data->op_name = NULL;
1874		rc = md_getattr_name(tgt->ltd_exp, op_data, &req);
1875		ptlrpc_req_finished(*request);
1876		*request = req;
1877	}
1878
1879	return rc;
1880}
1881
/*
 * Map one MF_MDC_CANCEL_FIDn flag value to the address of the matching
 * op_fidN member of @op_data; any other value yields NULL.
 *
 * Both arguments are parenthesized so that expression arguments (for
 * example "ptr + 1" or "flags & mask") expand with the intended
 * precedence.  @fl is evaluated more than once.
 */
#define md_op_data_fid(op_data, fl)			     \
	((fl) == MF_MDC_CANCEL_FID1 ? &(op_data)->op_fid1 : \
	 (fl) == MF_MDC_CANCEL_FID2 ? &(op_data)->op_fid2 : \
	 (fl) == MF_MDC_CANCEL_FID3 ? &(op_data)->op_fid3 : \
	 (fl) == MF_MDC_CANCEL_FID4 ? &(op_data)->op_fid4 : \
	 NULL)
1888
1889static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
1890			    int op_tgt, ldlm_mode_t mode, int bits, int flag)
1891{
1892	struct lu_fid	  *fid = md_op_data_fid(op_data, flag);
1893	struct obd_device      *obd = exp->exp_obd;
1894	struct lmv_obd	 *lmv = &obd->u.lmv;
1895	struct lmv_tgt_desc    *tgt;
1896	ldlm_policy_data_t      policy = {{0}};
1897	int		     rc = 0;
1898
1899	if (!fid_is_sane(fid))
1900		return 0;
1901
1902	tgt = lmv_find_target(lmv, fid);
1903	if (IS_ERR(tgt))
1904		return PTR_ERR(tgt);
1905
1906	if (tgt->ltd_idx != op_tgt) {
1907		CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
1908		policy.l_inodebits.bits = bits;
1909		rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
1910				      mode, LCF_ASYNC, NULL);
1911	} else {
1912		CDEBUG(D_INODE,
1913		       "EARLY_CANCEL skip operation target %d on "DFID"\n",
1914		       op_tgt, PFID(fid));
1915		op_data->op_flags |= flag;
1916		rc = 0;
1917	}
1918
1919	return rc;
1920}
1921
1922/*
1923 * llite passes fid of an target inode in op_data->op_fid1 and id of directory in
1924 * op_data->op_fid2
1925 */
1926static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
1927		    struct ptlrpc_request **request)
1928{
1929	struct obd_device       *obd = exp->exp_obd;
1930	struct lmv_obd	  *lmv = &obd->u.lmv;
1931	struct lmv_tgt_desc     *tgt;
1932	int		      rc;
1933
1934	rc = lmv_check_connect(obd);
1935	if (rc)
1936		return rc;
1937
1938	LASSERT(op_data->op_namelen != 0);
1939
1940	CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
1941	       PFID(&op_data->op_fid2), op_data->op_namelen,
1942	       op_data->op_name, PFID(&op_data->op_fid1));
1943
1944	op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
1945	op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
1946	op_data->op_cap = cfs_curproc_cap_pack();
1947	tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
1948	if (IS_ERR(tgt))
1949		return PTR_ERR(tgt);
1950
1951	/*
1952	 * Cancel UPDATE lock on child (fid1).
1953	 */
1954	op_data->op_flags |= MF_MDC_CANCEL_FID2;
1955	rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
1956			      MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
1957	if (rc != 0)
1958		return rc;
1959
1960	rc = md_link(tgt->ltd_exp, op_data, request);
1961
1962	return rc;
1963}
1964
1965static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
1966		      const char *old, int oldlen, const char *new, int newlen,
1967		      struct ptlrpc_request **request)
1968{
1969	struct obd_device       *obd = exp->exp_obd;
1970	struct lmv_obd	  *lmv = &obd->u.lmv;
1971	struct lmv_tgt_desc     *src_tgt;
1972	struct lmv_tgt_desc     *tgt_tgt;
1973	int			rc;
1974
1975	LASSERT(oldlen != 0);
1976
1977	CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n",
1978	       oldlen, old, PFID(&op_data->op_fid1),
1979	       newlen, new, PFID(&op_data->op_fid2));
1980
1981	rc = lmv_check_connect(obd);
1982	if (rc)
1983		return rc;
1984
1985	op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
1986	op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
1987	op_data->op_cap = cfs_curproc_cap_pack();
1988	src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1989	if (IS_ERR(src_tgt))
1990		return PTR_ERR(src_tgt);
1991
1992	tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
1993	if (IS_ERR(tgt_tgt))
1994		return PTR_ERR(tgt_tgt);
1995	/*
1996	 * LOOKUP lock on src child (fid3) should also be cancelled for
1997	 * src_tgt in mdc_rename.
1998	 */
1999	op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
2000
2001	/*
2002	 * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its
2003	 * own target.
2004	 */
2005	rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
2006			      LCK_EX, MDS_INODELOCK_UPDATE,
2007			      MF_MDC_CANCEL_FID2);
2008
2009	/*
2010	 * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt.
2011	 */
2012	if (rc == 0) {
2013		rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
2014				      LCK_EX, MDS_INODELOCK_LOOKUP,
2015				      MF_MDC_CANCEL_FID4);
2016	}
2017
2018	/*
2019	 * Cancel all the locks on tgt child (fid4).
2020	 */
2021	if (rc == 0)
2022		rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
2023				      LCK_EX, MDS_INODELOCK_FULL,
2024				      MF_MDC_CANCEL_FID4);
2025
2026	if (rc == 0)
2027		rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen,
2028			       new, newlen, request);
2029	return rc;
2030}
2031
2032static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
2033		       void *ea, int ealen, void *ea2, int ea2len,
2034		       struct ptlrpc_request **request,
2035		       struct md_open_data **mod)
2036{
2037	struct obd_device       *obd = exp->exp_obd;
2038	struct lmv_obd	  *lmv = &obd->u.lmv;
2039	struct lmv_tgt_desc     *tgt;
2040	int		      rc = 0;
2041
2042	rc = lmv_check_connect(obd);
2043	if (rc)
2044		return rc;
2045
2046	CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n",
2047	       PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
2048
2049	op_data->op_flags |= MF_MDC_CANCEL_FID1;
2050	tgt = lmv_find_target(lmv, &op_data->op_fid1);
2051	if (IS_ERR(tgt))
2052		return PTR_ERR(tgt);
2053
2054	rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2,
2055			ea2len, request, mod);
2056
2057	return rc;
2058}
2059
2060static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
2061		    struct obd_capa *oc, struct ptlrpc_request **request)
2062{
2063	struct obd_device	 *obd = exp->exp_obd;
2064	struct lmv_obd	    *lmv = &obd->u.lmv;
2065	struct lmv_tgt_desc       *tgt;
2066	int			rc;
2067
2068	rc = lmv_check_connect(obd);
2069	if (rc)
2070		return rc;
2071
2072	tgt = lmv_find_target(lmv, fid);
2073	if (IS_ERR(tgt))
2074		return PTR_ERR(tgt);
2075
2076	rc = md_sync(tgt->ltd_exp, fid, oc, request);
2077	return rc;
2078}
2079
2080/*
2081 * Adjust a set of pages, each page containing an array of lu_dirpages,
2082 * so that each page can be used as a single logical lu_dirpage.
2083 *
2084 * A lu_dirpage is laid out as follows, where s = ldp_hash_start,
2085 * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a
2086 * struct lu_dirent.  It has size up to LU_PAGE_SIZE. The ldp_hash_end
2087 * value is used as a cookie to request the next lu_dirpage in a
2088 * directory listing that spans multiple pages (two in this example):
2089 *   ________
2090 *  |	|
2091 * .|--------v-------   -----.
2092 * |s|e|f|p|ent|ent| ... |ent|
2093 * '--|--------------   -----'   Each CFS_PAGE contains a single
2094 *    '------.		   lu_dirpage.
2095 * .---------v-------   -----.
2096 * |s|e|f|p|ent| 0 | ... | 0 |
2097 * '-----------------   -----'
2098 *
2099 * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is
2100 * larger than LU_PAGE_SIZE, a single host page may contain multiple
2101 * lu_dirpages. After reading the lu_dirpages from the MDS, the
2102 * ldp_hash_end of the first lu_dirpage refers to the one immediately
2103 * after it in the same CFS_PAGE (arrows simplified for brevity, but
2104 * in general e0==s1, e1==s2, etc.):
2105 *
2106 * .--------------------   -----.
2107 * |s0|e0|f0|p|ent|ent| ... |ent|
2108 * |---v----------------   -----|
2109 * |s1|e1|f1|p|ent|ent| ... |ent|
2110 * |---v----------------   -----|  Here, each CFS_PAGE contains
2111 *	     ...		 multiple lu_dirpages.
2112 * |---v----------------   -----|
2113 * |s'|e'|f'|p|ent|ent| ... |ent|
2114 * '---|----------------   -----'
2115 *     v
2116 * .----------------------------.
2117 * |	next CFS_PAGE       |
2118 *
2119 * This structure is transformed into a single logical lu_dirpage as follows:
2120 *
2121 * - Replace e0 with e' so the request for the next lu_dirpage gets the page
2122 *   labeled 'next CFS_PAGE'.
2123 *
2124 * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether
2125 *   a hash collision with the next page exists.
2126 *
2127 * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span
2128 *   to the first entry of the next lu_dirpage.
2129 */
#if PAGE_CACHE_SIZE > LU_PAGE_SIZE
/*
 * Merge the lu_dirpages packed into each of the first @ncfspgs entries of
 * @pages so that every host page reads as one logical lu_dirpage.
 *
 * @nlupgs is the total number of lu_dirpages spread over those host
 * pages; it is counted down as they are visited and asserted to be 0 at
 * the end.  For each host page the first lu_dirpage receives the
 * ldp_hash_end of the last lu_dirpage visited in that page and a copy of
 * its LDF_COLLIDE flag, and the last dirent of each lu_dirpage has its
 * lde_reclen set to reach the first entry of the following lu_dirpage.
 *
 * When PAGE_CACHE_SIZE is not larger than LU_PAGE_SIZE this is a no-op
 * macro.
 */
static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs)
{
	int i;

	for (i = 0; i < ncfspgs; i++) {
		struct lu_dirpage	*dp = kmap(pages[i]);
		struct lu_dirpage	*first = dp;
		struct lu_dirent	*end_dirent = NULL;
		struct lu_dirent	*ent;
		__u64			hash_end = dp->ldp_hash_end;
		__u32			flags = dp->ldp_flags;

		while (--nlupgs > 0) {
			/* Walk to the last dirent of this lu_dirpage; the
			 * loop body is intentionally empty.  end_dirent
			 * ends up NULL when the page has no entries. */
			ent = lu_dirent_start(dp);
			for (end_dirent = ent; ent != NULL;
			     end_dirent = ent, ent = lu_dirent_next(ent));

			/* Advance dp to next lu_dirpage. */
			dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE);

			/* Check if we've reached the end of the CFS_PAGE. */
			if (!((unsigned long)dp & ~CFS_PAGE_MASK))
				break;

			/* Save the hash and flags of this lu_dirpage. */
			hash_end = dp->ldp_hash_end;
			flags = dp->ldp_flags;

			/* Check if lu_dirpage contains no entries. */
			if (!end_dirent)
				break;

			/* Enlarge the end entry lde_reclen from 0 to
			 * first entry of next lu_dirpage. */
			LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0);
			end_dirent->lde_reclen =
				cpu_to_le16((char *)(dp->ldp_entries) -
					    (char *)end_dirent);
		}

		/* Publish the saved hash end and collide flag on the first
		 * lu_dirpage of this host page. */
		first->ldp_hash_end = hash_end;
		first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
		first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE);

		kunmap(pages[i]);
	}
	/* Every lu_dirpage must have been accounted for. */
	LASSERTF(nlupgs == 0, "left = %d", nlupgs);
}
#else
#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0)
#endif	/* PAGE_CACHE_SIZE > LU_PAGE_SIZE */
2182
2183static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
2184			struct page **pages, struct ptlrpc_request **request)
2185{
2186	struct obd_device	*obd = exp->exp_obd;
2187	struct lmv_obd		*lmv = &obd->u.lmv;
2188	__u64			offset = op_data->op_offset;
2189	int			rc;
2190	int			ncfspgs; /* pages read in PAGE_CACHE_SIZE */
2191	int			nlupgs; /* pages read in LU_PAGE_SIZE */
2192	struct lmv_tgt_desc	*tgt;
2193
2194	rc = lmv_check_connect(obd);
2195	if (rc)
2196		return rc;
2197
2198	CDEBUG(D_INODE, "READPAGE at %#llx from "DFID"\n",
2199	       offset, PFID(&op_data->op_fid1));
2200
2201	tgt = lmv_find_target(lmv, &op_data->op_fid1);
2202	if (IS_ERR(tgt))
2203		return PTR_ERR(tgt);
2204
2205	rc = md_readpage(tgt->ltd_exp, op_data, pages, request);
2206	if (rc != 0)
2207		return rc;
2208
2209	ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1)
2210		 >> PAGE_CACHE_SHIFT;
2211	nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
2212	LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
2213	LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
2214
2215	CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs,
2216	       op_data->op_npages);
2217
2218	lmv_adjust_dirpages(pages, ncfspgs, nlupgs);
2219
2220	return rc;
2221}
2222
2223static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data,
2224		      struct ptlrpc_request **request)
2225{
2226	struct obd_device       *obd = exp->exp_obd;
2227	struct lmv_obd	  *lmv = &obd->u.lmv;
2228	struct lmv_tgt_desc     *tgt = NULL;
2229	struct mdt_body		*body;
2230	int		     rc;
2231
2232	rc = lmv_check_connect(obd);
2233	if (rc)
2234		return rc;
2235retry:
2236	/* Send unlink requests to the MDT where the child is located */
2237	if (likely(!fid_is_zero(&op_data->op_fid2)))
2238		tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
2239	else
2240		tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
2241	if (IS_ERR(tgt))
2242		return PTR_ERR(tgt);
2243
2244	op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
2245	op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
2246	op_data->op_cap = cfs_curproc_cap_pack();
2247
2248	/*
2249	 * If child's fid is given, cancel unused locks for it if it is from
2250	 * another export than parent.
2251	 *
2252	 * LOOKUP lock for child (fid3) should also be cancelled on parent
2253	 * tgt_tgt in mdc_unlink().
2254	 */
2255	op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
2256
2257	/*
2258	 * Cancel FULL locks on child (fid3).
2259	 */
2260	rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
2261			      MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
2262
2263	if (rc != 0)
2264		return rc;
2265
2266	CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n",
2267	       PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
2268
2269	rc = md_unlink(tgt->ltd_exp, op_data, request);
2270	if (rc != 0 && rc != -EREMOTE)
2271		return rc;
2272
2273	body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
2274	if (body == NULL)
2275		return -EPROTO;
2276
2277	/* Not cross-ref case, just get out of here. */
2278	if (likely(!(body->valid & OBD_MD_MDS)))
2279		return 0;
2280
2281	CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n",
2282	       exp->exp_obd->obd_name, PFID(&body->fid1));
2283
2284	/* This is a remote object, try remote MDT, Note: it may
2285	 * try more than 1 time here, Considering following case
2286	 * /mnt/lustre is root on MDT0, remote1 is on MDT1
2287	 * 1. Initially A does not know where remote1 is, it send
2288	 *    unlink RPC to MDT0, MDT0 return -EREMOTE, it will
2289	 *    resend unlink RPC to MDT1 (retry 1st time).
2290	 *
2291	 * 2. During the unlink RPC in flight,
2292	 *    client B mv /mnt/lustre/remote1 /mnt/lustre/remote2
2293	 *    and create new remote1, but on MDT0
2294	 *
2295	 * 3. MDT1 get unlink RPC(from A), then do remote lock on
2296	 *    /mnt/lustre, then lookup get fid of remote1, and find
2297	 *    it is remote dir again, and replay -EREMOTE again.
2298	 *
2299	 * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times).
2300	 *
2301	 * In theory, it might try unlimited time here, but it should
2302	 * be very rare case.  */
2303	op_data->op_fid2 = body->fid1;
2304	ptlrpc_req_finished(*request);
2305	*request = NULL;
2306
2307	goto retry;
2308}
2309
2310static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
2311{
2312	struct lmv_obd *lmv = &obd->u.lmv;
2313	int rc = 0;
2314
2315	switch (stage) {
2316	case OBD_CLEANUP_EARLY:
2317		/* XXX: here should be calling obd_precleanup() down to
2318		 * stack. */
2319		break;
2320	case OBD_CLEANUP_EXPORTS:
2321		fld_client_proc_fini(&lmv->lmv_fld);
2322		lprocfs_obd_cleanup(obd);
2323		break;
2324	default:
2325		break;
2326	}
2327	return rc;
2328}
2329
2330static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
2331			__u32 keylen, void *key, __u32 *vallen, void *val,
2332			struct lov_stripe_md *lsm)
2333{
2334	struct obd_device       *obd;
2335	struct lmv_obd	  *lmv;
2336	int		      rc = 0;
2337
2338	obd = class_exp2obd(exp);
2339	if (obd == NULL) {
2340		CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n",
2341		       exp->exp_handle.h_cookie);
2342		return -EINVAL;
2343	}
2344
2345	lmv = &obd->u.lmv;
2346	if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) {
2347		struct lmv_tgt_desc *tgt;
2348		int i;
2349
2350		rc = lmv_check_connect(obd);
2351		if (rc)
2352			return rc;
2353
2354		LASSERT(*vallen == sizeof(__u32));
2355		for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2356			tgt = lmv->tgts[i];
2357			/*
2358			 * All tgts should be connected when this gets called.
2359			 */
2360			if (tgt == NULL || tgt->ltd_exp == NULL)
2361				continue;
2362
2363			if (!obd_get_info(env, tgt->ltd_exp, keylen, key,
2364					  vallen, val, NULL))
2365				return 0;
2366		}
2367		return -EINVAL;
2368	} else if (KEY_IS(KEY_MAX_EASIZE) ||
2369		   KEY_IS(KEY_DEFAULT_EASIZE) ||
2370		   KEY_IS(KEY_MAX_COOKIESIZE) ||
2371		   KEY_IS(KEY_DEFAULT_COOKIESIZE) ||
2372		   KEY_IS(KEY_CONN_DATA)) {
2373		rc = lmv_check_connect(obd);
2374		if (rc)
2375			return rc;
2376
2377		/*
2378		 * Forwarding this request to first MDS, it should know LOV
2379		 * desc.
2380		 */
2381		rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key,
2382				  vallen, val, NULL);
2383		if (!rc && KEY_IS(KEY_CONN_DATA))
2384			exp->exp_connect_data = *(struct obd_connect_data *)val;
2385		return rc;
2386	} else if (KEY_IS(KEY_TGT_COUNT)) {
2387		*((int *)val) = lmv->desc.ld_tgt_count;
2388		return 0;
2389	}
2390
2391	CDEBUG(D_IOCTL, "Invalid key\n");
2392	return -EINVAL;
2393}
2394
2395int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp,
2396		       u32 keylen, void *key, u32 vallen,
2397		       void *val, struct ptlrpc_request_set *set)
2398{
2399	struct lmv_tgt_desc    *tgt;
2400	struct obd_device      *obd;
2401	struct lmv_obd	 *lmv;
2402	int rc = 0;
2403
2404	obd = class_exp2obd(exp);
2405	if (obd == NULL) {
2406		CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n",
2407		       exp->exp_handle.h_cookie);
2408		return -EINVAL;
2409	}
2410	lmv = &obd->u.lmv;
2411
2412	if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) {
2413		int i, err = 0;
2414
2415		for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2416			tgt = lmv->tgts[i];
2417
2418			if (tgt == NULL || tgt->ltd_exp == NULL)
2419				continue;
2420
2421			err = obd_set_info_async(env, tgt->ltd_exp,
2422						 keylen, key, vallen, val, set);
2423			if (err && rc == 0)
2424				rc = err;
2425		}
2426
2427		return rc;
2428	}
2429
2430	return -EINVAL;
2431}
2432
2433int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
2434	       struct lov_stripe_md *lsm)
2435{
2436	struct obd_device	 *obd = class_exp2obd(exp);
2437	struct lmv_obd	    *lmv = &obd->u.lmv;
2438	struct lmv_stripe_md      *meap;
2439	struct lmv_stripe_md      *lsmp;
2440	int			mea_size;
2441	int			i;
2442
2443	mea_size = lmv_get_easize(lmv);
2444	if (!lmmp)
2445		return mea_size;
2446
2447	if (*lmmp && !lsm) {
2448		OBD_FREE_LARGE(*lmmp, mea_size);
2449		*lmmp = NULL;
2450		return 0;
2451	}
2452
2453	if (*lmmp == NULL) {
2454		OBD_ALLOC_LARGE(*lmmp, mea_size);
2455		if (*lmmp == NULL)
2456			return -ENOMEM;
2457	}
2458
2459	if (!lsm)
2460		return mea_size;
2461
2462	lsmp = (struct lmv_stripe_md *)lsm;
2463	meap = (struct lmv_stripe_md *)*lmmp;
2464
2465	if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR &&
2466	    lsmp->mea_magic != MEA_MAGIC_ALL_CHARS)
2467		return -EINVAL;
2468
2469	meap->mea_magic = cpu_to_le32(lsmp->mea_magic);
2470	meap->mea_count = cpu_to_le32(lsmp->mea_count);
2471	meap->mea_master = cpu_to_le32(lsmp->mea_master);
2472
2473	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2474		meap->mea_ids[i] = lsmp->mea_ids[i];
2475		fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]);
2476	}
2477
2478	return mea_size;
2479}
2480
2481int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
2482		 struct lov_mds_md *lmm, int lmm_size)
2483{
2484	struct obd_device	  *obd = class_exp2obd(exp);
2485	struct lmv_stripe_md      **tmea = (struct lmv_stripe_md **)lsmp;
2486	struct lmv_stripe_md       *mea = (struct lmv_stripe_md *)lmm;
2487	struct lmv_obd	     *lmv = &obd->u.lmv;
2488	int			 mea_size;
2489	int			 i;
2490	__u32		       magic;
2491
2492	mea_size = lmv_get_easize(lmv);
2493	if (lsmp == NULL)
2494		return mea_size;
2495
2496	if (*lsmp != NULL && lmm == NULL) {
2497		OBD_FREE_LARGE(*tmea, mea_size);
2498		*lsmp = NULL;
2499		return 0;
2500	}
2501
2502	LASSERT(mea_size == lmm_size);
2503
2504	OBD_ALLOC_LARGE(*tmea, mea_size);
2505	if (*tmea == NULL)
2506		return -ENOMEM;
2507
2508	if (!lmm)
2509		return mea_size;
2510
2511	if (mea->mea_magic == MEA_MAGIC_LAST_CHAR ||
2512	    mea->mea_magic == MEA_MAGIC_ALL_CHARS ||
2513	    mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) {
2514		magic = le32_to_cpu(mea->mea_magic);
2515	} else {
2516		/*
2517		 * Old mea is not handled here.
2518		 */
2519		CERROR("Old not supportable EA is found\n");
2520		LBUG();
2521	}
2522
2523	(*tmea)->mea_magic = magic;
2524	(*tmea)->mea_count = le32_to_cpu(mea->mea_count);
2525	(*tmea)->mea_master = le32_to_cpu(mea->mea_master);
2526
2527	for (i = 0; i < (*tmea)->mea_count; i++) {
2528		(*tmea)->mea_ids[i] = mea->mea_ids[i];
2529		fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
2530	}
2531	return mea_size;
2532}
2533
2534static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
2535			     ldlm_policy_data_t *policy, ldlm_mode_t mode,
2536			     ldlm_cancel_flags_t flags, void *opaque)
2537{
2538	struct obd_device       *obd = exp->exp_obd;
2539	struct lmv_obd	  *lmv = &obd->u.lmv;
2540	int		      rc = 0;
2541	int		      err;
2542	int		      i;
2543
2544	LASSERT(fid != NULL);
2545
2546	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2547		if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL ||
2548		    lmv->tgts[i]->ltd_active == 0)
2549			continue;
2550
2551		err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid,
2552				       policy, mode, flags, opaque);
2553		if (!rc)
2554			rc = err;
2555	}
2556	return rc;
2557}
2558
2559int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
2560		      __u64 *bits)
2561{
2562	struct lmv_obd	  *lmv = &exp->exp_obd->u.lmv;
2563	int		      rc;
2564
2565	rc =  md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits);
2566	return rc;
2567}
2568
2569ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
2570			   const struct lu_fid *fid, ldlm_type_t type,
2571			   ldlm_policy_data_t *policy, ldlm_mode_t mode,
2572			   struct lustre_handle *lockh)
2573{
2574	struct obd_device       *obd = exp->exp_obd;
2575	struct lmv_obd	  *lmv = &obd->u.lmv;
2576	ldlm_mode_t	      rc;
2577	int		      i;
2578
2579	CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
2580
2581	/*
2582	 * With CMD every object can have two locks in different namespaces:
2583	 * lookup lock in space of mds storing direntry and update/open lock in
2584	 * space of mds storing inode. Thus we check all targets, not only that
2585	 * one fid was created in.
2586	 */
2587	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2588		if (lmv->tgts[i] == NULL ||
2589		    lmv->tgts[i]->ltd_exp == NULL ||
2590		    lmv->tgts[i]->ltd_active == 0)
2591			continue;
2592
2593		rc = md_lock_match(lmv->tgts[i]->ltd_exp, flags, fid,
2594				   type, policy, mode, lockh);
2595		if (rc)
2596			return rc;
2597	}
2598
2599	return 0;
2600}
2601
2602int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
2603		      struct obd_export *dt_exp, struct obd_export *md_exp,
2604		      struct lustre_md *md)
2605{
2606	struct lmv_obd	  *lmv = &exp->exp_obd->u.lmv;
2607
2608	return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md);
2609}
2610
2611int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
2612{
2613	struct obd_device       *obd = exp->exp_obd;
2614	struct lmv_obd	  *lmv = &obd->u.lmv;
2615
2616	if (md->mea)
2617		obd_free_memmd(exp, (void *)&md->mea);
2618	return md_free_lustre_md(lmv->tgts[0]->ltd_exp, md);
2619}
2620
2621int lmv_set_open_replay_data(struct obd_export *exp,
2622			     struct obd_client_handle *och,
2623			     struct lookup_intent *it)
2624{
2625	struct obd_device       *obd = exp->exp_obd;
2626	struct lmv_obd	  *lmv = &obd->u.lmv;
2627	struct lmv_tgt_desc     *tgt;
2628
2629	tgt = lmv_find_target(lmv, &och->och_fid);
2630	if (IS_ERR(tgt))
2631		return PTR_ERR(tgt);
2632
2633	return md_set_open_replay_data(tgt->ltd_exp, och, it);
2634}
2635
2636int lmv_clear_open_replay_data(struct obd_export *exp,
2637			       struct obd_client_handle *och)
2638{
2639	struct obd_device       *obd = exp->exp_obd;
2640	struct lmv_obd	  *lmv = &obd->u.lmv;
2641	struct lmv_tgt_desc     *tgt;
2642
2643	tgt = lmv_find_target(lmv, &och->och_fid);
2644	if (IS_ERR(tgt))
2645		return PTR_ERR(tgt);
2646
2647	return md_clear_open_replay_data(tgt->ltd_exp, och);
2648}
2649
2650static int lmv_get_remote_perm(struct obd_export *exp,
2651			       const struct lu_fid *fid,
2652			       struct obd_capa *oc, __u32 suppgid,
2653			       struct ptlrpc_request **request)
2654{
2655	struct obd_device       *obd = exp->exp_obd;
2656	struct lmv_obd	  *lmv = &obd->u.lmv;
2657	struct lmv_tgt_desc     *tgt;
2658	int		      rc;
2659
2660	rc = lmv_check_connect(obd);
2661	if (rc)
2662		return rc;
2663
2664	tgt = lmv_find_target(lmv, fid);
2665	if (IS_ERR(tgt))
2666		return PTR_ERR(tgt);
2667
2668	rc = md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request);
2669	return rc;
2670}
2671
2672static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc,
2673			  renew_capa_cb_t cb)
2674{
2675	struct obd_device       *obd = exp->exp_obd;
2676	struct lmv_obd	  *lmv = &obd->u.lmv;
2677	struct lmv_tgt_desc     *tgt;
2678	int		      rc;
2679
2680	rc = lmv_check_connect(obd);
2681	if (rc)
2682		return rc;
2683
2684	tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid);
2685	if (IS_ERR(tgt))
2686		return PTR_ERR(tgt);
2687
2688	rc = md_renew_capa(tgt->ltd_exp, oc, cb);
2689	return rc;
2690}
2691
2692int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
2693		    const struct req_msg_field *field, struct obd_capa **oc)
2694{
2695	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
2696
2697	return md_unpack_capa(lmv->tgts[0]->ltd_exp, req, field, oc);
2698}
2699
2700int lmv_intent_getattr_async(struct obd_export *exp,
2701			     struct md_enqueue_info *minfo,
2702			     struct ldlm_enqueue_info *einfo)
2703{
2704	struct md_op_data       *op_data = &minfo->mi_data;
2705	struct obd_device       *obd = exp->exp_obd;
2706	struct lmv_obd	  *lmv = &obd->u.lmv;
2707	struct lmv_tgt_desc     *tgt = NULL;
2708	int		      rc;
2709
2710	rc = lmv_check_connect(obd);
2711	if (rc)
2712		return rc;
2713
2714	tgt = lmv_find_target(lmv, &op_data->op_fid1);
2715	if (IS_ERR(tgt))
2716		return PTR_ERR(tgt);
2717
2718	rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo);
2719	return rc;
2720}
2721
2722int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
2723			struct lu_fid *fid, __u64 *bits)
2724{
2725	struct obd_device       *obd = exp->exp_obd;
2726	struct lmv_obd	  *lmv = &obd->u.lmv;
2727	struct lmv_tgt_desc     *tgt;
2728	int		      rc;
2729
2730	rc = lmv_check_connect(obd);
2731	if (rc)
2732		return rc;
2733
2734	tgt = lmv_find_target(lmv, fid);
2735	if (IS_ERR(tgt))
2736		return PTR_ERR(tgt);
2737
2738	rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits);
2739	return rc;
2740}
2741
2742/**
2743 * For lmv, only need to send request to master MDT, and the master MDT will
2744 * process with other slave MDTs. The only exception is Q_GETOQUOTA for which
2745 * we directly fetch data from the slave MDTs.
2746 */
2747int lmv_quotactl(struct obd_device *unused, struct obd_export *exp,
2748		 struct obd_quotactl *oqctl)
2749{
2750	struct obd_device   *obd = class_exp2obd(exp);
2751	struct lmv_obd      *lmv = &obd->u.lmv;
2752	struct lmv_tgt_desc *tgt = lmv->tgts[0];
2753	int		  rc = 0, i;
2754	__u64		curspace, curinodes;
2755
2756	if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) {
2757		CERROR("master lmv inactive\n");
2758		return -EIO;
2759	}
2760
2761	if (oqctl->qc_cmd != Q_GETOQUOTA) {
2762		rc = obd_quotactl(tgt->ltd_exp, oqctl);
2763		return rc;
2764	}
2765
2766	curspace = curinodes = 0;
2767	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2768		int err;
2769		tgt = lmv->tgts[i];
2770
2771		if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0)
2772			continue;
2773		if (!tgt->ltd_active) {
2774			CDEBUG(D_HA, "mdt %d is inactive.\n", i);
2775			continue;
2776		}
2777
2778		err = obd_quotactl(tgt->ltd_exp, oqctl);
2779		if (err) {
2780			CERROR("getquota on mdt %d failed. %d\n", i, err);
2781			if (!rc)
2782				rc = err;
2783		} else {
2784			curspace += oqctl->qc_dqblk.dqb_curspace;
2785			curinodes += oqctl->qc_dqblk.dqb_curinodes;
2786		}
2787	}
2788	oqctl->qc_dqblk.dqb_curspace = curspace;
2789	oqctl->qc_dqblk.dqb_curinodes = curinodes;
2790
2791	return rc;
2792}
2793
2794int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp,
2795		   struct obd_quotactl *oqctl)
2796{
2797	struct obd_device   *obd = class_exp2obd(exp);
2798	struct lmv_obd      *lmv = &obd->u.lmv;
2799	struct lmv_tgt_desc *tgt;
2800	int		  i, rc = 0;
2801
2802	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2803		int err;
2804		tgt = lmv->tgts[i];
2805		if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) {
2806			CERROR("lmv idx %d inactive\n", i);
2807			return -EIO;
2808		}
2809
2810		err = obd_quotacheck(tgt->ltd_exp, oqctl);
2811		if (err && !rc)
2812			rc = err;
2813	}
2814
2815	return rc;
2816}
2817
2818struct obd_ops lmv_obd_ops = {
2819	.o_owner		= THIS_MODULE,
2820	.o_setup		= lmv_setup,
2821	.o_cleanup	      = lmv_cleanup,
2822	.o_precleanup	   = lmv_precleanup,
2823	.o_process_config       = lmv_process_config,
2824	.o_connect	      = lmv_connect,
2825	.o_disconnect	   = lmv_disconnect,
2826	.o_statfs	       = lmv_statfs,
2827	.o_get_info	     = lmv_get_info,
2828	.o_set_info_async       = lmv_set_info_async,
2829	.o_packmd	       = lmv_packmd,
2830	.o_unpackmd	     = lmv_unpackmd,
2831	.o_notify	       = lmv_notify,
2832	.o_get_uuid	     = lmv_get_uuid,
2833	.o_iocontrol	    = lmv_iocontrol,
2834	.o_quotacheck	   = lmv_quotacheck,
2835	.o_quotactl	     = lmv_quotactl
2836};
2837
2838struct md_ops lmv_md_ops = {
2839	.m_getstatus	    = lmv_getstatus,
2840	.m_null_inode		= lmv_null_inode,
2841	.m_find_cbdata	  = lmv_find_cbdata,
2842	.m_close		= lmv_close,
2843	.m_create	       = lmv_create,
2844	.m_done_writing	 = lmv_done_writing,
2845	.m_enqueue	      = lmv_enqueue,
2846	.m_getattr	      = lmv_getattr,
2847	.m_getxattr	     = lmv_getxattr,
2848	.m_getattr_name	 = lmv_getattr_name,
2849	.m_intent_lock	  = lmv_intent_lock,
2850	.m_link		 = lmv_link,
2851	.m_rename	       = lmv_rename,
2852	.m_setattr	      = lmv_setattr,
2853	.m_setxattr	     = lmv_setxattr,
2854	.m_sync		 = lmv_sync,
2855	.m_readpage	     = lmv_readpage,
2856	.m_unlink	       = lmv_unlink,
2857	.m_init_ea_size	 = lmv_init_ea_size,
2858	.m_cancel_unused	= lmv_cancel_unused,
2859	.m_set_lock_data	= lmv_set_lock_data,
2860	.m_lock_match	   = lmv_lock_match,
2861	.m_get_lustre_md	= lmv_get_lustre_md,
2862	.m_free_lustre_md       = lmv_free_lustre_md,
2863	.m_set_open_replay_data = lmv_set_open_replay_data,
2864	.m_clear_open_replay_data = lmv_clear_open_replay_data,
2865	.m_renew_capa	   = lmv_renew_capa,
2866	.m_unpack_capa	  = lmv_unpack_capa,
2867	.m_get_remote_perm      = lmv_get_remote_perm,
2868	.m_intent_getattr_async = lmv_intent_getattr_async,
2869	.m_revalidate_lock      = lmv_revalidate_lock
2870};
2871
2872int __init lmv_init(void)
2873{
2874	struct lprocfs_static_vars lvars;
2875	int			rc;
2876
2877	lprocfs_lmv_init_vars(&lvars);
2878
2879	rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
2880				 lvars.module_vars, LUSTRE_LMV_NAME, NULL);
2881	return rc;
2882}
2883
2884static void lmv_exit(void)
2885{
2886	class_unregister_type(LUSTRE_LMV_NAME);
2887}
2888
2889MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
2890MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
2891MODULE_LICENSE("GPL");
2892
2893module_init(lmv_init);
2894module_exit(lmv_exit);
2895