/* lustre/llite/file.c — revision d0a0acc3ccf5a45c976fe94b15a1f9e9c4c78414 */
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include <lustre_dlm.h>
45#include <lustre_lite.h>
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include <lustre/ll_fiemap.h>
50
51#include "cl_object.h"
52
53struct ll_file_data *ll_file_data_get(void)
54{
55	struct ll_file_data *fd;
56
57	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
58	if (fd == NULL)
59		return NULL;
60	fd->fd_write_failed = false;
61	return fd;
62}
63
64static void ll_file_data_put(struct ll_file_data *fd)
65{
66	if (fd != NULL)
67		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
68}
69
/**
 * Pack the current inode attributes into @op_data for an MDS request.
 *
 * Copies fid, mode, a/m/ctime, size, block count, inode flags and the
 * current IO epoch from @inode into @op_data.  If @fh is non-NULL it is
 * stored as the request's open handle.  Also takes a reference on the
 * MDS capability (op_data->op_capa1); the caller/request path is
 * responsible for releasing it.
 */
void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
			  struct lustre_handle *fh)
{
	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr_blocks = inode->i_blocks;
	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
					ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
	if (fh)
		op_data->op_handle = *fh;
	op_data->op_capa1 = ll_mdscapa_get(inode);

	/* Tell the MDS that cached data was modified so it can clear the
	 * corresponding flag once the close/setattr succeeds. */
	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
		op_data->op_bias |= MDS_DATA_MODIFIED;
}
90
91/**
92 * Closes the IO epoch and packs all the attributes into @op_data for
93 * the CLOSE rpc.
94 */
/**
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.
 *
 * Always marks mode and timestamps valid.  For write opens on regular
 * files with Size-on-MDS support, the epoch is closed via
 * ll_ioepoch_close(); otherwise size/blocks are sent directly.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
					ATTR_MTIME | ATTR_MTIME_SET |
					ATTR_CTIME | ATTR_CTIME_SET;

	/* Read/exec opens never carry size updates. */
	if (!(och->och_flags & FMODE_WRITE))
		goto out;

	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	else
		/* NOTE(review): &och is a pointer-to-pointer here — presumably
		 * ll_ioepoch_close() may clear the caller's handle; confirm
		 * against its definition. */
		ll_ioepoch_close(inode, op_data, &och, 0);

out:
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
}
115
/**
 * Send an MDS close RPC for the open handle @och and release it.
 *
 * \param md_exp        export the close RPC is sent on
 * \param inode         inode being closed
 * \param och           open handle; freed here unless the inode must
 *                      first complete a DONE_WRITING exchange
 * \param data_version  non-NULL means this close is an HSM release;
 *                      the version is packed so the MDS can verify the
 *                      archived copy before releasing objects
 *
 * \retval 0 on success or tolerated failure, negative errno otherwise
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och,
				     const __u64 *data_version)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle "LPX64"\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	if (op_data == NULL)
		/* XXX We leak openhandle and request here. */
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	if (data_version != NULL) {
		/* Pass in data_version implies release. */
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *data_version;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	}
	/* Remember whether this close finishes the IO epoch; needed in the
	 * out: path to decide whether @och can be freed yet. */
	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr to back to MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: "
			       "rc = %d\n", inode->i_ino, rc);
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	if (rc == 0) {
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}
	/* For HSM release, verify the server actually released the file;
	 * otherwise report -EBUSY to the caller. */
	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
		struct mdt_body *body;
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->valid & OBD_MD_FLRELEASED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		/* Epoch not yet closed: keep @och alive until the pending
		 * DONE_WRITING completes. */
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		OBD_FREE_PTR(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
207
208int ll_md_real_close(struct inode *inode, fmode_t fmode)
209{
210	struct ll_inode_info *lli = ll_i2info(inode);
211	struct obd_client_handle **och_p;
212	struct obd_client_handle *och;
213	__u64 *och_usecount;
214	int rc = 0;
215
216	if (fmode & FMODE_WRITE) {
217		och_p = &lli->lli_mds_write_och;
218		och_usecount = &lli->lli_open_fd_write_count;
219	} else if (fmode & FMODE_EXEC) {
220		och_p = &lli->lli_mds_exec_och;
221		och_usecount = &lli->lli_open_fd_exec_count;
222	} else {
223		LASSERT(fmode & FMODE_READ);
224		och_p = &lli->lli_mds_read_och;
225		och_usecount = &lli->lli_open_fd_read_count;
226	}
227
228	mutex_lock(&lli->lli_och_mutex);
229	if (*och_usecount > 0) {
230		/* There are still users of this handle, so skip
231		 * freeing it. */
232		mutex_unlock(&lli->lli_och_mutex);
233		return 0;
234	}
235
236	och=*och_p;
237	*och_p = NULL;
238	mutex_unlock(&lli->lli_och_mutex);
239
240	if (och != NULL) {
241		/* There might be a race and this handle may already
242		   be closed. */
243		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
244					       inode, och, NULL);
245	}
246
247	return rc;
248}
249
/**
 * Per-file-descriptor close: drop group locks, clean up any stale
 * lease, release the fd's private open handle, and decide whether the
 * MDS must be told about the close.
 *
 * If we still hold a matching OPEN DLM lock, the real MDS close is
 * deferred until that lock is cancelled; otherwise ll_md_real_close()
 * is invoked now.  Always frees the ll_file_data on return.
 */
int ll_md_close(struct obd_export *md_exp, struct inode *inode,
		struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
			PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	/* An open handle was privatized to this fd by ll_lease_open();
	 * close it directly and skip the shared-handle accounting. */
	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
		fd->fd_och = NULL;
		GOTO(out, rc);
	}

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	if (file->f_dentry->d_inode) { /* Can this ever be false? */
		int lockmode;
		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
		struct lustre_handle lockh;
		/* NOTE(review): shadows the @inode parameter — same object,
		 * but the shadowing is easy to misread. */
		struct inode *inode = file->f_dentry->d_inode;
		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};

		/* Drop this fd's contribution to the per-mode open counts
		 * under the och mutex. */
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_omode & FMODE_WRITE) {
			lockmode = LCK_CW;
			LASSERT(lli->lli_open_fd_write_count);
			lli->lli_open_fd_write_count--;
		} else if (fd->fd_omode & FMODE_EXEC) {
			lockmode = LCK_PR;
			LASSERT(lli->lli_open_fd_exec_count);
			lli->lli_open_fd_exec_count--;
		} else {
			lockmode = LCK_CR;
			LASSERT(lli->lli_open_fd_read_count);
			lli->lli_open_fd_read_count--;
		}
		mutex_unlock(&lli->lli_och_mutex);

		/* No matching OPEN lock cached: the MDS close must happen
		 * now instead of at lock cancellation. */
		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
				   LDLM_IBITS, &policy, lockmode,
				   &lockh)) {
			rc = ll_md_real_close(file->f_dentry->d_inode,
					      fd->fd_omode);
		}
	} else {
		CERROR("Releasing a file %p with negative dentry %p. Name %s",
		       file, file->f_dentry, file->f_dentry->d_name.name);
	}

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
	ll_capa_close(inode);

	return rc;
}
322
323/* While this returns an error code, fput() the caller does not, so we need
324 * to make every effort to clean up all of our state here.  Also, applications
325 * rarely check close errors and even if an error is returned they will not
326 * re-try the close call.
327 */
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here.  Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL state is keyed off the root inode; tear down
	 * this process's remote-ACL session if one was established. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(fd != NULL);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());
		}
	}
#endif

	if (inode->i_sb->s_root != file->f_dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the the owner pid of statahead.
	 * Different processes can open the same dir, "ll_opendir_key" means:
	 * it is me that should stop the statahead thread. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
	    lli->lli_opendir_pid != 0)
		ll_stop_statahead(inode, lli->lli_opendir_key);

	/* The root dentry never had an MDS open handle; just drop the
	 * per-open data and return. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		return 0;
	}

	/* Pick up any async write errors recorded on the cl_object so the
	 * close can report them. */
	if (!S_ISDIR(inode->i_mode)) {
		lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	return rc;
}
382
383static int ll_intent_file_open(struct file *file, void *lmm,
384			       int lmmsize, struct lookup_intent *itp)
385{
386	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
387	struct dentry *parent = file->f_dentry->d_parent;
388	const char *name = file->f_dentry->d_name.name;
389	const int len = file->f_dentry->d_name.len;
390	struct md_op_data *op_data;
391	struct ptlrpc_request *req;
392	__u32 opc = LUSTRE_OPC_ANY;
393	int rc;
394
395	if (!parent)
396		return -ENOENT;
397
398	/* Usually we come here only for NFSD, and we want open lock.
399	   But we can also get here with pre 2.6.15 patchless kernels, and in
400	   that case that lock is also ok */
401	/* We can also get here if there was cached open handle in revalidate_it
402	 * but it disappeared while we were getting from there to ll_file_open.
403	 * But this means this file was closed and immediately opened which
404	 * makes a good candidate for using OPEN lock */
405	/* If lmmsize & lmm are not 0, we are just setting stripe info
406	 * parameters. No need for the open lock */
407	if (lmm == NULL && lmmsize == 0) {
408		itp->it_flags |= MDS_OPEN_LOCK;
409		if (itp->it_flags & FMODE_WRITE)
410			opc = LUSTRE_OPC_CREATE;
411	}
412
413	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
414				      file->f_dentry->d_inode, name, len,
415				      O_RDWR, opc, NULL);
416	if (IS_ERR(op_data))
417		return PTR_ERR(op_data);
418
419	itp->it_flags |= MDS_OPEN_BY_FID;
420	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
421			    0 /*unused */, &req, ll_md_blocking_ast, 0);
422	ll_finish_md_op_data(op_data);
423	if (rc == -ESTALE) {
424		/* reason for keep own exit path - don`t flood log
425		* with messages with -ESTALE errors.
426		*/
427		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
428		     it_open_error(DISP_OPEN_OPEN, itp))
429			GOTO(out, rc);
430		ll_release_openhandle(file->f_dentry, itp);
431		GOTO(out, rc);
432	}
433
434	if (it_disposition(itp, DISP_LOOKUP_NEG))
435		GOTO(out, rc = -ENOENT);
436
437	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
438		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
439		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
440		GOTO(out, rc);
441	}
442
443	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
444	if (!rc && itp->d.lustre.it_lock_mode)
445		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
446				 itp, NULL);
447
448out:
449	ptlrpc_req_finished(req);
450	ll_intent_drop_lock(itp);
451
452	return rc;
453}
454
455/**
456 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
457 * not believe attributes if a few ioepoch holders exist. Attributes for
458 * previous ioepoch if new one is opened are also skipped by MDS.
459 */
460void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
461{
462	if (ioepoch && lli->lli_ioepoch != ioepoch) {
463		lli->lli_ioepoch = ioepoch;
464		CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
465		       ioepoch, PFID(&lli->lli_fid));
466	}
467}
468
/**
 * Fill an obd_client_handle from the MDT reply carried by @it and
 * register it for open replay.
 *
 * Copies the server's open handle, fid and the (possible) lease lock
 * handle from the intent's request into @och.
 *
 * \retval result of md_set_open_replay_data()
 */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct ptlrpc_request *req = it->d.lustre.it_data;
	struct mdt_body *body;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->handle;
	och->och_fid = body->fid1;
	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
}
484
/**
 * Finish the client-side part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the file.
 *
 * \param file  file being opened
 * \param it    open intent whose reply supplies handle/epoch data
 * \param fd    per-open data to install as the file's private data
 * \param och   open handle to fill, or NULL when reusing an existing one
 *
 * \retval 0 on success, negative errno from ll_och_fill() on failure
 */
int ll_local_open(struct file *file, struct lookup_intent *it,
		  struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);

	LASSERT(!LUSTRE_FPRIVATE(file));

	LASSERT(fd != NULL);

	if (och) {
		struct ptlrpc_request *req = it->d.lustre.it_data;
		struct mdt_body *body;
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			return rc;

		/* Record the IO epoch granted by the MDS for this open. */
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		ll_ioepoch_open(lli, body->ioepoch);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Remember the access mode this fd was opened with; used at close
	 * time to pick the matching open-handle slot. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
	return 0;
}
513
514/* Open a file, and (for the very first open) create objects on the OSTs at
515 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
516 * creation or open until ll_lov_setstripe() ioctl is called.
517 *
518 * If we already have the stripe MD locally then we don't request it in
519 * md_open(), by passing a lmm_size = 0.
520 *
521 * It is up to the application to ensure no other processes open this file
522 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
523 * used.  We might be able to avoid races of that sort by getting lli_open_sem
524 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
525 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
526 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_openerr, rc = -ENOMEM);

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* First opener of this directory becomes the statahead
		 * owner; recorded under lli_sa_lock. */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	/* Root needs no MDS open handle; just attach the fd. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open.  filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	 } else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file->f_dentry, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				GOTO(out_openerr, rc);

			/* Re-evaluate with the fresh disposition; the mode
			 * slots may have changed while unlocked. */
			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			GOTO(out_och_free, rc);

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* fd is now owned by the file (LUSTRE_FPRIVATE); don't free it in
	 * the error paths below. */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	ll_capa_open(inode);

	if (!lli->lli_has_smd &&
	    (cl_is_lov_delay_create(file->f_flags) ||
	     (file->f_mode & FMODE_WRITE) == 0)) {
		CDEBUG(D_INODE, "object creation was delayed\n");
		GOTO(out_och_free, rc);
	}
	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	/* Drop the extra request reference taken for the open enqueue. */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
710
711static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
712			struct ldlm_lock_desc *desc, void *data, int flag)
713{
714	int rc;
715	struct lustre_handle lockh;
716
717	switch (flag) {
718	case LDLM_CB_BLOCKING:
719		ldlm_lock2handle(lock, &lockh);
720		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
721		if (rc < 0) {
722			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
723			return rc;
724		}
725		break;
726	case LDLM_CB_CANCELING:
727		/* do nothing */
728		break;
729	}
730	return 0;
731}
732
733/**
734 * Acquire a lease and open the file.
735 */
struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
					fmode_t fmode, __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	/* A lease must be for exactly one of read or write. */
	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		return ERR_PTR(-EINVAL);

	if (file != NULL) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;
		__u64 *och_usecount;

		/* The requested mode must be covered by how the file was
		 * opened, and exec opens cannot take a lease. */
		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			return ERR_PTR(-EPERM);

		/* Get the openhandle of the file */
		rc = -EBUSY;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			mutex_unlock(&lli->lli_och_mutex);
			return ERR_PTR(rc);
		}

		/* Privatize the shared open handle to this fd, but only if
		 * we are its sole user; otherwise fail with -EBUSY. */
		if (fd->fd_och == NULL) {
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och != NULL);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
			} else {
				LASSERT(lli->lli_mds_read_och != NULL);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			}
			if (*och_usecount == 1) {
				fd->fd_och = *och_p;
				*och_p = NULL;
				*och_usecount = 0;
				rc = 0;
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */
			return ERR_PTR(rc);

		LASSERT(fd->fd_och != NULL);
		old_handle = fd->fd_och->och_fh;
	}

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		return ERR_PTR(-ENOMEM);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
					LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
				ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.d.lustre.it_lock_mode == 0 ||
	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
			it.d.lustre.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);
	return och;

out_close:
	/* Server granted the open but not a usable lease: close the new
	 * handle and drop the open lock before failing. */
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
	if (rc2)
		CERROR("Close openhandle returned %d\n", rc2);

	/* cancel open lock */
	if (it.d.lustre.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
						it.d.lustre.it_lock_mode);
		it.d.lustre.it_lock_mode = 0;
	}
out_release_it:
	ll_intent_release(&it);
out:
	OBD_FREE_PTR(och);
	return ERR_PTR(rc);
}
865EXPORT_SYMBOL(ll_lease_open);
866
867/**
868 * Release lease and close the file.
869 * It will check if the lease has ever broken.
870 */
871int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
872			bool *lease_broken)
873{
874	struct ldlm_lock *lock;
875	bool cancelled = true;
876	int rc;
877
878	lock = ldlm_handle2lock(&och->och_lease_handle);
879	if (lock != NULL) {
880		lock_res_and_lock(lock);
881		cancelled = ldlm_is_cancel(lock);
882		unlock_res_and_lock(lock);
883		ldlm_lock_put(lock);
884	}
885
886	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
887		PFID(&ll_i2info(inode)->lli_fid), cancelled);
888
889	if (!cancelled)
890		ldlm_cli_cancel(&och->och_lease_handle, 0);
891	if (lease_broken != NULL)
892		*lease_broken = cancelled;
893
894	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
895				       NULL);
896	return rc;
897}
898EXPORT_SYMBOL(ll_lease_close);
899
900/* Fills the obdo with the attributes for the lsm */
/* Fills the obdo with the attributes for the lsm */
static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
			  struct obd_capa *capa, struct obdo *obdo,
			  __u64 ioepoch, int sync)
{
	struct ptlrpc_request_set *set;
	struct obd_info	    oinfo = { { { 0 } } };
	int			rc;

	LASSERT(lsm != NULL);

	oinfo.oi_md = lsm;
	oinfo.oi_oa = obdo;
	oinfo.oi_oa->o_oi = lsm->lsm_oi;
	oinfo.oi_oa->o_mode = S_IFREG;
	oinfo.oi_oa->o_ioepoch = ioepoch;
	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
			       OBD_MD_FLDATAVERSION;
	oinfo.oi_capa = capa;
	/* @sync requests the getattr under a server-side lock so the
	 * returned attributes are not stale w.r.t. client caches. */
	if (sync) {
		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	}

	set = ptlrpc_prep_set();
	if (set == NULL) {
		CERROR("can't allocate ptlrpc set\n");
		rc = -ENOMEM;
	} else {
		rc = obd_getattr_async(exp, &oinfo, set);
		if (rc == 0)
			rc = ptlrpc_set_wait(set);
		ptlrpc_set_destroy(set);
	}
	/* Keep only the bits the OSTs are authoritative for. */
	if (rc == 0)
		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
					 OBD_MD_FLDATAVERSION);
	return rc;
}
945
946/**
947  * Performs the getattr on the inode and updates its fields.
948  * If @sync != 0, perform the getattr under the server-side lock.
949  */
int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
		     __u64 ioepoch, int sync)
{
	struct obd_capa      *capa = ll_mdscapa_get(inode);
	struct lov_stripe_md *lsm;
	int rc;

	/* Fetch OST-side attributes for the inode's stripes and, on
	 * success, refresh the inode fields from the returned obdo. */
	lsm = ccc_inode_lsm_get(inode);
	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
			    capa, obdo, ioepoch, sync);
	capa_put(capa);
	if (rc == 0) {
		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;

		obdo_refresh_inode(inode, obdo, obdo->o_valid);
		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
		       (unsigned long long)inode->i_blocks,
		       (unsigned long)ll_inode_blksize(inode));
	}
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
973
/**
 * Merge OST-side attributes (from the cl_object) with the MDS-side
 * timestamps cached in lli_lvb, updating the inode's size, blocks and
 * a/m/ctime under the inode size lock.
 *
 * \retval 0 on success, negative errno from cl_object_attr_get()
 */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
	inode_init_lvb(inode, &lvb);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* Take the newer of the MDS timestamps (seeded into lvb
		 * above) and the OST timestamps from @attr. */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
				PFID(&lli->lli_fid), attr->cat_size);
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
1016
1017int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1018		     lstat_t *st)
1019{
1020	struct obdo obdo = { 0 };
1021	int rc;
1022
1023	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1024	if (rc == 0) {
1025		st->st_size   = obdo.o_size;
1026		st->st_blocks = obdo.o_blocks;
1027		st->st_mtime  = obdo.o_mtime;
1028		st->st_atime  = obdo.o_atime;
1029		st->st_ctime  = obdo.o_ctime;
1030	}
1031	return rc;
1032}
1033
1034static bool file_is_noatime(const struct file *file)
1035{
1036	const struct vfsmount *mnt = file->f_path.mnt;
1037	const struct inode *inode = file->f_path.dentry->d_inode;
1038
1039	/* Adapted from file_accessed() and touch_atime().*/
1040	if (file->f_flags & O_NOATIME)
1041		return true;
1042
1043	if (inode->i_flags & S_NOATIME)
1044		return true;
1045
1046	if (IS_NOATIME(inode))
1047		return true;
1048
1049	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1050		return true;
1051
1052	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1053		return true;
1054
1055	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1056		return true;
1057
1058	return false;
1059}
1060
1061void ll_io_init(struct cl_io *io, const struct file *file, int write)
1062{
1063	struct inode *inode = file->f_dentry->d_inode;
1064
1065	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1066	if (write) {
1067		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1068		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1069				      file->f_flags & O_DIRECT ||
1070				      IS_SYNC(inode);
1071	}
1072	io->ci_obj     = ll_i2info(inode)->lli_clob;
1073	io->ci_lockreq = CILR_MAYBE;
1074	if (ll_file_nolock(file)) {
1075		io->ci_lockreq = CILR_NEVER;
1076		io->ci_no_srvlock = 1;
1077	} else if (file->f_flags & O_APPEND) {
1078		io->ci_lockreq = CILR_MANDATORY;
1079	}
1080
1081	io->ci_noatime = file_is_noatime(file);
1082}
1083
/**
 * Common back end for read, write and splice paths.
 *
 * Builds a cl_io from @args, runs the cl_io loop, and restarts the whole
 * I/O if the layout changed under it (ci_need_restart) and nothing was
 * transferred yet.  Returns bytes transferred or a negative errno.
 */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
	struct cl_io	 *io;
	ssize_t	       result;

restart:
	io = ccc_env_thread_io(env);
	ll_io_init(io, file, iot == CIT_WRITE);

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		struct vvp_io *vio = vvp_env_io(env);
		struct ccc_io *cio = ccc_env_io(env);
		int write_mutex_locked = 0;

		cio->cui_fd  = LUSTRE_FPRIVATE(file);
		vio->cui_io_subtype = args->via_io_subtype;

		switch (vio->cui_io_subtype) {
		case IO_NORMAL:
			cio->cui_iov = args->u.normal.via_iov;
			cio->cui_nrsegs = args->u.normal.via_nrsegs;
			cio->cui_tot_nrsegs = cio->cui_nrsegs;
			cio->cui_iocb = args->u.normal.via_iocb;
			/* Writes serialize on lli_write_mutex unless the
			 * caller holds a group lock; reads only take the
			 * truncate semaphore shared. */
			if ((iot == CIT_WRITE) &&
			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				if (mutex_lock_interruptible(&lli->
							       lli_write_mutex))
					GOTO(out, result = -ERESTARTSYS);
				write_mutex_locked = 1;
			} else if (iot == CIT_READ) {
				down_read(&lli->lli_trunc_sem);
			}
			break;
		case IO_SPLICE:
			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
			vio->u.splice.cui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
			LBUG();
		}
		result = cl_io_loop(env, io);
		/* drop whichever lock was taken above */
		if (write_mutex_locked)
			mutex_unlock(&lli->lli_write_mutex);
		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
			up_read(&lli->lli_trunc_sem);
	} else {
		/* cl_io_rw_init() handled IO */
		result = io->ci_result;
	}

	if (io->ci_nob > 0) {
		/* partial success: report bytes moved and advance position */
		result = io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos;
	}
	GOTO(out, result);
out:
	cl_io_fini(env, io);
	/* If any bit been read/written (result != 0), we just return
	 * short read/write instead of restart io. */
	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
		       iot == CIT_READ ? "read" : "write",
		       file->f_dentry->d_name.name, *ppos, count);
		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
		goto restart;
	}

	/* accounting: byte counters and the per-fd write-failure flag
	 * (used by flush/fsync error reporting) */
	if (iot == CIT_READ) {
		if (result >= 0)
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result >= 0) {
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result != -ERESTARTSYS) {
			fd->fd_write_failed = true;
		}
	}

	return result;
}
1173
1174static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1175				unsigned long nr_segs, loff_t pos)
1176{
1177	struct lu_env      *env;
1178	struct vvp_io_args *args;
1179	size_t	      count = 0;
1180	ssize_t	     result;
1181	int		 refcheck;
1182
1183	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1184	if (result)
1185		return result;
1186
1187	env = cl_env_get(&refcheck);
1188	if (IS_ERR(env))
1189		return PTR_ERR(env);
1190
1191	args = vvp_env_args(env, IO_NORMAL);
1192	args->u.normal.via_iov = (struct iovec *)iov;
1193	args->u.normal.via_nrsegs = nr_segs;
1194	args->u.normal.via_iocb = iocb;
1195
1196	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1197				    &iocb->ki_pos, count);
1198	cl_env_put(env, &refcheck);
1199	return result;
1200}
1201
1202static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1203			    loff_t *ppos)
1204{
1205	struct lu_env *env;
1206	struct iovec  *local_iov;
1207	struct kiocb  *kiocb;
1208	ssize_t	result;
1209	int	    refcheck;
1210
1211	env = cl_env_get(&refcheck);
1212	if (IS_ERR(env))
1213		return PTR_ERR(env);
1214
1215	local_iov = &vvp_env_info(env)->vti_local_iov;
1216	kiocb = &vvp_env_info(env)->vti_kiocb;
1217	local_iov->iov_base = (void __user *)buf;
1218	local_iov->iov_len = count;
1219	init_sync_kiocb(kiocb, file);
1220	kiocb->ki_pos = *ppos;
1221	kiocb->ki_nbytes = count;
1222
1223	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1224	*ppos = kiocb->ki_pos;
1225
1226	cl_env_put(env, &refcheck);
1227	return result;
1228}
1229
1230/*
1231 * Write to a file (through the page cache).
1232 */
1233static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1234				 unsigned long nr_segs, loff_t pos)
1235{
1236	struct lu_env      *env;
1237	struct vvp_io_args *args;
1238	size_t	      count = 0;
1239	ssize_t	     result;
1240	int		 refcheck;
1241
1242	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
1243	if (result)
1244		return result;
1245
1246	env = cl_env_get(&refcheck);
1247	if (IS_ERR(env))
1248		return PTR_ERR(env);
1249
1250	args = vvp_env_args(env, IO_NORMAL);
1251	args->u.normal.via_iov = (struct iovec *)iov;
1252	args->u.normal.via_nrsegs = nr_segs;
1253	args->u.normal.via_iocb = iocb;
1254
1255	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1256				  &iocb->ki_pos, count);
1257	cl_env_put(env, &refcheck);
1258	return result;
1259}
1260
1261static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1262			     loff_t *ppos)
1263{
1264	struct lu_env *env;
1265	struct iovec  *local_iov;
1266	struct kiocb  *kiocb;
1267	ssize_t	result;
1268	int	    refcheck;
1269
1270	env = cl_env_get(&refcheck);
1271	if (IS_ERR(env))
1272		return PTR_ERR(env);
1273
1274	local_iov = &vvp_env_info(env)->vti_local_iov;
1275	kiocb = &vvp_env_info(env)->vti_kiocb;
1276	local_iov->iov_base = (void __user *)buf;
1277	local_iov->iov_len = count;
1278	init_sync_kiocb(kiocb, file);
1279	kiocb->ki_pos = *ppos;
1280	kiocb->ki_nbytes = count;
1281
1282	result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1283	*ppos = kiocb->ki_pos;
1284
1285	cl_env_put(env, &refcheck);
1286	return result;
1287}
1288
1289
1290
1291/*
1292 * Send file content (through pagecache) somewhere with helper
1293 */
1294static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1295				   struct pipe_inode_info *pipe, size_t count,
1296				   unsigned int flags)
1297{
1298	struct lu_env      *env;
1299	struct vvp_io_args *args;
1300	ssize_t	     result;
1301	int		 refcheck;
1302
1303	env = cl_env_get(&refcheck);
1304	if (IS_ERR(env))
1305		return PTR_ERR(env);
1306
1307	args = vvp_env_args(env, IO_SPLICE);
1308	args->u.splice.via_pipe = pipe;
1309	args->u.splice.via_flags = flags;
1310
1311	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1312	cl_env_put(env, &refcheck);
1313	return result;
1314}
1315
/**
 * Ask the OST to recreate a lost/destroyed object for this inode.
 *
 * \param oi      object id to recreate
 * \param ost_idx target OST index (carried in o_nlink for this call)
 *
 * A private copy of the stripe md is handed to obd_create() because the
 * create path may modify it.  \retval 0 on success, negative errno otherwise.
 */
static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
			   obd_count ost_idx)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct obd_trans_info oti = { 0 };
	struct obdo *oa = NULL;
	int lsm_size;
	int rc = 0;
	struct lov_stripe_md *lsm = NULL, *lsm2;

	OBDO_ALLOC(oa);
	if (oa == NULL)
		return -ENOMEM;

	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm))
		GOTO(out, rc = -ENOENT);

	/* size of lsm plus its per-stripe lov_oinfo array */
	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
		   (lsm->lsm_stripe_count));

	OBD_ALLOC_LARGE(lsm2, lsm_size);
	if (lsm2 == NULL)
		GOTO(out, rc = -ENOMEM);

	/* o_nlink is reused to carry the OST index for recreation */
	oa->o_oi = *oi;
	oa->o_nlink = ost_idx;
	oa->o_flags |= OBD_FL_RECREATE_OBJS;
	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
	/* work on a throw-away copy of the stripe md */
	memcpy(lsm2, lsm, lsm_size);
	ll_inode_size_lock(inode);
	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
	ll_inode_size_unlock(inode);

	OBD_FREE_LARGE(lsm2, lsm_size);
	GOTO(out, rc);
out:
	ccc_inode_lsm_put(inode, lsm);
	OBDO_FREE(oa);
	return rc;
}
1360
1361static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1362{
1363	struct ll_recreate_obj ucreat;
1364	struct ost_id		oi;
1365
1366	if (!capable(CFS_CAP_SYS_ADMIN))
1367		return -EPERM;
1368
1369	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1370			   sizeof(ucreat)))
1371		return -EFAULT;
1372
1373	ostid_set_seq_mdt0(&oi);
1374	ostid_set_id(&oi, ucreat.lrc_id);
1375	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1376}
1377
1378static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1379{
1380	struct lu_fid	fid;
1381	struct ost_id	oi;
1382	obd_count	ost_idx;
1383
1384	if (!capable(CFS_CAP_SYS_ADMIN))
1385		return -EPERM;
1386
1387	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1388		return -EFAULT;
1389
1390	fid_to_ostid(&fid, &oi);
1391	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1392	return ll_lov_recreate(inode, &oi, ost_idx);
1393}
1394
/**
 * Set the striping EA on @inode by replaying an open intent carrying the
 * user-supplied lov_user_md.  Fails with -EEXIST if the file is already
 * striped.  Always clears the LOV delay-create flag on the file before
 * returning.
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
			     int flags, struct lov_user_md *lum, int lum_size)
{
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
	int rc = 0;

	/* striping can only be set once */
	lsm = ccc_inode_lsm_get(inode);
	if (lsm != NULL) {
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
		       inode->i_ino);
		GOTO(out, rc = -EEXIST);
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(file, lum, lum_size, &oit);
	if (rc)
		GOTO(out_unlock, rc);
	rc = oit.d.lustre.it_status;
	if (rc < 0)
		GOTO(out_req_free, rc);

	/* the open handle obtained by the intent is not needed; close it */
	ll_release_openhandle(file->f_dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	/* lsm is NULL on this path; the put is a tolerated no-op */
	ccc_inode_lsm_put(inode, lsm);
out:
	cl_lov_delay_create_clear(&file->f_flags);
	return rc;
out_req_free:
	/* drop the request pinned by the intent, then unwind normally */
	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
	goto out;
}
1431
/**
 * Fetch the LOV EA of @filename (a child of @inode) from the MDS.
 *
 * On success *lmmp/*lmm_size describe the EA inside the reply buffer of
 * *request; the caller owns *request and must ptlrpc_req_finished() it
 * (which also invalidates *lmmp).  The EA is byte-swapped to host endian
 * on big-endian machines.
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body  *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	/* size the getattr reply buffer for the largest possible EA */
	rc = ll_get_max_mdsize(sbi, &lmmsize);
	if (rc)
		return rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
			lmmsize == 0) {
		/* file or directory has no striping EA */
		GOTO(out, rc = -ENODATA);
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		GOTO(out, rc = -EPROTO);
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian.  We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count;

		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		/* released (HSM-archived) files have no objects to swab */
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
			stripe_count = 0;

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 stripe_count);
		}
	}

out:
	/* hand the (possibly NULL) EA and the pinned request to the caller */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1515
1516static int ll_lov_setea(struct inode *inode, struct file *file,
1517			    unsigned long arg)
1518{
1519	int			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1520	struct lov_user_md	*lump;
1521	int			 lum_size = sizeof(struct lov_user_md) +
1522					    sizeof(struct lov_user_ost_data);
1523	int			 rc;
1524
1525	if (!capable(CFS_CAP_SYS_ADMIN))
1526		return -EPERM;
1527
1528	OBD_ALLOC_LARGE(lump, lum_size);
1529	if (lump == NULL)
1530		return -ENOMEM;
1531
1532	if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1533		OBD_FREE_LARGE(lump, lum_size);
1534		return -EFAULT;
1535	}
1536
1537	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1538
1539	OBD_FREE_LARGE(lump, lum_size);
1540	return rc;
1541}
1542
1543static int ll_lov_setstripe(struct inode *inode, struct file *file,
1544			    unsigned long arg)
1545{
1546	struct lov_user_md_v3	 lumv3;
1547	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
1548	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
1549	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
1550	int			 lum_size, rc;
1551	int			 flags = FMODE_WRITE;
1552
1553	/* first try with v1 which is smaller than v3 */
1554	lum_size = sizeof(struct lov_user_md_v1);
1555	if (copy_from_user(lumv1, lumv1p, lum_size))
1556		return -EFAULT;
1557
1558	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1559		lum_size = sizeof(struct lov_user_md_v3);
1560		if (copy_from_user(&lumv3, lumv3p, lum_size))
1561			return -EFAULT;
1562	}
1563
1564	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1565	if (rc == 0) {
1566		struct lov_stripe_md *lsm;
1567		__u32 gen;
1568
1569		put_user(0, &lumv1p->lmm_stripe_count);
1570
1571		ll_layout_refresh(inode, &gen);
1572		lsm = ccc_inode_lsm_get(inode);
1573		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1574				   0, lsm, (void *)arg);
1575		ccc_inode_lsm_put(inode, lsm);
1576	}
1577	return rc;
1578}
1579
1580static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1581{
1582	struct lov_stripe_md *lsm;
1583	int rc = -ENODATA;
1584
1585	lsm = ccc_inode_lsm_get(inode);
1586	if (lsm != NULL)
1587		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1588				   lsm, (void *)arg);
1589	ccc_inode_lsm_put(inode, lsm);
1590	return rc;
1591}
1592
/**
 * LL_IOC_GROUP_LOCK: take a cluster-wide group lock with id @arg on behalf
 * of this open file descriptor.  Only one group lock per fd; -EINVAL if
 * one is already held, -EOPNOTSUPP on nolock mounts.
 */
int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;
	int		     rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	/* first check under the lock: fd must not already hold one */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* the enqueue may block, so it happens outside the spinlock */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		return rc;

	/* re-check: another thread may have installed a lock meanwhile */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		return -EINVAL;
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	return 0;
}
1633
/**
 * LL_IOC_GROUP_UNLOCK: release the group lock with id @arg held by this
 * open file descriptor.  -EINVAL if no lock is held or the id mismatches.
 */
int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		       arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}

	/* detach the lock from the fd under the spinlock, release it after */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	return 0;
}
1664
1665/**
1666 * Close inode open handle
1667 *
1668 * \param dentry [in]     dentry which contains the inode
1669 * \param it     [in,out] intent which contains open info and result
1670 *
1671 * \retval 0     success
1672 * \retval <0    failure
1673 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
{
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;
	int rc;

	LASSERT(inode);

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)
		return 0;

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))
		return 0;

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
	if (!och)
		GOTO(out, rc = -ENOMEM);

	/* transfer the open handle from the intent into och, then close it;
	 * ll_close_inode_openhandle() consumes och */
	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
				       inode, och, NULL);
out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}
	return rc;
}
1708
1709/**
1710 * Get size for inode for which FIEMAP mapping is requested.
1711 * Make the FIEMAP get_info call and returns the result.
1712 */
int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
	      int num_bytes)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct lov_stripe_md *lsm = NULL;
	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
	int vallen = num_bytes;
	int rc;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* tell userspace which flags we do support (-EBADR protocol) */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
		return -EBADR;
	}

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);
		if (rc)
			return rc;
	}

	lsm = ccc_inode_lsm_get(inode);
	if (lsm == NULL)
		return -ENOENT;

	/* If the stripe_count > 1 and the application does not understand
	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
	 */
	if (lsm->lsm_stripe_count > 1 &&
	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
		GOTO(out, rc = -EOPNOTSUPP);

	/* pack the object id and current file size into the get_info key */
	fm_key.oa.o_oi = lsm->lsm_oi;
	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;

	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
	/* If filesize is 0, then there would be no objects for mapping */
	if (fm_key.oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;
		GOTO(out, rc = 0);
	}

	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));

	/* the LOV layer fans the request out per stripe and merges extents */
	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
			  fiemap, lsm);
	if (rc)
		CERROR("obd_get_info failed: rc = %d\n", rc);

out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1768
1769int ll_fid2path(struct inode *inode, void *arg)
1770{
1771	struct obd_export	*exp = ll_i2mdexp(inode);
1772	struct getinfo_fid2path	*gfout, *gfin;
1773	int			 outsize, rc;
1774
1775	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1776	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1777		return -EPERM;
1778
1779	/* Need to get the buflen */
1780	OBD_ALLOC_PTR(gfin);
1781	if (gfin == NULL)
1782		return -ENOMEM;
1783	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1784		OBD_FREE_PTR(gfin);
1785		return -EFAULT;
1786	}
1787
1788	outsize = sizeof(*gfout) + gfin->gf_pathlen;
1789	OBD_ALLOC(gfout, outsize);
1790	if (gfout == NULL) {
1791		OBD_FREE_PTR(gfin);
1792		return -ENOMEM;
1793	}
1794	memcpy(gfout, gfin, sizeof(*gfout));
1795	OBD_FREE_PTR(gfin);
1796
1797	/* Call mdc_iocontrol */
1798	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1799	if (rc)
1800		GOTO(gf_free, rc);
1801
1802	if (copy_to_user(arg, gfout, outsize))
1803		rc = -EFAULT;
1804
1805gf_free:
1806	OBD_FREE(gfout, outsize);
1807	return rc;
1808}
1809
1810static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1811{
1812	struct ll_user_fiemap *fiemap_s;
1813	size_t num_bytes, ret_bytes;
1814	unsigned int extent_count;
1815	int rc = 0;
1816
1817	/* Get the extent count so we can calculate the size of
1818	 * required fiemap buffer */
1819	if (get_user(extent_count,
1820	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1821		return -EFAULT;
1822	num_bytes = sizeof(*fiemap_s) + (extent_count *
1823					 sizeof(struct ll_fiemap_extent));
1824
1825	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1826	if (fiemap_s == NULL)
1827		return -ENOMEM;
1828
1829	/* get the fiemap value */
1830	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1831			   sizeof(*fiemap_s)))
1832		GOTO(error, rc = -EFAULT);
1833
1834	/* If fm_extent_count is non-zero, read the first extent since
1835	 * it is used to calculate end_offset and device from previous
1836	 * fiemap call. */
1837	if (extent_count) {
1838		if (copy_from_user(&fiemap_s->fm_extents[0],
1839		    (char __user *)arg + sizeof(*fiemap_s),
1840		    sizeof(struct ll_fiemap_extent)))
1841			GOTO(error, rc = -EFAULT);
1842	}
1843
1844	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1845	if (rc)
1846		GOTO(error, rc);
1847
1848	ret_bytes = sizeof(struct ll_user_fiemap);
1849
1850	if (extent_count != 0)
1851		ret_bytes += (fiemap_s->fm_mapped_extents *
1852				 sizeof(struct ll_fiemap_extent));
1853
1854	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1855		rc = -EFAULT;
1856
1857error:
1858	OBD_FREE_LARGE(fiemap_s, num_bytes);
1859	return rc;
1860}
1861
1862/*
1863 * Read the data_version for inode.
1864 *
1865 * This value is computed using stripe object version on OST.
1866 * Version is computed using server side locking.
1867 *
1868 * @param extent_lock  Take extent lock. Not needed if a process is already
1869 *		       holding the OST object group locks.
1870 */
int ll_data_version(struct inode *inode, __u64 *data_version,
		    int extent_lock)
{
	struct lov_stripe_md	*lsm = NULL;
	struct ll_sb_info	*sbi = ll_i2sbi(inode);
	struct obdo		*obdo = NULL;
	int			 rc;

	/* If no stripe, we consider version is 0. */
	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm)) {
		*data_version = 0;
		CDEBUG(D_INODE, "No object for inode\n");
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(obdo);
	if (obdo == NULL)
		GOTO(out, rc = -ENOMEM);

	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
	if (rc == 0) {
		/* the server must explicitly say it returned a version;
		 * older OSTs do not support it */
		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
			rc = -EOPNOTSUPP;
		else
			*data_version = obdo->o_data_version;
	}

	OBD_FREE_PTR(obdo);
out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1904
1905/*
1906 * Trigger a HSM release request for the provided inode.
1907 */
int ll_hsm_release(struct inode *inode)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;
	int rc;


	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	/* take a write lease so no other client can modify the file
	 * while it is being released */
	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
	if (IS_ERR(och))
		GOTO(out, rc = PTR_ERR(och));

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, 1);
	if (rc != 0)
		GOTO(out, rc);

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		GOTO(out, rc = PTR_ERR(env));

	/* refresh inode size/times from the OSTs before the close RPC */
	ll_merge_lvb(env, inode);
	cl_env_nested_put(&nest, env);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       &data_version);
	/* och ownership was consumed by the close above */
	och = NULL;


out:
	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);

	return rc;
}
1951
/* Scratch state for ll_swap_layouts(): saved timestamps, expected data
 * versions and inode pointers, kept in pairs so they can be swap()ed
 * together when the inodes are reordered. */
struct ll_swap_stack {
	struct iattr		 ia1, ia2;	/* saved [am]time of each inode */
	__u64			 dv1, dv2;	/* expected data versions */
	struct inode		*inode1, *inode2;
	bool			 check_dv1, check_dv2; /* verify dv before swap */
};
1958
/**
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of two files.
 *
 * Optionally flushes dirty cache under a group lock (lsl->sl_gid != 0),
 * verifies the expected data versions, performs the swap on the MDT, and
 * restores mtime/atime if the corresponding KEEP flags are set.
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
{
	struct mdc_swap_layouts	 msl;
	struct md_op_data	*op_data;
	__u32			 gid;
	__u64			 dv;
	struct ll_swap_stack	*llss = NULL;
	int			 rc;

	OBD_ALLOC_PTR(llss);
	if (llss == NULL)
		return -ENOMEM;

	llss->inode1 = file1->f_dentry->d_inode;
	llss->inode2 = file2->f_dentry->d_inode;

	/* both files must be regular, writable, and on the same fs */
	if (!S_ISREG(llss->inode2->i_mode))
		GOTO(free, rc = -EINVAL);

	if (inode_permission(llss->inode1, MAY_WRITE) ||
	    inode_permission(llss->inode2, MAY_WRITE))
		GOTO(free, rc = -EPERM);

	if (llss->inode2->i_sb != llss->inode1->i_sb)
		GOTO(free, rc = -EXDEV);

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */
		GOTO(free, rc = 0);

	/* order the pair by FID so concurrent swaps cannot deadlock
	 * taking the group locks in opposite order */
	if (rc < 0) { /* sequentialize it */
		swap(llss->inode1, llss->inode2);
		swap(file1, file2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);
	}

	gid = lsl->sl_gid;
	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);
		if (rc < 0)
			GOTO(free, rc);

		rc = ll_get_grouplock(llss->inode2, file2, gid);
		if (rc < 0) {
			ll_put_grouplock(llss->inode1, file1, gid);
			GOTO(free, rc);
		}
	}

	/* to be able to restore mtime and atime after swap
	 * we need to first save them */
	if (lsl->sl_flags &
	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_mtime = llss->inode1->i_mtime;
		llss->ia1.ia_atime = llss->inode1->i_atime;
		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
		llss->ia2.ia_mtime = llss->inode2->i_mtime;
		llss->ia2.ia_atime = llss->inode2->i_atime;
		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
	}

	/* ultimate check, before swapping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv1)
			GOTO(putgl, rc = -EAGAIN);
	}

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv2)
			GOTO(putgl, rc = -EAGAIN);
	}

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */
	msl.msl_flags = 0;
	rc = -ENOMEM;
	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (IS_ERR(op_data))
		GOTO(free, rc = PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
			   sizeof(*op_data), op_data, NULL);
	ll_finish_md_op_data(op_data);

putgl:
	if (gid != 0) {
		ll_put_grouplock(llss->inode2, file2, gid);
		ll_put_grouplock(llss->inode1, file1, gid);
	}

	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
	if (rc != 0)
		GOTO(free, rc);

	/* clear useless flags */
	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
		llss->ia1.ia_valid &= ~ATTR_MTIME;
		llss->ia2.ia_valid &= ~ATTR_MTIME;
	}

	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_valid &= ~ATTR_ATIME;
		llss->ia2.ia_valid &= ~ATTR_ATIME;
	}

	/* update time if requested */
	/* note: each inode receives the OTHER file's saved times, since
	 * the layouts (and hence the data) have been exchanged */
	rc = 0;
	if (llss->ia2.ia_valid != 0) {
		mutex_lock(&llss->inode1->i_mutex);
		rc = ll_setattr(file1->f_dentry, &llss->ia2);
		mutex_unlock(&llss->inode1->i_mutex);
	}

	if (llss->ia1.ia_valid != 0) {
		int rc1;

		mutex_lock(&llss->inode2->i_mutex);
		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
		mutex_unlock(&llss->inode2->i_mutex);
		if (rc == 0)
			rc = rc1;
	}

free:
	if (llss != NULL)
		OBD_FREE_PTR(llss);

	return rc;
}
2112
2113static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2114{
2115	struct md_op_data	*op_data;
2116	int			 rc;
2117
2118	/* Non-root users are forbidden to set or clear flags which are
2119	 * NOT defined in HSM_USER_MASK. */
2120	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2121	    !capable(CFS_CAP_SYS_ADMIN))
2122		return -EPERM;
2123
2124	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2125				     LUSTRE_OPC_ANY, hss);
2126	if (IS_ERR(op_data))
2127		return PTR_ERR(op_data);
2128
2129	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2130			   sizeof(*op_data), op_data, NULL);
2131
2132	ll_finish_md_op_data(op_data);
2133
2134	return rc;
2135}
2136
/*
 * Import an existing archive copy as a released Lustre file.
 *
 * First marks the file on the MDT as archived/exists/released via
 * ll_hsm_state_set(), then restores the attributes recorded by the
 * copytool (mode, uid/gid, size, mtime/atime) with ll_setattr_raw().
 *
 * \param inode  file being imported; must be a regular file
 * \param file   open file handle used for the setattr
 * \param hui    import parameters already copied in from user space
 *
 * \retval 0 on success, negative errno on failure
 */
static int ll_hsm_import(struct inode *inode, struct file *file,
			 struct hsm_user_import *hui)
{
	struct hsm_state_set	*hss = NULL;
	struct iattr		*attr = NULL;
	int			 rc;


	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	/* set HSM flags */
	OBD_ALLOC_PTR(hss);
	if (hss == NULL)
		GOTO(out, rc = -ENOMEM);

	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
	hss->hss_archive_id = hui->hui_archive_id;
	/* file shows up as released: its data stays in the archive */
	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
	rc = ll_hsm_state_set(inode, hss);
	if (rc != 0)
		GOTO(out, rc);

	OBD_ALLOC_PTR(attr);
	if (attr == NULL)
		GOTO(out, rc = -ENOMEM);

	/* keep only the permission bits from user space and force the
	 * regular-file type bit */
	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
	attr->ia_mode |= S_IFREG;
	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
	attr->ia_size = hui->hui_size;
	attr->ia_mtime.tv_sec = hui->hui_mtime;
	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
	attr->ia_atime.tv_sec = hui->hui_atime;
	attr->ia_atime.tv_nsec = hui->hui_atime_ns;

	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
			 ATTR_UID | ATTR_GID |
			 ATTR_MTIME | ATTR_MTIME_SET |
			 ATTR_ATIME | ATTR_ATIME_SET;

	rc = ll_setattr_raw(file->f_dentry, attr, true);
	/* -ENODATA is tolerated here (presumably because the released file
	 * has no data object yet — TODO confirm against ll_setattr_raw) */
	if (rc == -ENODATA)
		rc = 0;

out:
	if (hss != NULL)
		OBD_FREE_PTR(hss);

	if (attr != NULL)
		OBD_FREE_PTR(attr);

	return rc;
}
2192
/*
 * Main ioctl entry point for Lustre regular files.
 *
 * Dispatches Lustre-specific commands (striping, layout swap, HSM,
 * leases, group locks, fid/path translation, ...) and forwards unknown
 * commands to obd_iocontrol().  Depending on the command, @arg is either
 * an immediate value or a user-space pointer.
 */
long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode		*inode = file->f_dentry->d_inode;
	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
	int			 flags, rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
	       inode->i_generation, inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
		return -ENOTTY;

	switch(cmd) {
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 *     not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int *) arg))
			return -EFAULT;

		if (cmd == LL_IOC_SETFLAGS) {
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on "
				       "non-O_DIRECT file\n", current->comm);
				return -EINVAL;
			}

			fd->fd_flags |= flags;
		} else {
			fd->fd_flags &= ~flags;
		}
		return 0;
	case LL_IOC_LOV_SETSTRIPE:
		return ll_lov_setstripe(inode, file, arg);
	case LL_IOC_LOV_SETEA:
		return ll_lov_setea(inode, file, arg);
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct file *file2;
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char *)arg,
				       sizeof(struct lustre_swap_layouts)))
			return -EFAULT;

		/* both files must be opened for writing */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
			return -EPERM;

		file2 = fget(lsl.sl_fd);
		if (file2 == NULL)
			return -EBADF;

		rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);
		fput(file2);
		return rc;
	}
	case LL_IOC_LOV_GETSTRIPE:
		return ll_lov_getstripe(inode, arg);
	case LL_IOC_RECREATE_OBJ:
		return ll_lov_recreate_obj(inode, arg);
	case LL_IOC_RECREATE_FID:
		return ll_lov_recreate_fid(inode, arg);
	case FSFILT_IOC_FIEMAP:
		return ll_ioctl_fiemap(inode, arg);
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		return ll_iocontrol(inode, file, cmd, arg);
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		return put_user(inode->i_generation, (int *)arg);
	case LL_IOC_GROUP_LOCK:
		return ll_get_grouplock(inode, file, arg);
	case LL_IOC_GROUP_UNLOCK:
		return ll_put_grouplock(inode, file, arg);
	case IOC_OBD_STATFS:
		return ll_obd_statfs(inode, (void *)arg);

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		return ll_flush_ctx(inode);
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_FID2PATH:
		return ll_fid2path(inode, (void *)arg);
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version	idv;
		int			rc;

		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
			return -EFAULT;

		/* unless LL_DV_NOFLUSH is set, dirty data is flushed so the
		 * returned version matches what is on disk */
		rc = ll_data_version(inode, &idv.idv_version,
				!(idv.idv_flags & LL_DV_NOFLUSH));

		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
			return -EFAULT;

		return rc;
	}

	case LL_IOC_GET_MDTIDX: {
		int mdtidx;

		mdtidx = ll_get_mdt_idx(inode);
		if (mdtidx < 0)
			return mdtidx;

		if (put_user((int)mdtidx, (int*)arg))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		return ll_get_obd_name(inode, cmd, arg);
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data	*op_data;
		struct hsm_user_state	*hus;
		int			 rc;

		OBD_ALLOC_PTR(hus);
		if (hus == NULL)
			return -ENOMEM;

		/* hus is filled in by the MDT via op_data->op_data */
		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hus);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hus);
		return rc;
	}
	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set	*hss;
		int			 rc;

		OBD_ALLOC_PTR(hss);
		if (hss == NULL)
			return -ENOMEM;

		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
			OBD_FREE_PTR(hss);
			return -EFAULT;
		}

		rc = ll_hsm_state_set(inode, hss);

		OBD_FREE_PTR(hss);
		return rc;
	}
	case LL_IOC_HSM_ACTION: {
		struct md_op_data		*op_data;
		struct hsm_current_action	*hca;
		int				 rc;

		OBD_ALLOC_PTR(hca);
		if (hca == NULL)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hca);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hca);
		return rc;
	}
	case LL_IOC_SET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct obd_client_handle *och = NULL;
		bool lease_broken;
		fmode_t mode = 0;

		switch (arg) {
		case F_WRLCK:
			if (!(file->f_mode & FMODE_WRITE))
				return -EPERM;
			mode = FMODE_WRITE;
			break;
		case F_RDLCK:
			if (!(file->f_mode & FMODE_READ))
				return -EPERM;
			mode = FMODE_READ;
			break;
		case F_UNLCK:
			/* detach the lease handle from the fd under the
			 * mutex, then close it outside the lock */
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			}
			mutex_unlock(&lli->lli_och_mutex);

			if (och != NULL) {
				mode = och->och_flags &
				       (FMODE_READ|FMODE_WRITE);
				rc = ll_lease_close(och, inode, &lease_broken);
				if (rc == 0 && lease_broken)
					mode = 0;
			} else {
				rc = -ENOLCK;
			}

			/* return the type of lease or error */
			return rc < 0 ? rc : (int)mode;
		default:
			return -EINVAL;
		}

		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);

		/* apply for lease */
		och = ll_lease_open(inode, file, mode, 0);
		if (IS_ERR(och))
			return PTR_ERR(och);

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och == NULL) {
			fd->fd_lease_och = och;
			och = NULL;
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (och != NULL) {
			/* impossible now that only excl is supported for now */
			ll_lease_close(och, inode, &lease_broken);
			rc = -EBUSY;
		}
		return rc;
	}
	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;

		/* report the lease mode only while the lease lock is still
		 * granted (not being cancelled) */
		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;

			lock = ldlm_handle2lock(&och->och_lease_handle);
			if (lock != NULL) {
				lock_res_and_lock(lock);
				if (!ldlm_is_cancel(lock))
					rc = och->och_flags &
						(FMODE_READ | FMODE_WRITE);
				unlock_res_and_lock(lock);
				ldlm_lock_put(lock);
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		return rc;
	}
	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;

		OBD_ALLOC_PTR(hui);
		if (hui == NULL)
			return -ENOMEM;

		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
			OBD_FREE_PTR(hui);
			return -EFAULT;
		}

		rc = ll_hsm_import(inode, file, hui);

		OBD_FREE_PTR(hui);
		return rc;
	}
	default: {
		int err;

		/* give registered ioctl handlers a chance first, then fall
		 * back to the generic OBD path */
		if (LLIOC_STOP ==
		     ll_iocontrol_call(inode, file, cmd, arg, &err))
			return err;

		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void *)arg);
	}
	}
}
2510
2511
2512loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2513{
2514	struct inode *inode = file->f_dentry->d_inode;
2515	loff_t retval, eof = 0;
2516
2517	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2518			   (origin == SEEK_CUR) ? file->f_pos : 0);
2519	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2520	       inode->i_ino, inode->i_generation, inode, retval, retval,
2521	       origin);
2522	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2523
2524	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2525		retval = ll_glimpse_size(inode);
2526		if (retval != 0)
2527			return retval;
2528		eof = i_size_read(inode);
2529	}
2530
2531	retval = generic_file_llseek_size(file, offset, origin,
2532					  ll_file_maxbytes(inode), eof);
2533	return retval;
2534}
2535
2536int ll_flush(struct file *file, fl_owner_t id)
2537{
2538	struct inode *inode = file->f_dentry->d_inode;
2539	struct ll_inode_info *lli = ll_i2info(inode);
2540	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2541	int rc, err;
2542
2543	LASSERT(!S_ISDIR(inode->i_mode));
2544
2545	/* catch async errors that were recorded back when async writeback
2546	 * failed for pages in this mapping. */
2547	rc = lli->lli_async_rc;
2548	lli->lli_async_rc = 0;
2549	err = lov_read_and_clear_async_rc(lli->lli_clob);
2550	if (rc == 0)
2551		rc = err;
2552
2553	/* The application has been told write failure already.
2554	 * Do not report failure again. */
2555	if (fd->fd_write_failed)
2556		return 0;
2557	return rc ? -EIO : 0;
2558}
2559
2560/**
2561 * Called to make sure a portion of file has been written out.
2562 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2563 *
2564 * Return how many pages have been written.
2565 */
2566int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2567		       enum cl_fsync_mode mode, int ignore_layout)
2568{
2569	struct cl_env_nest nest;
2570	struct lu_env *env;
2571	struct cl_io *io;
2572	struct obd_capa *capa = NULL;
2573	struct cl_fsync_io *fio;
2574	int result;
2575
2576	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2577	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2578		return -EINVAL;
2579
2580	env = cl_env_nested_get(&nest);
2581	if (IS_ERR(env))
2582		return PTR_ERR(env);
2583
2584	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2585
2586	io = ccc_env_thread_io(env);
2587	io->ci_obj = cl_i2info(inode)->lli_clob;
2588	io->ci_ignore_layout = ignore_layout;
2589
2590	/* initialize parameters for sync */
2591	fio = &io->u.ci_fsync;
2592	fio->fi_capa = capa;
2593	fio->fi_start = start;
2594	fio->fi_end = end;
2595	fio->fi_fid = ll_inode2fid(inode);
2596	fio->fi_mode = mode;
2597	fio->fi_nr_written = 0;
2598
2599	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2600		result = cl_io_loop(env, io);
2601	else
2602		result = io->ci_result;
2603	if (result == 0)
2604		result = fio->fi_nr_written;
2605	cl_io_fini(env, io);
2606	cl_env_nested_put(&nest, env);
2607
2608	capa_put(capa);
2609
2610	return result;
2611}
2612
2613/*
2614 * When dentry is provided (the 'else' case), *file->f_dentry may be
2615 * null and dentry must be used directly rather than pulled from
2616 * *file->f_dentry as is done otherwise.
2617 */
2618
2619int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2620{
2621	struct dentry *dentry = file->f_dentry;
2622	struct inode *inode = dentry->d_inode;
2623	struct ll_inode_info *lli = ll_i2info(inode);
2624	struct ptlrpc_request *req;
2625	struct obd_capa *oc;
2626	int rc, err;
2627
2628	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2629	       inode->i_generation, inode);
2630	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2631
2632	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2633	mutex_lock(&inode->i_mutex);
2634
2635	/* catch async errors that were recorded back when async writeback
2636	 * failed for pages in this mapping. */
2637	if (!S_ISDIR(inode->i_mode)) {
2638		err = lli->lli_async_rc;
2639		lli->lli_async_rc = 0;
2640		if (rc == 0)
2641			rc = err;
2642		err = lov_read_and_clear_async_rc(lli->lli_clob);
2643		if (rc == 0)
2644			rc = err;
2645	}
2646
2647	oc = ll_mdscapa_get(inode);
2648	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2649		      &req);
2650	capa_put(oc);
2651	if (!rc)
2652		rc = err;
2653	if (!err)
2654		ptlrpc_req_finished(req);
2655
2656	if (datasync && S_ISREG(inode->i_mode)) {
2657		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2658
2659		err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2660				CL_FSYNC_ALL, 0);
2661		if (rc == 0 && err < 0)
2662			rc = err;
2663		if (rc < 0)
2664			fd->fd_write_failed = true;
2665		else
2666			fd->fd_write_failed = false;
2667	}
2668
2669	mutex_unlock(&inode->i_mutex);
2670	return rc;
2671}
2672
/*
 * fcntl()/flock() lock handler.
 *
 * Translates the VFS file_lock into an LDLM_FLOCK enqueue on the MDS,
 * then mirrors the result into the kernel's local lock lists
 * (flock_lock_file_wait()/posix_lock_file_wait()) so that VFS
 * bookkeeping stays consistent.  If the local update fails, the server
 * lock is rolled back with an LCK_NL (unlock) enqueue.
 */
int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	int flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	if (file_lock->fl_flags & FL_FLOCK) {
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
		/* flocks are whole-file locks */
		flock.l_flock.end = OFFSET_MAX;
		/* For flocks owner is determined by the local file descriptor*/
		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
	} else if (file_lock->fl_flags & FL_POSIX) {
		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
		flock.l_flock.start = file_lock->fl_start;
		flock.l_flock.end = file_lock->fl_end;
	} else {
		return -EINVAL;
	}
	flock.l_flock.pid = file_lock->fl_pid;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	/* map the VFS lock type onto an LDLM lock mode */
	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
			file_lock->fl_type);
		return -ENOTSUPP;
	}

	/* map the fcntl command onto enqueue flags */
	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
	       "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
	       flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* mirror the server result into the local VFS lock lists */
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2  = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = posix_lock_file_wait(file, file_lock);

	if (rc2 && file_lock->fl_type != F_UNLCK) {
		/* local update failed: roll back the server lock we just
		 * acquired by enqueueing it again as LCK_NL (unlock) */
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2801
2802int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2803{
2804	return -ENOSYS;
2805}
2806
2807/**
2808 * test if some locks matching bits and l_req_mode are acquired
2809 * - bits can be in different locks
2810 * - if found clear the common lock bits in *bits
2811 * - the bits not found, are kept in *bits
2812 * \param inode [IN]
2813 * \param bits [IN] searched lock bits [IN]
2814 * \param l_req_mode [IN] searched lock mode
2815 * \retval boolean, true iff all bits are found
2816 */
2817int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2818{
2819	struct lustre_handle lockh;
2820	ldlm_policy_data_t policy;
2821	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2822				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2823	struct lu_fid *fid;
2824	__u64 flags;
2825	int i;
2826
2827	if (!inode)
2828	       return 0;
2829
2830	fid = &ll_i2info(inode)->lli_fid;
2831	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2832	       ldlm_lockname[mode]);
2833
2834	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2835	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2836		policy.l_inodebits.bits = *bits & (1 << i);
2837		if (policy.l_inodebits.bits == 0)
2838			continue;
2839
2840		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2841				  &policy, mode, &lockh)) {
2842			struct ldlm_lock *lock;
2843
2844			lock = ldlm_handle2lock(&lockh);
2845			if (lock) {
2846				*bits &=
2847				      ~(lock->l_policy_data.l_inodebits.bits);
2848				LDLM_LOCK_PUT(lock);
2849			} else {
2850				*bits &= ~policy.l_inodebits.bits;
2851			}
2852		}
2853	}
2854	return *bits == 0;
2855}
2856
2857ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2858			    struct lustre_handle *lockh, __u64 flags,
2859			    ldlm_mode_t mode)
2860{
2861	ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2862	struct lu_fid *fid;
2863	ldlm_mode_t rc;
2864
2865	fid = &ll_i2info(inode)->lli_fid;
2866	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2867
2868	rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2869			   fid, LDLM_IBITS, &policy, mode, lockh);
2870
2871	return rc;
2872}
2873
2874static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2875{
2876	/* Already unlinked. Just update nlink and return success */
2877	if (rc == -ENOENT) {
2878		clear_nlink(inode);
2879		/* This path cannot be hit for regular files unless in
2880		 * case of obscure races, so no need to validate size.
2881		 */
2882		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2883			return 0;
2884	} else if (rc != 0) {
2885		CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2886		       ll_get_fsname(inode->i_sb, NULL, 0),
2887		       PFID(ll_inode2fid(inode)), rc);
2888	}
2889
2890	return rc;
2891}
2892
/*
 * Refresh the attributes of @dentry's inode from the MDS.
 *
 * Two paths: when the server supports OBD_CONNECT_ATTRFID, revalidation
 * goes through an intent lock by fid (md_intent_lock); otherwise a plain
 * md_getattr() is issued, but only when no matching ibits lock already
 * guarantees the cached attributes are valid.
 *
 * \param ibits  inode lock bits required to consider the cache valid
 * \retval 0 on success, negative errno on failure
 */
int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
			     __u64 ibits)
{
	struct inode *inode = dentry->d_inode;
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 *      But under CMD case, it caused some lock issues, should be fixed
	 *      with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
					     dentry->d_inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			GOTO (out, rc);
		}

		rc = ll_revalidate_it_finish(req, &oit, dentry);
		if (rc != 0) {
			ll_intent_release(&oit);
			GOTO(out, rc);
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!dentry->d_inode->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, dentry);
	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
		obd_valid valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		/* regular files may carry striping EAs: ask for enough
		 * reply space to hold the largest one */
		if (S_ISREG(inode->i_mode)) {
			rc = ll_get_max_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2988
2989int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2990			   __u64 ibits)
2991{
2992	struct inode *inode = dentry->d_inode;
2993	int rc;
2994
2995	rc = __ll_inode_revalidate_it(dentry, it, ibits);
2996	if (rc != 0)
2997		return rc;
2998
2999	/* if object isn't regular file, don't validate size */
3000	if (!S_ISREG(inode->i_mode)) {
3001		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3002		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3003		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3004	} else {
3005		/* In case of restore, the MDT has the right size and has
3006		 * already send it back without granting the layout lock,
3007		 * inode is up-to-date so glimpse is useless.
3008		 * Also to glimpse we need the layout, in case of a running
3009		 * restore the MDT holds the layout lock so the glimpse will
3010		 * block up to the end of restore (getattr will block)
3011		 */
3012		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3013			rc = ll_glimpse_size(inode);
3014	}
3015	return rc;
3016}
3017
3018int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3019		  struct lookup_intent *it, struct kstat *stat)
3020{
3021	struct inode *inode = de->d_inode;
3022	struct ll_sb_info *sbi = ll_i2sbi(inode);
3023	struct ll_inode_info *lli = ll_i2info(inode);
3024	int res = 0;
3025
3026	res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3027					     MDS_INODELOCK_LOOKUP);
3028	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3029
3030	if (res)
3031		return res;
3032
3033	stat->dev = inode->i_sb->s_dev;
3034	if (ll_need_32bit_api(sbi))
3035		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3036	else
3037		stat->ino = inode->i_ino;
3038	stat->mode = inode->i_mode;
3039	stat->nlink = inode->i_nlink;
3040	stat->uid = inode->i_uid;
3041	stat->gid = inode->i_gid;
3042	stat->rdev = inode->i_rdev;
3043	stat->atime = inode->i_atime;
3044	stat->mtime = inode->i_mtime;
3045	stat->ctime = inode->i_ctime;
3046	stat->blksize = 1 << inode->i_blkbits;
3047
3048	stat->size = i_size_read(inode);
3049	stat->blocks = inode->i_blocks;
3050
3051	return 0;
3052}
3053int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3054{
3055	struct lookup_intent it = { .it_op = IT_GETATTR };
3056
3057	return ll_getattr_it(mnt, de, &it, stat);
3058}
3059
3060int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3061		__u64 start, __u64 len)
3062{
3063	int rc;
3064	size_t num_bytes;
3065	struct ll_user_fiemap *fiemap;
3066	unsigned int extent_count = fieinfo->fi_extents_max;
3067
3068	num_bytes = sizeof(*fiemap) + (extent_count *
3069				       sizeof(struct ll_fiemap_extent));
3070	OBD_ALLOC_LARGE(fiemap, num_bytes);
3071
3072	if (fiemap == NULL)
3073		return -ENOMEM;
3074
3075	fiemap->fm_flags = fieinfo->fi_flags;
3076	fiemap->fm_extent_count = fieinfo->fi_extents_max;
3077	fiemap->fm_start = start;
3078	fiemap->fm_length = len;
3079	memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3080	       sizeof(struct ll_fiemap_extent));
3081
3082	rc = ll_do_fiemap(inode, fiemap, num_bytes);
3083
3084	fieinfo->fi_flags = fiemap->fm_flags;
3085	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3086	memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3087	       fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3088
3089	OBD_FREE_LARGE(fiemap, num_bytes);
3090	return rc;
3091}
3092
3093struct posix_acl * ll_get_acl(struct inode *inode, int type)
3094{
3095	struct ll_inode_info *lli = ll_i2info(inode);
3096	struct posix_acl *acl = NULL;
3097
3098	spin_lock(&lli->lli_lock);
3099	/* VFS' acl_permission_check->check_acl will release the refcount */
3100	acl = posix_acl_dup(lli->lli_posix_acl);
3101	spin_unlock(&lli->lli_lock);
3102
3103	return acl;
3104}
3105
3106
/*
 * VFS ->permission hook for Lustre inodes.
 *
 * \param inode  inode whose access is being checked
 * \param mask   MAY_* permission mask supplied by the VFS
 *
 * \retval 0 if access is granted, negative errno otherwise
 *	   (-ECHILD when called in non-blocking RCU-walk mode).
 */
int ll_inode_permission(struct inode *inode, int mask)
{
	int rc = 0;

#ifdef MAY_NOT_BLOCK
	/* RCU path walk: we may need to issue RPCs below, so ask the VFS
	 * to retry in ref-walk mode where blocking is allowed. */
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
#endif

	/* as root inode are NOT getting validated in lookup operation,
	 * need to do it before permission check. */

	if (inode == inode->i_sb->s_root->d_inode) {
		struct lookup_intent it = { .it_op = IT_LOOKUP };

		rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
					      MDS_INODELOCK_LOOKUP);
		if (rc)
			return rc;
	}

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);

	/* remote-client mounts delegate permission checks to the MDS */
	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
		return lustre_check_remote_perm(inode, mask);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
	rc = generic_permission(inode, mask);

	return rc;
}
3139
3140/* -o localflock - only provides locally consistent flock locks */
3141struct file_operations ll_file_operations = {
3142	.read	   = ll_file_read,
3143	.aio_read = ll_file_aio_read,
3144	.write	  = ll_file_write,
3145	.aio_write = ll_file_aio_write,
3146	.unlocked_ioctl = ll_file_ioctl,
3147	.open	   = ll_file_open,
3148	.release	= ll_file_release,
3149	.mmap	   = ll_file_mmap,
3150	.llseek	 = ll_file_seek,
3151	.splice_read    = ll_file_splice_read,
3152	.fsync	  = ll_fsync,
3153	.flush	  = ll_flush
3154};
3155
3156struct file_operations ll_file_operations_flock = {
3157	.read	   = ll_file_read,
3158	.aio_read    = ll_file_aio_read,
3159	.write	  = ll_file_write,
3160	.aio_write   = ll_file_aio_write,
3161	.unlocked_ioctl = ll_file_ioctl,
3162	.open	   = ll_file_open,
3163	.release	= ll_file_release,
3164	.mmap	   = ll_file_mmap,
3165	.llseek	 = ll_file_seek,
3166	.splice_read    = ll_file_splice_read,
3167	.fsync	  = ll_fsync,
3168	.flush	  = ll_flush,
3169	.flock	  = ll_file_flock,
3170	.lock	   = ll_file_flock
3171};
3172
3173/* These are for -o noflock - to return ENOSYS on flock calls */
3174struct file_operations ll_file_operations_noflock = {
3175	.read	   = ll_file_read,
3176	.aio_read    = ll_file_aio_read,
3177	.write	  = ll_file_write,
3178	.aio_write   = ll_file_aio_write,
3179	.unlocked_ioctl = ll_file_ioctl,
3180	.open	   = ll_file_open,
3181	.release	= ll_file_release,
3182	.mmap	   = ll_file_mmap,
3183	.llseek	 = ll_file_seek,
3184	.splice_read    = ll_file_splice_read,
3185	.fsync	  = ll_fsync,
3186	.flush	  = ll_flush,
3187	.flock	  = ll_file_noflock,
3188	.lock	   = ll_file_noflock
3189};
3190
/* Inode operations for regular Lustre files. */
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.listxattr	= ll_listxattr,
	.removexattr	= ll_removexattr,
	.fiemap		= ll_fiemap,
	.get_acl	= ll_get_acl,
};
3202
3203/* dynamic ioctl number support routines */
static struct llioc_ctl_data {
	struct rw_semaphore	ioc_sem;   /* guards ioc_head */
	struct list_head	      ioc_head; /* registered llioc_data blocks */
} llioc = {
	__RWSEM_INITIALIZER(llioc.ioc_sem),
	LIST_HEAD_INIT(llioc.ioc_head)
};
3211
3212
3213struct llioc_data {
3214	struct list_head	      iocd_list;
3215	unsigned int	    iocd_size;
3216	llioc_callback_t	iocd_cb;
3217	unsigned int	    iocd_count;
3218	unsigned int	    iocd_cmd[0];
3219};
3220
3221void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3222{
3223	unsigned int size;
3224	struct llioc_data *in_data = NULL;
3225
3226	if (cb == NULL || cmd == NULL ||
3227	    count > LLIOC_MAX_CMD || count < 0)
3228		return NULL;
3229
3230	size = sizeof(*in_data) + count * sizeof(unsigned int);
3231	OBD_ALLOC(in_data, size);
3232	if (in_data == NULL)
3233		return NULL;
3234
3235	memset(in_data, 0, sizeof(*in_data));
3236	in_data->iocd_size = size;
3237	in_data->iocd_cb = cb;
3238	in_data->iocd_count = count;
3239	memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3240
3241	down_write(&llioc.ioc_sem);
3242	list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3243	up_write(&llioc.ioc_sem);
3244
3245	return in_data;
3246}
3247
3248void ll_iocontrol_unregister(void *magic)
3249{
3250	struct llioc_data *tmp;
3251
3252	if (magic == NULL)
3253		return;
3254
3255	down_write(&llioc.ioc_sem);
3256	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3257		if (tmp == magic) {
3258			unsigned int size = tmp->iocd_size;
3259
3260			list_del(&tmp->iocd_list);
3261			up_write(&llioc.ioc_sem);
3262
3263			OBD_FREE(tmp, size);
3264			return;
3265		}
3266	}
3267	up_write(&llioc.ioc_sem);
3268
3269	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3270}
3271
/* Export the dynamic ioctl registration API to other kernel modules. */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
3274
3275enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3276			unsigned int cmd, unsigned long arg, int *rcp)
3277{
3278	enum llioc_iter ret = LLIOC_CONT;
3279	struct llioc_data *data;
3280	int rc = -EINVAL, i;
3281
3282	down_read(&llioc.ioc_sem);
3283	list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3284		for (i = 0; i < data->iocd_count; i++) {
3285			if (cmd != data->iocd_cmd[i])
3286				continue;
3287
3288			ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3289			break;
3290		}
3291
3292		if (ret == LLIOC_STOP)
3293			break;
3294	}
3295	up_read(&llioc.ioc_sem);
3296
3297	if (rcp)
3298		*rcp = rc;
3299	return ret;
3300}
3301
/*
 * Push a layout configuration operation down to the inode's cl_object.
 *
 * \param conf  describes the operation (coc_opc) and its payload; for
 *		OBJECT_CONF_SET, coc_lock must be a layout DLM lock.
 *
 * \retval 0 on success (or if the inode has no cl_object yet),
 *	   negative errno on failure.
 */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_env_nest nest;
	struct lu_env *env;
	int result;

	/* no cl_object attached: nothing to configure */
	if (lli->lli_clob == NULL)
		return 0;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	result = cl_conf_set(env, lli->lli_clob, conf);
	cl_env_nested_put(&nest, env);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));
		if (result == 0) {
			/* it can only be allowed to match after layout is
			 * applied to inode otherwise false layout would be
			 * seen. Applying layout should happen before dropping
			 * the intent lock. */
			ldlm_lock_allow_match(lock);
		}
	}
	return result;
}
3334
3335/* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Fetch the file's layout (LOV EA) from the MDT with a getxattr RPC and
 * install it as @lock's LVB, replacing any stale buffer.  No-op when the
 * lock already carries a ready LVB.
 *
 * \retval 0 on success (including an empty layout), negative errno on error.
 */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)

{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obd_capa *oc;
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* layout already attached and ready: nothing to do */
	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
		return 0;

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	oc = ll_mdscapa_get(inode);
	rc = ll_get_max_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
				lmmsize, 0, &req);
	capa_put(oc);
	/* early return: @req was never set on this path */
	if (rc < 0)
		return rc;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL || body->eadatasize > lmmsize)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	/* swap the new layout buffer in under the resource lock, freeing
	 * any previous LVB of possibly different length */
	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (lock->l_lvb_data != NULL)
		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);

	lock->l_lvb_data = lvbdata;
	lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

out:
	ptlrpc_req_finished(req);
	return rc;
}
3399
3400/**
3401 * Apply the layout to the inode. Layout lock is held and will be released
3402 * in this function.
3403 */
/*
 * Apply the layout carried by a granted layout lock to the inode and
 * return the resulting layout generation.  Consumes one reference on
 * @lockh (decref'ed on all paths).
 *
 * \param lockh   handle of the held layout lock
 * \param mode    lock mode held on @lockh
 * \param gen     out: layout generation after configuration
 * \param reconf  if false, only report an already-configured layout
 *		  (-ENODATA when the LVB is not ready yet)
 *
 * \retval 0 on success, -EAGAIN if the caller should retry after the
 *	   layout was busy, other negative errno on failure.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
				struct inode *inode, __u32 *gen, bool reconf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info    *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct lustre_md md = { NULL };
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
		   inode, PFID(&lli->lli_fid), reconf);

	/* in case this is a caching lock and reinstate with new inode */
	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
	unlock_res_and_lock(lock);
	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multi processes may configure the file on the same time. */
	if (lvb_ready || !reconf) {
		rc = -ENODATA;
		if (lvb_ready) {
			/* layout_gen must be valid if layout lock is not
			 * cancelled and stripe has already set */
			*gen = lli->lli_layout_gen;
			rc = 0;
		}
		GOTO(out, rc);
	}

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for layout lock, lmm is returned in lock's lvb.
	 * lvb_data is immutable if the lock is held so it's safe to access it
	 * without res lock. See the description in ldlm_lock_decref_internal()
	 * for the condition to free lvb_data of layout lock */
	if (lock->l_lvb_data != NULL) {
		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
				  lock->l_lvb_data, lock->l_lvb_len);
		if (rc >= 0) {
			/* no stripe md means an empty layout */
			*gen = LL_LAYOUT_GEN_EMPTY;
			if (md.lsm != NULL)
				*gen = md.lsm->lsm_layout_gen;
			rc = 0;
		} else {
			CERROR("%s: file "DFID" unpackmd error: %d\n",
				ll_get_fsname(inode->i_sb, NULL, 0),
				PFID(&lli->lli_fid), rc);
		}
	}
	if (rc < 0)
		GOTO(out, rc);

	/* set layout to file. Unlikely this will fail as old layout was
	 * surely eliminated */
	memset(&conf, 0, sizeof(conf));
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_md = &md;
	rc = ll_layout_conf(inode, &conf);

	if (md.lsm != NULL)
		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;

out:
	/* drop the reference taken by ldlm_handle2lock() and the caller's
	 * lock mode reference */
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
			ll_get_fsname(inode->i_sb, NULL, 0),
			inode, PFID(&lli->lli_fid));

		memset(&conf, 0, sizeof(conf));
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
			PFID(&lli->lli_fid), rc);
	}
	return rc;
}
3506
3507/**
3508 * This function checks if there exists a LAYOUT lock on the client side,
3509 * or enqueues it if it doesn't have one in cache.
3510 *
3511 * This function will not hold layout lock so it may be revoked any time after
3512 * this function returns. Any operations depend on layout should be redone
3513 * in that case.
3514 *
3515 * This function should be called before lov_io_init() to get an uptodate
3516 * layout version, the caller should save the version number and after IO
3517 * is finished, this function should be called again to verify that layout
3518 * is not changed during IO time.
3519 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info  *lli = ll_i2info(inode);
	struct ll_sb_info     *sbi = ll_i2sbi(inode);
	struct md_op_data     *op_data;
	struct lookup_intent   it;
	struct lustre_handle   lockh;
	ldlm_mode_t	       mode;
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_IBITS,
		.ei_mode = LCK_CR,
		.ei_cb_bl = ll_md_blocking_ast,
		.ei_cb_cp = ldlm_completion_ast,
	};
	int rc;

	/* without layout-lock support the cached generation is all we have */
	*gen = lli->lli_layout_gen;
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
		return 0;

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* mostly layout lock is caching on the local side, so try to match
	 * it before grabbing layout lock mutex. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		/* reconf=false: only succeed if the layout is already set */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
		if (rc == 0)
			return 0;

		/* better hold lli_layout_mutex to try again otherwise
		 * it will have starvation problem. */
	}

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

again:
	/* try again. Maybe somebody else has done this. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;

		mutex_unlock(&lli->lli_layout_mutex);
		return rc;
	}

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
			0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		mutex_unlock(&lli->lli_layout_mutex);
		return PTR_ERR(op_data);
	}

	/* have to enqueue one */
	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	lockh.cookie = 0ULL;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
			ll_get_fsname(inode->i_sb, NULL, 0), inode,
			PFID(&lli->lli_fid));

	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
			NULL, 0, NULL, 0);
	/* release the RPC request attached to the intent, if any */
	if (it.d.lustre.it_data != NULL)
		ptlrpc_req_finished(it.d.lustre.it_data);
	it.d.lustre.it_data = NULL;

	ll_finish_md_op_data(op_data);

	/* take over the intent's lock mode reference; ll_layout_lock_set()
	 * will release it */
	mode = it.d.lustre.it_lock_mode;
	it.d.lustre.it_lock_mode = 0;
	ll_intent_drop_lock(&it);

	if (rc == 0) {
		/* set lock data in case this is a new lock */
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;
	}
	mutex_unlock(&lli->lli_layout_mutex);

	return rc;
}
3612
3613/**
3614 *  This function send a restore request to the MDT
3615 */
3616int ll_layout_restore(struct inode *inode)
3617{
3618	struct hsm_user_request	*hur;
3619	int			 len, rc;
3620
3621	len = sizeof(struct hsm_user_request) +
3622	      sizeof(struct hsm_user_item);
3623	OBD_ALLOC(hur, len);
3624	if (hur == NULL)
3625		return -ENOMEM;
3626
3627	hur->hur_request.hr_action = HUA_RESTORE;
3628	hur->hur_request.hr_archive_id = 0;
3629	hur->hur_request.hr_flags = 0;
3630	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3631	       sizeof(hur->hur_user_item[0].hui_fid));
3632	hur->hur_user_item[0].hui_extent.length = -1;
3633	hur->hur_request.hr_itemcount = 1;
3634	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3635			   len, hur, NULL);
3636	OBD_FREE(hur, len);
3637	return rc;
3638}
3639