/* lustre/llite/file.c at revision 38585ccc4627a9da381af9e912b756cfceb615a5 */
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include <lustre_dlm.h>
45#include <lustre_lite.h>
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include <lustre/ll_fiemap.h>
50
51#include "cl_object.h"
52
53struct ll_file_data *ll_file_data_get(void)
54{
55	struct ll_file_data *fd;
56
57	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
58	if (fd == NULL)
59		return NULL;
60	fd->fd_write_failed = false;
61	return fd;
62}
63
64static void ll_file_data_put(struct ll_file_data *fd)
65{
66	if (fd != NULL)
67		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
68}
69
/**
 * Pack the in-core attributes of @inode (fid, mode, a/m/ctime, size,
 * blocks, flags and IO epoch) into @op_data for an MD operation.
 *
 * If @fh is non-NULL it is recorded as the operation's open handle.
 * Also takes an MDS capability reference into op_data->op_capa1, and
 * sets the MDS_DATA_MODIFIED bias when the inode carries the
 * LLIF_DATA_MODIFIED flag so the server learns about dirty data.
 */
void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
			  struct lustre_handle *fh)
{
	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr_blocks = inode->i_blocks;
	/* ia_attr_flags lives in the Lustre-extended iattr wrapper. */
	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
					ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
	if (fh)
		op_data->op_handle = *fh;
	op_data->op_capa1 = ll_mdscapa_get(inode);

	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
		op_data->op_bias |= MDS_DATA_MODIFIED;
}
90
/**
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.
 *
 * Mode and all timestamps are always marked valid.  For write opens,
 * size/blocks are either added to the valid set directly (no SOM
 * support on the MDS export, or not a regular file) or handled by the
 * IO-epoch close machinery.  Finally the inode attributes and the open
 * handle @och are packed into @op_data.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
					ATTR_MTIME | ATTR_MTIME_SET |
					ATTR_CTIME | ATTR_CTIME_SET;

	/* Read-only opens carry no size/epoch state to the MDS. */
	if (!(och->och_flags & FMODE_WRITE))
		goto out;

	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	else
		ll_ioepoch_close(inode, op_data, &och, 0);

out:
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
}
115
/**
 * Send the CLOSE RPC for open handle @och on @inode through @md_exp and
 * release the handle.
 *
 * If @data_version is non-NULL the close is an HSM release: the data
 * version and lease handle are packed into the request, and the server
 * must confirm with OBD_MD_FLRELEASED in the reply body or -EBUSY is
 * returned.
 *
 * An -EAGAIN from md_close() means the MDS wants a Size-on-MDS update;
 * that update is performed here and its failure is logged but swallowed
 * (rc reset to 0).
 *
 * @och is freed here unless the IO epoch is still open on a SOM-enabled
 * export, in which case the inode is queued for DONE_WRITING and the
 * handle is released by that path instead.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och,
				     const __u64 *data_version)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle "LPX64"\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.

	/* Pack attributes and, for write opens, close the IO epoch. */
	ll_prepare_close(inode, op_data, och);
	if (data_version != NULL) {
		/* Pass in data_version implies release. */
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *data_version;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	}
	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr to back to MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: "
			       "rc = %d\n", inode->i_ino, rc);
			/* best effort: the close itself succeeded */
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	if (rc == 0) {
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}
	/* HSM release must be acknowledged by the server. */
	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
		struct mdt_body *body;
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->valid & OBD_MD_FLRELEASED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		/* Epoch still open: defer freeing @och to DONE_WRITING. */
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		OBD_FREE_PTR(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
207
208int ll_md_real_close(struct inode *inode, int flags)
209{
210	struct ll_inode_info *lli = ll_i2info(inode);
211	struct obd_client_handle **och_p;
212	struct obd_client_handle *och;
213	__u64 *och_usecount;
214	int rc = 0;
215
216	if (flags & FMODE_WRITE) {
217		och_p = &lli->lli_mds_write_och;
218		och_usecount = &lli->lli_open_fd_write_count;
219	} else if (flags & FMODE_EXEC) {
220		och_p = &lli->lli_mds_exec_och;
221		och_usecount = &lli->lli_open_fd_exec_count;
222	} else {
223		LASSERT(flags & FMODE_READ);
224		och_p = &lli->lli_mds_read_och;
225		och_usecount = &lli->lli_open_fd_read_count;
226	}
227
228	mutex_lock(&lli->lli_och_mutex);
229	if (*och_usecount) { /* There are still users of this handle, so
230				skip freeing it. */
231		mutex_unlock(&lli->lli_och_mutex);
232		return 0;
233	}
234	och=*och_p;
235	*och_p = NULL;
236	mutex_unlock(&lli->lli_och_mutex);
237
238	if (och) { /* There might be a race and somebody have freed this och
239		      already */
240		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
241					       inode, och, NULL);
242	}
243
244	return rc;
245}
246
247int ll_md_close(struct obd_export *md_exp, struct inode *inode,
248		struct file *file)
249{
250	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
251	struct ll_inode_info *lli = ll_i2info(inode);
252	int rc = 0;
253
254	/* clear group lock, if present */
255	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
257
258	if (fd->fd_lease_och != NULL) {
259		bool lease_broken;
260
261		/* Usually the lease is not released when the
262		 * application crashed, we need to release here. */
263		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265			PFID(&lli->lli_fid), rc, lease_broken);
266
267		fd->fd_lease_och = NULL;
268	}
269
270	if (fd->fd_och != NULL) {
271		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
272		fd->fd_och = NULL;
273		GOTO(out, rc);
274	}
275
276	/* Let's see if we have good enough OPEN lock on the file and if
277	   we can skip talking to MDS */
278	if (file->f_dentry->d_inode) { /* Can this ever be false? */
279		int lockmode;
280		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
281		struct lustre_handle lockh;
282		struct inode *inode = file->f_dentry->d_inode;
283		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
284
285		mutex_lock(&lli->lli_och_mutex);
286		if (fd->fd_omode & FMODE_WRITE) {
287			lockmode = LCK_CW;
288			LASSERT(lli->lli_open_fd_write_count);
289			lli->lli_open_fd_write_count--;
290		} else if (fd->fd_omode & FMODE_EXEC) {
291			lockmode = LCK_PR;
292			LASSERT(lli->lli_open_fd_exec_count);
293			lli->lli_open_fd_exec_count--;
294		} else {
295			lockmode = LCK_CR;
296			LASSERT(lli->lli_open_fd_read_count);
297			lli->lli_open_fd_read_count--;
298		}
299		mutex_unlock(&lli->lli_och_mutex);
300
301		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
302				   LDLM_IBITS, &policy, lockmode,
303				   &lockh)) {
304			rc = ll_md_real_close(file->f_dentry->d_inode,
305					      fd->fd_omode);
306		}
307	} else {
308		CERROR("Releasing a file %p with negative dentry %p. Name %s",
309		       file, file->f_dentry, file->f_dentry->d_name.name);
310	}
311
312out:
313	LUSTRE_FPRIVATE(file) = NULL;
314	ll_file_data_put(fd);
315	ll_capa_close(inode);
316
317	return rc;
318}
319
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here.  Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL state is torn down when the fs root closes. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(fd != NULL);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());
		}
	}
#endif

	if (inode->i_sb->s_root != file->f_dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the owner pid of statahead.
	 * Different processes can open the same dir, "ll_opendir_key" means:
	 * it is me that should stop the statahead thread. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
	    lli->lli_opendir_pid != 0)
		ll_stop_statahead(inode, lli->lli_opendir_key);

	/* The fs root has no MDS open handle: just detach and free fd. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		return 0;
	}

	if (!S_ISDIR(inode->i_mode)) {
		/* Discard accumulated async write errors on last close. */
		lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	/* Fault-injection hook: optionally dump the debug log. */
	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	return rc;
}
379
/**
 * Send an open intent to the MDS for @file and install the resulting
 * open/lock state on the dentry's inode.
 *
 * When @lmm is NULL and @lmmsize is 0, a real open is wanted and an
 * OPEN lock is requested (MDS_OPEN_LOCK); otherwise the caller is only
 * setting stripe parameters and no lock is needed.
 *
 * \retval 0 on success; -ENOENT for a missing parent or negative
 * lookup; other negative errno from the intent or reply handling.
 */
static int ll_intent_file_open(struct file *file, void *lmm,
			       int lmmsize, struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
	struct dentry *parent = file->f_dentry->d_parent;
	const char *name = file->f_dentry->d_name.name;
	const int len = file->f_dentry->d_name.len;
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	__u32 opc = LUSTRE_OPC_ANY;
	int rc;

	if (!parent)
		return -ENOENT;

	/* Usually we come here only for NFSD, and we want open lock.
	   But we can also get here with pre 2.6.15 patchless kernels, and in
	   that case that lock is also ok */
	/* We can also get here if there was cached open handle in revalidate_it
	 * but it disappeared while we were getting from there to ll_file_open.
	 * But this means this file was closed and immediately opened which
	 * makes a good candidate for using OPEN lock */
	/* If lmmsize & lmm are not 0, we are just setting stripe info
	 * parameters. No need for the open lock */
	if (lmm == NULL && lmmsize == 0) {
		itp->it_flags |= MDS_OPEN_LOCK;
		if (itp->it_flags & FMODE_WRITE)
			opc = LUSTRE_OPC_CREATE;
	}

	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
				      file->f_dentry->d_inode, name, len,
				      O_RDWR, opc, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	itp->it_flags |= MDS_OPEN_BY_FID;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
			    0 /*unused */, &req, ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason for keep own exit path - don`t flood log
		* with messages with -ESTALE errors.
		*/
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		     it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		/* Server opened the file anyway; release the handle. */
		ll_release_openhandle(file->f_dentry, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
	if (!rc && itp->d.lustre.it_lock_mode)
		/* Attach the granted lock to the inode. */
		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
				 itp, NULL);

out:
	ptlrpc_req_finished(itp->d.lustre.it_data);
	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
	ll_intent_drop_lock(itp);

	return rc;
}
452
453/**
454 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
455 * not believe attributes if a few ioepoch holders exist. Attributes for
456 * previous ioepoch if new one is opened are also skipped by MDS.
457 */
458void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
459{
460	if (ioepoch && lli->lli_ioepoch != ioepoch) {
461		lli->lli_ioepoch = ioepoch;
462		CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
463		       ioepoch, PFID(&lli->lli_fid));
464	}
465}
466
/**
 * Fill client open handle @och from the intent reply carried by @it:
 * server file handle, fid, lease-lock cookie and open flags, then
 * register the open with the replay machinery.
 *
 * \retval result of md_set_open_replay_data().
 */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct ptlrpc_request *req = it->d.lustre.it_data;
	struct mdt_body *body;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->handle;
	och->och_fid = body->fid1;
	/* Lease opens keep their lock handle here — cf. ll_lease_open(). */
	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, req);
}
482
/**
 * Finish an open locally: when @och is given, fill it from the intent
 * reply (registering replay data) and record the reply's IO epoch on
 * the inode; then attach @fd to @file, initialise readahead state and
 * remember the effective access mode in fd_omode.
 *
 * \retval 0 on success, negative errno from ll_och_fill() otherwise.
 */
int ll_local_open(struct file *file, struct lookup_intent *it,
		  struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);

	LASSERT(!LUSTRE_FPRIVATE(file));

	LASSERT(fd != NULL);

	if (och) {
		struct ptlrpc_request *req = it->d.lustre.it_data;
		struct mdt_body *body;
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			return rc;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		ll_ioepoch_open(lli, body->ioepoch);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Keep only the access-mode bits; consulted again on close. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
	return 0;
}
511
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_openerr, rc = -ENOMEM);

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* First opener of a directory becomes the statahead owner. */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	/* The fs root needs no MDS open handle. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open.  filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file->f_dentry, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				GOTO(out_openerr, rc);

			/* Retry with the fresh intent reply. */
			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			GOTO(out_och_free, rc);

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	fd = NULL; /* ownership passed to @file by ll_local_open() */

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	ll_capa_open(inode);

	if (!lli->lli_has_smd &&
	    (cl_is_lov_delay_create(file->f_flags) ||
	     (file->f_mode & FMODE_WRITE) == 0)) {
		CDEBUG(D_INODE, "object creation was delayed\n");
		GOTO(out_och_free, rc);
	}
	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	/* Drop the request reference held by the open intent, if any. */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
708
709static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
710			struct ldlm_lock_desc *desc, void *data, int flag)
711{
712	int rc;
713	struct lustre_handle lockh;
714
715	switch (flag) {
716	case LDLM_CB_BLOCKING:
717		ldlm_lock2handle(lock, &lockh);
718		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
719		if (rc < 0) {
720			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
721			return rc;
722		}
723		break;
724	case LDLM_CB_CANCELING:
725		/* do nothing */
726		break;
727	}
728	return 0;
729}
730
/**
 * Acquire a lease and open the file.
 *
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is
 * given, its existing shared MDS open handle is taken over into
 * fd_och — allowed only if this fd is the sole opener in that mode and
 * holds no lease yet — and its cookie is passed as op_handle so the MDT
 * can verify the same owner.  The lease lock itself is requested with
 * LDLM_FL_NO_LRU | LDLM_FL_EXCL and broken via
 * ll_md_blocking_lease_ast().
 *
 * \retval the new obd_client_handle on success,
 * \retval ERR_PTR(-errno) on failure (-EOPNOTSUPP if the server does
 * not support leases, -EBUSY/-EPERM for ineligible @file).
 */
struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
					fmode_t fmode, __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		return ERR_PTR(-EINVAL);

	if (file != NULL) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;
		__u64 *och_usecount;

		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			return ERR_PTR(-EPERM);

		/* Get the openhandle of the file */
		rc = -EBUSY;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			mutex_unlock(&lli->lli_och_mutex);
			return ERR_PTR(rc);
		}

		if (fd->fd_och == NULL) {
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och != NULL);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
			} else {
				LASSERT(lli->lli_mds_read_och != NULL);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			}
			/* Sole opener: move the shared handle to fd_och. */
			if (*och_usecount == 1) {
				fd->fd_och = *och_p;
				*och_p = NULL;
				*och_usecount = 0;
				rc = 0;
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */
			return ERR_PTR(rc);

		LASSERT(fd->fd_och != NULL);
		old_handle = fd->fd_och->och_fh;
	}

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		return ERR_PTR(-ENOMEM);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
					LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
				ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	if (req != NULL) {
		ptlrpc_req_finished(req);
		it_clear_disposition(&it, DISP_ENQ_COMPLETE);
	}
	if (rc < 0)
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.d.lustre.it_lock_mode == 0 ||
	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
			it.d.lustre.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);
	return och;

out_close:
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
	if (rc2)
		CERROR("Close openhandle returned %d\n", rc2);

	/* cancel open lock */
	if (it.d.lustre.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
						it.d.lustre.it_lock_mode);
		it.d.lustre.it_lock_mode = 0;
	}
out_release_it:
	ll_intent_release(&it);
out:
	OBD_FREE_PTR(och);
	return ERR_PTR(rc);
}
EXPORT_SYMBOL(ll_lease_open);
867
/**
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 *
 * If the lease lock has not been cancelled yet it is cancelled
 * synchronously here; *lease_broken (when non-NULL) reports whether
 * the server had already broken the lease.  The open handle is then
 * closed via ll_close_inode_openhandle().
 *
 * \retval result of ll_close_inode_openhandle().
 */
int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			bool *lease_broken)
{
	struct ldlm_lock *lock;
	bool cancelled = true;
	int rc;

	lock = ldlm_handle2lock(&och->och_lease_handle);
	if (lock != NULL) {
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);
		ldlm_lock_put(lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
		PFID(&ll_i2info(inode)->lli_fid), cancelled);

	if (!cancelled)
		/* Still granted: cancel our lease lock ourselves. */
		ldlm_cli_cancel(&och->och_lease_handle, 0);
	if (lease_broken != NULL)
		*lease_broken = cancelled;

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       NULL);
	return rc;
}
EXPORT_SYMBOL(ll_lease_close);
900
901/* Fills the obdo with the attributes for the lsm */
902static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
903			  struct obd_capa *capa, struct obdo *obdo,
904			  __u64 ioepoch, int sync)
905{
906	struct ptlrpc_request_set *set;
907	struct obd_info	    oinfo = { { { 0 } } };
908	int			rc;
909
910	LASSERT(lsm != NULL);
911
912	oinfo.oi_md = lsm;
913	oinfo.oi_oa = obdo;
914	oinfo.oi_oa->o_oi = lsm->lsm_oi;
915	oinfo.oi_oa->o_mode = S_IFREG;
916	oinfo.oi_oa->o_ioepoch = ioepoch;
917	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
918			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
919			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
920			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
921			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
922			       OBD_MD_FLDATAVERSION;
923	oinfo.oi_capa = capa;
924	if (sync) {
925		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
926		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
927	}
928
929	set = ptlrpc_prep_set();
930	if (set == NULL) {
931		CERROR("can't allocate ptlrpc set\n");
932		rc = -ENOMEM;
933	} else {
934		rc = obd_getattr_async(exp, &oinfo, set);
935		if (rc == 0)
936			rc = ptlrpc_set_wait(set);
937		ptlrpc_set_destroy(set);
938	}
939	if (rc == 0)
940		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
941					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
942					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
943					 OBD_MD_FLDATAVERSION);
944	return rc;
945}
946
/**
 * Performs the getattr on the inode and updates its fields.
 * If @sync != 0, perform the getattr under the server-side lock.
 *
 * Fetches the OST attributes for the inode's stripe MD via
 * ll_lsm_getattr() (using an MDS capability if available) and, on
 * success, refreshes the VFS inode from the returned obdo.
 *
 * \retval 0 on success, negative errno from ll_lsm_getattr().
 */
int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
		     __u64 ioepoch, int sync)
{
	struct obd_capa      *capa = ll_mdscapa_get(inode);
	struct lov_stripe_md *lsm;
	int rc;

	lsm = ccc_inode_lsm_get(inode);
	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
			    capa, obdo, ioepoch, sync);
	capa_put(capa);
	if (rc == 0) {
		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;

		/* Push the freshly obtained attributes into the inode. */
		obdo_refresh_inode(inode, obdo, obdo->o_valid);
		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
		       (unsigned long long)inode->i_blocks,
		       (unsigned long)ll_inode_blksize(inode));
	}
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
974
/**
 * Merge inode timestamps, size and blocks under the inode size lock:
 * start from the values most recently obtained from the MDS (lli_lvb)
 * and raise them with the OST-side attributes read from the cl_object.
 *
 * \retval 0 on success, negative errno from cl_object_attr_get().
 */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
	inode_init_lvb(inode, &lvb);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* Take the newer of the MDS and OST timestamps. */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
				PFID(&lli->lli_fid), attr->cat_size);
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
1017
1018int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1019		     lstat_t *st)
1020{
1021	struct obdo obdo = { 0 };
1022	int rc;
1023
1024	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1025	if (rc == 0) {
1026		st->st_size   = obdo.o_size;
1027		st->st_blocks = obdo.o_blocks;
1028		st->st_mtime  = obdo.o_mtime;
1029		st->st_atime  = obdo.o_atime;
1030		st->st_ctime  = obdo.o_ctime;
1031	}
1032	return rc;
1033}
1034
1035void ll_io_init(struct cl_io *io, const struct file *file, int write)
1036{
1037	struct inode *inode = file->f_dentry->d_inode;
1038
1039	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1040	if (write) {
1041		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1042		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1043				      file->f_flags & O_DIRECT ||
1044				      IS_SYNC(inode);
1045	}
1046	io->ci_obj     = ll_i2info(inode)->lli_clob;
1047	io->ci_lockreq = CILR_MAYBE;
1048	if (ll_file_nolock(file)) {
1049		io->ci_lockreq = CILR_NEVER;
1050		io->ci_no_srvlock = 1;
1051	} else if (file->f_flags & O_APPEND) {
1052		io->ci_lockreq = CILR_MANDATORY;
1053	}
1054}
1055
1056static ssize_t
1057ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1058		   struct file *file, enum cl_io_type iot,
1059		   loff_t *ppos, size_t count)
1060{
1061	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1062	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1063	struct cl_io	 *io;
1064	ssize_t	       result;
1065
1066restart:
1067	io = ccc_env_thread_io(env);
1068	ll_io_init(io, file, iot == CIT_WRITE);
1069
1070	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1071		struct vvp_io *vio = vvp_env_io(env);
1072		struct ccc_io *cio = ccc_env_io(env);
1073		int write_mutex_locked = 0;
1074
1075		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1076		vio->cui_io_subtype = args->via_io_subtype;
1077
1078		switch (vio->cui_io_subtype) {
1079		case IO_NORMAL:
1080			cio->cui_iov = args->u.normal.via_iov;
1081			cio->cui_nrsegs = args->u.normal.via_nrsegs;
1082			cio->cui_tot_nrsegs = cio->cui_nrsegs;
1083			cio->cui_iocb = args->u.normal.via_iocb;
1084			if ((iot == CIT_WRITE) &&
1085			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1086				if (mutex_lock_interruptible(&lli->
1087							       lli_write_mutex))
1088					GOTO(out, result = -ERESTARTSYS);
1089				write_mutex_locked = 1;
1090			} else if (iot == CIT_READ) {
1091				down_read(&lli->lli_trunc_sem);
1092			}
1093			break;
1094		case IO_SENDFILE:
1095			vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1096			vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1097			break;
1098		case IO_SPLICE:
1099			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1100			vio->u.splice.cui_flags = args->u.splice.via_flags;
1101			break;
1102		default:
1103			CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1104			LBUG();
1105		}
1106		result = cl_io_loop(env, io);
1107		if (write_mutex_locked)
1108			mutex_unlock(&lli->lli_write_mutex);
1109		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1110			up_read(&lli->lli_trunc_sem);
1111	} else {
1112		/* cl_io_rw_init() handled IO */
1113		result = io->ci_result;
1114	}
1115
1116	if (io->ci_nob > 0) {
1117		result = io->ci_nob;
1118		*ppos = io->u.ci_wr.wr.crw_pos;
1119	}
1120	GOTO(out, result);
1121out:
1122	cl_io_fini(env, io);
1123	/* If any bit been read/written (result != 0), we just return
1124	 * short read/write instead of restart io. */
1125	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1126		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1127		       iot == CIT_READ ? "read" : "write",
1128		       file->f_dentry->d_name.name, *ppos, count);
1129		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1130		goto restart;
1131	}
1132
1133	if (iot == CIT_READ) {
1134		if (result >= 0)
1135			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1136					   LPROC_LL_READ_BYTES, result);
1137	} else if (iot == CIT_WRITE) {
1138		if (result >= 0) {
1139			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1140					   LPROC_LL_WRITE_BYTES, result);
1141			fd->fd_write_failed = false;
1142		} else if (result != -ERESTARTSYS) {
1143			fd->fd_write_failed = true;
1144		}
1145	}
1146
1147	return result;
1148}
1149
1150static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1151				unsigned long nr_segs, loff_t pos)
1152{
1153	struct lu_env      *env;
1154	struct vvp_io_args *args;
1155	size_t	      count = 0;
1156	ssize_t	     result;
1157	int		 refcheck;
1158
1159	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1160	if (result)
1161		return result;
1162
1163	env = cl_env_get(&refcheck);
1164	if (IS_ERR(env))
1165		return PTR_ERR(env);
1166
1167	args = vvp_env_args(env, IO_NORMAL);
1168	args->u.normal.via_iov = (struct iovec *)iov;
1169	args->u.normal.via_nrsegs = nr_segs;
1170	args->u.normal.via_iocb = iocb;
1171
1172	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1173				    &iocb->ki_pos, count);
1174	cl_env_put(env, &refcheck);
1175	return result;
1176}
1177
1178static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1179			    loff_t *ppos)
1180{
1181	struct lu_env *env;
1182	struct iovec  *local_iov;
1183	struct kiocb  *kiocb;
1184	ssize_t	result;
1185	int	    refcheck;
1186
1187	env = cl_env_get(&refcheck);
1188	if (IS_ERR(env))
1189		return PTR_ERR(env);
1190
1191	local_iov = &vvp_env_info(env)->vti_local_iov;
1192	kiocb = &vvp_env_info(env)->vti_kiocb;
1193	local_iov->iov_base = (void __user *)buf;
1194	local_iov->iov_len = count;
1195	init_sync_kiocb(kiocb, file);
1196	kiocb->ki_pos = *ppos;
1197	kiocb->ki_nbytes = count;
1198
1199	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1200	*ppos = kiocb->ki_pos;
1201
1202	cl_env_put(env, &refcheck);
1203	return result;
1204}
1205
1206/*
1207 * Write to a file (through the page cache).
1208 */
1209static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1210				 unsigned long nr_segs, loff_t pos)
1211{
1212	struct lu_env      *env;
1213	struct vvp_io_args *args;
1214	size_t	      count = 0;
1215	ssize_t	     result;
1216	int		 refcheck;
1217
1218	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
1219	if (result)
1220		return result;
1221
1222	env = cl_env_get(&refcheck);
1223	if (IS_ERR(env))
1224		return PTR_ERR(env);
1225
1226	args = vvp_env_args(env, IO_NORMAL);
1227	args->u.normal.via_iov = (struct iovec *)iov;
1228	args->u.normal.via_nrsegs = nr_segs;
1229	args->u.normal.via_iocb = iocb;
1230
1231	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1232				  &iocb->ki_pos, count);
1233	cl_env_put(env, &refcheck);
1234	return result;
1235}
1236
1237static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1238			     loff_t *ppos)
1239{
1240	struct lu_env *env;
1241	struct iovec  *local_iov;
1242	struct kiocb  *kiocb;
1243	ssize_t	result;
1244	int	    refcheck;
1245
1246	env = cl_env_get(&refcheck);
1247	if (IS_ERR(env))
1248		return PTR_ERR(env);
1249
1250	local_iov = &vvp_env_info(env)->vti_local_iov;
1251	kiocb = &vvp_env_info(env)->vti_kiocb;
1252	local_iov->iov_base = (void __user *)buf;
1253	local_iov->iov_len = count;
1254	init_sync_kiocb(kiocb, file);
1255	kiocb->ki_pos = *ppos;
1256	kiocb->ki_nbytes = count;
1257
1258	result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1259	*ppos = kiocb->ki_pos;
1260
1261	cl_env_put(env, &refcheck);
1262	return result;
1263}
1264
1265
1266
1267/*
1268 * Send file content (through pagecache) somewhere with helper
1269 */
1270static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1271				   struct pipe_inode_info *pipe, size_t count,
1272				   unsigned int flags)
1273{
1274	struct lu_env      *env;
1275	struct vvp_io_args *args;
1276	ssize_t	     result;
1277	int		 refcheck;
1278
1279	env = cl_env_get(&refcheck);
1280	if (IS_ERR(env))
1281		return PTR_ERR(env);
1282
1283	args = vvp_env_args(env, IO_SPLICE);
1284	args->u.splice.via_pipe = pipe;
1285	args->u.splice.via_flags = flags;
1286
1287	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1288	cl_env_put(env, &refcheck);
1289	return result;
1290}
1291
/**
 * Ask the data stack to recreate the object \a oi on OST index \a ost_idx
 * for this file: build an obdo flagged OBD_FL_RECREATE_OBJS, clone the
 * current stripe descriptor and issue obd_create() under the size lock.
 *
 * \retval 0 on success, negative errno on failure
 */
static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
			   obd_count ost_idx)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct obd_trans_info oti = { 0 };
	struct obdo *oa = NULL;
	int lsm_size;
	int rc = 0;
	struct lov_stripe_md *lsm = NULL, *lsm2;

	OBDO_ALLOC(oa);
	if (oa == NULL)
		return -ENOMEM;

	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm))
		GOTO(out, rc = -ENOENT);

	/* Working copy of the stripe MD: header plus one lov_oinfo per
	 * stripe. */
	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
		   (lsm->lsm_stripe_count));

	OBD_ALLOC_LARGE(lsm2, lsm_size);
	if (lsm2 == NULL)
		GOTO(out, rc = -ENOMEM);

	oa->o_oi = *oi;
	/* o_nlink is reused here to carry the target OST index. */
	oa->o_nlink = ost_idx;
	oa->o_flags |= OBD_FL_RECREATE_OBJS;
	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
	memcpy(lsm2, lsm, lsm_size);
	ll_inode_size_lock(inode);
	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
	ll_inode_size_unlock(inode);

	OBD_FREE_LARGE(lsm2, lsm_size);
	GOTO(out, rc);
out:
	/* Common cleanup: both are NULL-safe in the early-exit paths. */
	ccc_inode_lsm_put(inode, lsm);
	OBDO_FREE(oa);
	return rc;
}
1336
1337static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1338{
1339	struct ll_recreate_obj ucreat;
1340	struct ost_id		oi;
1341
1342	if (!capable(CFS_CAP_SYS_ADMIN))
1343		return -EPERM;
1344
1345	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1346			   sizeof(ucreat)))
1347		return -EFAULT;
1348
1349	ostid_set_seq_mdt0(&oi);
1350	ostid_set_id(&oi, ucreat.lrc_id);
1351	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1352}
1353
1354static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1355{
1356	struct lu_fid	fid;
1357	struct ost_id	oi;
1358	obd_count	ost_idx;
1359
1360	if (!capable(CFS_CAP_SYS_ADMIN))
1361		return -EPERM;
1362
1363	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1364		return -EFAULT;
1365
1366	fid_to_ostid(&fid, &oi);
1367	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1368	return ll_lov_recreate(inode, &oi, ost_idx);
1369}
1370
/**
 * Create the file layout from the user-supplied \a lum by re-opening the
 * file with an IT_OPEN intent carrying the striping EA.  Fails with
 * -EEXIST if the file is already striped.
 *
 * \retval 0 on success, negative errno on failure
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
			     int flags, struct lov_user_md *lum, int lum_size)
{
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
	int rc = 0;

	lsm = ccc_inode_lsm_get(inode);
	if (lsm != NULL) {
		/* A layout already exists; it cannot be replaced here. */
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
		       inode->i_ino);
		GOTO(out, rc = -EEXIST);
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(file, lum, lum_size, &oit);
	if (rc)
		GOTO(out_unlock, rc);
	rc = oit.d.lustre.it_status;
	if (rc < 0)
		GOTO(out_req_free, rc);

	/* Open succeeded only to install the layout; close the handle. */
	ll_release_openhandle(file->f_dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	/* lsm is NULL on every path reaching here (the non-NULL case
	 * returned -EEXIST above); the put is presumably a NULL-safe no-op
	 * as in ll_lov_getstripe() — confirm. */
	ccc_inode_lsm_put(inode, lsm);
out:
	cl_lov_delay_create_clear(&file->f_flags);
	return rc;
out_req_free:
	/* it_status < 0: drop the request, then do the normal teardown. */
	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
	goto out;
}
1407
/**
 * Fetch the LOV EA of \a filename (child of \a inode) from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request, byte-swapped
 * to host endianness; the caller owns *request and must eventually call
 * ptlrpc_req_finished() on it.
 *
 * \retval 0 on success, negative errno (-ENODATA if no layout) on failure
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body  *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_max_mdsize(sbi, &lmmsize);
	if (rc)
		return rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	/* No striping EA present at all. */
	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
			lmmsize == 0) {
		GOTO(out, rc = -ENODATA);
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		GOTO(out, rc = -EPROTO);
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian.  We convert it to host endian before
	 * passing it to userspace.
	 */
	/* LOV_MAGIC != le32(LOV_MAGIC) only on big-endian hosts, so the
	 * swabbing below is compiled away on little-endian machines. */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count;

		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		/* Released (HSM-archived) files have no stripe objects. */
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
			stripe_count = 0;

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 stripe_count);
		}
	}

out:
	/* Outputs are set even on failure (lmm NULL, req to be freed by
	 * the caller). */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1491
1492static int ll_lov_setea(struct inode *inode, struct file *file,
1493			    unsigned long arg)
1494{
1495	int			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1496	struct lov_user_md	*lump;
1497	int			 lum_size = sizeof(struct lov_user_md) +
1498					    sizeof(struct lov_user_ost_data);
1499	int			 rc;
1500
1501	if (!capable(CFS_CAP_SYS_ADMIN))
1502		return -EPERM;
1503
1504	OBD_ALLOC_LARGE(lump, lum_size);
1505	if (lump == NULL)
1506		return -ENOMEM;
1507
1508	if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1509		OBD_FREE_LARGE(lump, lum_size);
1510		return -EFAULT;
1511	}
1512
1513	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1514
1515	OBD_FREE_LARGE(lump, lum_size);
1516	return rc;
1517}
1518
/**
 * LL_IOC_LOV_SETSTRIPE handler: apply a v1 or v3 lov_user_md from user
 * space as the file's layout, then copy the instantiated layout back.
 */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct lov_user_md_v3	 lumv3;
	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
	int			 lum_size, rc;
	int			 flags = FMODE_WRITE;

	/* first try with v1 which is smaller than v3 */
	lum_size = sizeof(struct lov_user_md_v1);
	if (copy_from_user(lumv1, lumv1p, lum_size))
		return -EFAULT;

	/* v3 shares the v1 prefix; if the magic says v3, re-read the full
	 * structure. */
	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
		lum_size = sizeof(struct lov_user_md_v3);
		if (copy_from_user(&lumv3, lumv3p, lum_size))
			return -EFAULT;
	}

	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
	if (rc == 0) {
		struct lov_stripe_md *lsm;
		__u32 gen;

		/* NOTE(review): put_user()'s return value is ignored here,
		 * so a fault writing back lmm_stripe_count goes unreported
		 * — confirm whether that is intentional. */
		put_user(0, &lumv1p->lmm_stripe_count);

		/* Pick up the new layout generation, then return the
		 * instantiated layout to the caller. */
		ll_layout_refresh(inode, &gen);
		lsm = ccc_inode_lsm_get(inode);
		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
				   0, lsm, (void *)arg);
		ccc_inode_lsm_put(inode, lsm);
	}
	return rc;
}
1555
1556static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1557{
1558	struct lov_stripe_md *lsm;
1559	int rc = -ENODATA;
1560
1561	lsm = ccc_inode_lsm_get(inode);
1562	if (lsm != NULL)
1563		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1564				   lsm, (void *)arg);
1565	ccc_inode_lsm_put(inode, lsm);
1566	return rc;
1567}
1568
/**
 * Take a group lock with gid \a arg on the whole file on behalf of this
 * file descriptor.  At most one group lock may be held per descriptor.
 *
 * \retval 0 on success, -EINVAL if already locked, negative errno otherwise
 */
int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;
	int		     rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	/* First check under lli_lock: bail out if this fd already holds a
	 * group lock. */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* The actual lock acquisition may block, so it is done without
	 * lli_lock held. */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		return rc;

	/* Re-check under lli_lock: another thread may have raced us while
	 * we were blocked in cl_get_grouplock(). */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		return -EINVAL;
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	return 0;
}
1609
/**
 * Release the group lock with gid \a arg held by this file descriptor.
 *
 * \retval 0 on success, -EINVAL if no matching group lock is held
 */
int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	/* The caller must pass the gid it locked with. */
	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		       arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}

	/* Detach the lock from the fd under lli_lock, then release it
	 * outside the spinlock since cl_put_grouplock() may block. */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	return 0;
}
1640
1641/**
1642 * Close inode open handle
1643 *
1644 * \param dentry [in]     dentry which contains the inode
1645 * \param it     [in,out] intent which contains open info and result
1646 *
1647 * \retval 0     success
1648 * \retval <0    failure
1649 */
1650int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1651{
1652	struct inode *inode = dentry->d_inode;
1653	struct obd_client_handle *och;
1654	int rc;
1655
1656	LASSERT(inode);
1657
1658	/* Root ? Do nothing. */
1659	if (dentry->d_inode->i_sb->s_root == dentry)
1660		return 0;
1661
1662	/* No open handle to close? Move away */
1663	if (!it_disposition(it, DISP_OPEN_OPEN))
1664		return 0;
1665
1666	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1667
1668	OBD_ALLOC(och, sizeof(*och));
1669	if (!och)
1670		GOTO(out, rc = -ENOMEM);
1671
1672	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1673
1674	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1675				       inode, och, NULL);
1676out:
1677	/* this one is in place of ll_file_open */
1678	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1679		ptlrpc_req_finished(it->d.lustre.it_data);
1680		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1681	}
1682	return rc;
1683}
1684
1685/**
1686 * Get size for inode for which FIEMAP mapping is requested.
1687 * Make the FIEMAP get_info call and returns the result.
1688 */
1689int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1690	      int num_bytes)
1691{
1692	struct obd_export *exp = ll_i2dtexp(inode);
1693	struct lov_stripe_md *lsm = NULL;
1694	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1695	int vallen = num_bytes;
1696	int rc;
1697
1698	/* Checks for fiemap flags */
1699	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1700		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1701		return -EBADR;
1702	}
1703
1704	/* Check for FIEMAP_FLAG_SYNC */
1705	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1706		rc = filemap_fdatawrite(inode->i_mapping);
1707		if (rc)
1708			return rc;
1709	}
1710
1711	lsm = ccc_inode_lsm_get(inode);
1712	if (lsm == NULL)
1713		return -ENOENT;
1714
1715	/* If the stripe_count > 1 and the application does not understand
1716	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1717	 */
1718	if (lsm->lsm_stripe_count > 1 &&
1719	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1720		GOTO(out, rc = -EOPNOTSUPP);
1721
1722	fm_key.oa.o_oi = lsm->lsm_oi;
1723	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1724
1725	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1726	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1727	/* If filesize is 0, then there would be no objects for mapping */
1728	if (fm_key.oa.o_size == 0) {
1729		fiemap->fm_mapped_extents = 0;
1730		GOTO(out, rc = 0);
1731	}
1732
1733	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1734
1735	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1736			  fiemap, lsm);
1737	if (rc)
1738		CERROR("obd_get_info failed: rc = %d\n", rc);
1739
1740out:
1741	ccc_inode_lsm_put(inode, lsm);
1742	return rc;
1743}
1744
1745int ll_fid2path(struct inode *inode, void *arg)
1746{
1747	struct obd_export	*exp = ll_i2mdexp(inode);
1748	struct getinfo_fid2path	*gfout, *gfin;
1749	int			 outsize, rc;
1750
1751	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1752	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1753		return -EPERM;
1754
1755	/* Need to get the buflen */
1756	OBD_ALLOC_PTR(gfin);
1757	if (gfin == NULL)
1758		return -ENOMEM;
1759	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1760		OBD_FREE_PTR(gfin);
1761		return -EFAULT;
1762	}
1763
1764	outsize = sizeof(*gfout) + gfin->gf_pathlen;
1765	OBD_ALLOC(gfout, outsize);
1766	if (gfout == NULL) {
1767		OBD_FREE_PTR(gfin);
1768		return -ENOMEM;
1769	}
1770	memcpy(gfout, gfin, sizeof(*gfout));
1771	OBD_FREE_PTR(gfin);
1772
1773	/* Call mdc_iocontrol */
1774	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1775	if (rc)
1776		GOTO(gf_free, rc);
1777
1778	if (copy_to_user(arg, gfout, outsize))
1779		rc = -EFAULT;
1780
1781gf_free:
1782	OBD_FREE(gfout, outsize);
1783	return rc;
1784}
1785
1786static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1787{
1788	struct ll_user_fiemap *fiemap_s;
1789	size_t num_bytes, ret_bytes;
1790	unsigned int extent_count;
1791	int rc = 0;
1792
1793	/* Get the extent count so we can calculate the size of
1794	 * required fiemap buffer */
1795	if (get_user(extent_count,
1796	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1797		return -EFAULT;
1798	num_bytes = sizeof(*fiemap_s) + (extent_count *
1799					 sizeof(struct ll_fiemap_extent));
1800
1801	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1802	if (fiemap_s == NULL)
1803		return -ENOMEM;
1804
1805	/* get the fiemap value */
1806	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1807			   sizeof(*fiemap_s)))
1808		GOTO(error, rc = -EFAULT);
1809
1810	/* If fm_extent_count is non-zero, read the first extent since
1811	 * it is used to calculate end_offset and device from previous
1812	 * fiemap call. */
1813	if (extent_count) {
1814		if (copy_from_user(&fiemap_s->fm_extents[0],
1815		    (char __user *)arg + sizeof(*fiemap_s),
1816		    sizeof(struct ll_fiemap_extent)))
1817			GOTO(error, rc = -EFAULT);
1818	}
1819
1820	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1821	if (rc)
1822		GOTO(error, rc);
1823
1824	ret_bytes = sizeof(struct ll_user_fiemap);
1825
1826	if (extent_count != 0)
1827		ret_bytes += (fiemap_s->fm_mapped_extents *
1828				 sizeof(struct ll_fiemap_extent));
1829
1830	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1831		rc = -EFAULT;
1832
1833error:
1834	OBD_FREE_LARGE(fiemap_s, num_bytes);
1835	return rc;
1836}
1837
1838/*
1839 * Read the data_version for inode.
1840 *
1841 * This value is computed using stripe object version on OST.
1842 * Version is computed using server side locking.
1843 *
1844 * @param extent_lock  Take extent lock. Not needed if a process is already
1845 *		       holding the OST object group locks.
1846 */
1847int ll_data_version(struct inode *inode, __u64 *data_version,
1848		    int extent_lock)
1849{
1850	struct lov_stripe_md	*lsm = NULL;
1851	struct ll_sb_info	*sbi = ll_i2sbi(inode);
1852	struct obdo		*obdo = NULL;
1853	int			 rc;
1854
1855	/* If no stripe, we consider version is 0. */
1856	lsm = ccc_inode_lsm_get(inode);
1857	if (!lsm_has_objects(lsm)) {
1858		*data_version = 0;
1859		CDEBUG(D_INODE, "No object for inode\n");
1860		GOTO(out, rc = 0);
1861	}
1862
1863	OBD_ALLOC_PTR(obdo);
1864	if (obdo == NULL)
1865		GOTO(out, rc = -ENOMEM);
1866
1867	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1868	if (rc == 0) {
1869		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1870			rc = -EOPNOTSUPP;
1871		else
1872			*data_version = obdo->o_data_version;
1873	}
1874
1875	OBD_FREE_PTR(obdo);
1876out:
1877	ccc_inode_lsm_put(inode, lsm);
1878	return rc;
1879}
1880
1881/*
1882 * Trigger a HSM release request for the provided inode.
1883 */
1884int ll_hsm_release(struct inode *inode)
1885{
1886	struct cl_env_nest nest;
1887	struct lu_env *env;
1888	struct obd_client_handle *och = NULL;
1889	__u64 data_version = 0;
1890	int rc;
1891
1892
1893	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1894	       ll_get_fsname(inode->i_sb, NULL, 0),
1895	       PFID(&ll_i2info(inode)->lli_fid));
1896
1897	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1898	if (IS_ERR(och))
1899		GOTO(out, rc = PTR_ERR(och));
1900
1901	/* Grab latest data_version and [am]time values */
1902	rc = ll_data_version(inode, &data_version, 1);
1903	if (rc != 0)
1904		GOTO(out, rc);
1905
1906	env = cl_env_nested_get(&nest);
1907	if (IS_ERR(env))
1908		GOTO(out, rc = PTR_ERR(env));
1909
1910	ll_merge_lvb(env, inode);
1911	cl_env_nested_put(&nest, env);
1912
1913	/* Release the file.
1914	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1915	 * we still need it to pack l_remote_handle to MDT. */
1916	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1917				       &data_version);
1918	och = NULL;
1919
1920
1921out:
1922	if (och != NULL && !IS_ERR(och)) /* close the file */
1923		ll_lease_close(och, inode, NULL);
1924
1925	return rc;
1926}
1927
/* Scratch state for ll_swap_layouts(): attributes to restore and the data
 * versions to verify for the two inodes whose layouts are swapped. */
struct ll_swap_stack {
	struct iattr		 ia1, ia2;	/* saved [am]times to restore */
	__u64			 dv1, dv2;	/* expected data versions */
	struct inode		*inode1, *inode2;
	bool			 check_dv1, check_dv2; /* verify dv1/dv2? */
};
1934
/**
 * Swap the layouts of two files (LL_IOC_LOV_SWAP_LAYOUTS).
 *
 * Optionally takes a group lock with \a lsl->sl_gid on both files to flush
 * dirty cache, verifies the expected data versions if requested, performs
 * the swap on the MDT, and restores mtime/atime if asked to keep them.
 *
 * \retval 0 on success, negative errno (-EAGAIN if a data version
 *	     changed) on failure
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
{
	struct mdc_swap_layouts	 msl;
	struct md_op_data	*op_data;
	__u32			 gid;
	__u64			 dv;
	struct ll_swap_stack	*llss = NULL;
	int			 rc;

	OBD_ALLOC_PTR(llss);
	if (llss == NULL)
		return -ENOMEM;

	llss->inode1 = file1->f_dentry->d_inode;
	llss->inode2 = file2->f_dentry->d_inode;

	if (!S_ISREG(llss->inode2->i_mode))
		GOTO(free, rc = -EINVAL);

	if (inode_permission(llss->inode1, MAY_WRITE) ||
	    inode_permission(llss->inode2, MAY_WRITE))
		GOTO(free, rc = -EPERM);

	if (llss->inode2->i_sb != llss->inode1->i_sb)
		GOTO(free, rc = -EXDEV);

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */
		GOTO(free, rc = 0);

	/* Order the pair by FID so two concurrent swaps on the same files
	 * always lock in the same order (deadlock avoidance). */
	if (rc < 0) { /* sequentialize it */
		swap(llss->inode1, llss->inode2);
		swap(file1, file2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);
	}

	gid = lsl->sl_gid;
	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);
		if (rc < 0)
			GOTO(free, rc);

		rc = ll_get_grouplock(llss->inode2, file2, gid);
		if (rc < 0) {
			ll_put_grouplock(llss->inode1, file1, gid);
			GOTO(free, rc);
		}
	}

	/* to be able to restore mtime and atime after swap
	 * we need to first save them */
	if (lsl->sl_flags &
	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_mtime = llss->inode1->i_mtime;
		llss->ia1.ia_atime = llss->inode1->i_atime;
		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
		llss->ia2.ia_mtime = llss->inode2->i_mtime;
		llss->ia2.ia_atime = llss->inode2->i_atime;
		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
	}

	/* ultimate check, before swaping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv1)
			GOTO(putgl, rc = -EAGAIN);
	}

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv2)
			GOTO(putgl, rc = -EAGAIN);
	}

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */
	msl.msl_flags = 0;
	rc = -ENOMEM;
	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (IS_ERR(op_data))
		GOTO(free, rc = PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
			   sizeof(*op_data), op_data, NULL);
	ll_finish_md_op_data(op_data);

	/* NB: "putgl" is only reached after "gid" has been assigned above;
	 * all earlier failures jump straight to "free". */
putgl:
	if (gid != 0) {
		ll_put_grouplock(llss->inode2, file2, gid);
		ll_put_grouplock(llss->inode1, file1, gid);
	}

	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
	if (rc != 0)
		GOTO(free, rc);

	/* clear useless flags */
	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
		llss->ia1.ia_valid &= ~ATTR_MTIME;
		llss->ia2.ia_valid &= ~ATTR_MTIME;
	}

	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_valid &= ~ATTR_ATIME;
		llss->ia2.ia_valid &= ~ATTR_ATIME;
	}

	/* update time if requested */
	/* The layouts were swapped, so each inode gets the other's saved
	 * timestamps (ia2 -> inode1, ia1 -> inode2). */
	rc = 0;
	if (llss->ia2.ia_valid != 0) {
		mutex_lock(&llss->inode1->i_mutex);
		rc = ll_setattr(file1->f_dentry, &llss->ia2);
		mutex_unlock(&llss->inode1->i_mutex);
	}

	if (llss->ia1.ia_valid != 0) {
		int rc1;

		mutex_lock(&llss->inode2->i_mutex);
		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
		mutex_unlock(&llss->inode2->i_mutex);
		if (rc == 0)
			rc = rc1;
	}

free:
	if (llss != NULL)
		OBD_FREE_PTR(llss);

	return rc;
}
2088
2089static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2090{
2091	struct md_op_data	*op_data;
2092	int			 rc;
2093
2094	/* Non-root users are forbidden to set or clear flags which are
2095	 * NOT defined in HSM_USER_MASK. */
2096	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2097	    !capable(CFS_CAP_SYS_ADMIN))
2098		return -EPERM;
2099
2100	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2101				     LUSTRE_OPC_ANY, hss);
2102	if (IS_ERR(op_data))
2103		return PTR_ERR(op_data);
2104
2105	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2106			   sizeof(*op_data), op_data, NULL);
2107
2108	ll_finish_md_op_data(op_data);
2109
2110	return rc;
2111}
2112
2113static int ll_hsm_import(struct inode *inode, struct file *file,
2114			 struct hsm_user_import *hui)
2115{
2116	struct hsm_state_set	*hss = NULL;
2117	struct iattr		*attr = NULL;
2118	int			 rc;
2119
2120
2121	if (!S_ISREG(inode->i_mode))
2122		return -EINVAL;
2123
2124	/* set HSM flags */
2125	OBD_ALLOC_PTR(hss);
2126	if (hss == NULL)
2127		GOTO(out, rc = -ENOMEM);
2128
2129	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2130	hss->hss_archive_id = hui->hui_archive_id;
2131	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2132	rc = ll_hsm_state_set(inode, hss);
2133	if (rc != 0)
2134		GOTO(out, rc);
2135
2136	OBD_ALLOC_PTR(attr);
2137	if (attr == NULL)
2138		GOTO(out, rc = -ENOMEM);
2139
2140	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2141	attr->ia_mode |= S_IFREG;
2142	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2143	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2144	attr->ia_size = hui->hui_size;
2145	attr->ia_mtime.tv_sec = hui->hui_mtime;
2146	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2147	attr->ia_atime.tv_sec = hui->hui_atime;
2148	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2149
2150	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2151			 ATTR_UID | ATTR_GID |
2152			 ATTR_MTIME | ATTR_MTIME_SET |
2153			 ATTR_ATIME | ATTR_ATIME_SET;
2154
2155	rc = ll_setattr_raw(file->f_dentry, attr, true);
2156	if (rc == -ENODATA)
2157		rc = 0;
2158
2159out:
2160	if (hss != NULL)
2161		OBD_FREE_PTR(hss);
2162
2163	if (attr != NULL)
2164		OBD_FREE_PTR(attr);
2165
2166	return rc;
2167}
2168
/*
 * unlocked_ioctl handler for regular Lustre files.
 *
 * Dispatches Lustre-specific ioctls: per-fd flags, LOV striping, layout
 * swap, grouplocks, fid/path translation, data version, HSM state and
 * import, and open leases.  Commands not handled here are first offered
 * to dynamically registered handlers via ll_iocontrol_call() and finally
 * forwarded to the data (OST) export through obd_iocontrol().
 */
long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode		*inode = file->f_dentry->d_inode;
	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
	int			 flags, rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
	       inode->i_generation, inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
		return -ENOTTY;

	switch(cmd) {
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 *     not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int *) arg))
			return -EFAULT;

		if (cmd == LL_IOC_SETFLAGS) {
			/* disabling locking only makes sense for O_DIRECT */
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on "
				       "non-O_DIRECT file\n", current->comm);
				return -EINVAL;
			}

			fd->fd_flags |= flags;
		} else {
			fd->fd_flags &= ~flags;
		}
		return 0;
	case LL_IOC_LOV_SETSTRIPE:
		return ll_lov_setstripe(inode, file, arg);
	case LL_IOC_LOV_SETEA:
		return ll_lov_setea(inode, file, arg);
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct file *file2;
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char *)arg,
				       sizeof(struct lustre_swap_layouts)))
			return -EFAULT;

		/* both files must be opened with write access */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
			return -EPERM;

		file2 = fget(lsl.sl_fd);
		if (file2 == NULL)
			return -EBADF;

		rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);
		fput(file2);
		return rc;
	}
	case LL_IOC_LOV_GETSTRIPE:
		return ll_lov_getstripe(inode, arg);
	case LL_IOC_RECREATE_OBJ:
		return ll_lov_recreate_obj(inode, arg);
	case LL_IOC_RECREATE_FID:
		return ll_lov_recreate_fid(inode, arg);
	case FSFILT_IOC_FIEMAP:
		return ll_ioctl_fiemap(inode, arg);
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		return ll_iocontrol(inode, file, cmd, arg);
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		return put_user(inode->i_generation, (int *)arg);
	case LL_IOC_GROUP_LOCK:
		return ll_get_grouplock(inode, file, arg);
	case LL_IOC_GROUP_UNLOCK:
		return ll_put_grouplock(inode, file, arg);
	case IOC_OBD_STATFS:
		return ll_obd_statfs(inode, (void *)arg);

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		return ll_flush_ctx(inode);
	case LL_IOC_PATH2FID: {
		/* copy this file's FID out to user space */
		if (copy_to_user((void *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_FID2PATH:
		return ll_fid2path(inode, (void *)arg);
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version	idv;
		int			rc;

		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
			return -EFAULT;

		rc = ll_data_version(inode, &idv.idv_version,
				!(idv.idv_flags & LL_DV_NOFLUSH));

		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
			return -EFAULT;

		return rc;
	}

	case LL_IOC_GET_MDTIDX: {
		int mdtidx;

		mdtidx = ll_get_mdt_idx(inode);
		if (mdtidx < 0)
			return mdtidx;

		if (put_user((int)mdtidx, (int*)arg))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		return ll_get_obd_name(inode, cmd, arg);
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data	*op_data;
		struct hsm_user_state	*hus;
		int			 rc;

		OBD_ALLOC_PTR(hus);
		if (hus == NULL)
			return -ENOMEM;

		/* hus is passed to the MDT through op_data->op_data and the
		 * result copied back to user space */
		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hus);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hus);
		return rc;
	}
	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set	*hss;
		int			 rc;

		OBD_ALLOC_PTR(hss);
		if (hss == NULL)
			return -ENOMEM;

		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
			OBD_FREE_PTR(hss);
			return -EFAULT;
		}

		rc = ll_hsm_state_set(inode, hss);

		OBD_FREE_PTR(hss);
		return rc;
	}
	case LL_IOC_HSM_ACTION: {
		struct md_op_data		*op_data;
		struct hsm_current_action	*hca;
		int				 rc;

		OBD_ALLOC_PTR(hca);
		if (hca == NULL)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hca);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hca);
		return rc;
	}
	case LL_IOC_SET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct obd_client_handle *och = NULL;
		bool lease_broken;
		fmode_t mode = 0;

		/* F_WRLCK/F_RDLCK request a lease; F_UNLCK releases the
		 * lease held on this fd, if any */
		switch (arg) {
		case F_WRLCK:
			if (!(file->f_mode & FMODE_WRITE))
				return -EPERM;
			mode = FMODE_WRITE;
			break;
		case F_RDLCK:
			if (!(file->f_mode & FMODE_READ))
				return -EPERM;
			mode = FMODE_READ;
			break;
		case F_UNLCK:
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			}
			mutex_unlock(&lli->lli_och_mutex);

			if (och != NULL) {
				mode = och->och_flags &
				       (FMODE_READ|FMODE_WRITE);
				rc = ll_lease_close(och, inode, &lease_broken);
				if (rc == 0 && lease_broken)
					mode = 0;
			} else {
				rc = -ENOLCK;
			}

			/* return the type of lease or error */
			return rc < 0 ? rc : (int)mode;
		default:
			return -EINVAL;
		}

		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);

		/* apply for lease */
		och = ll_lease_open(inode, file, mode, 0);
		if (IS_ERR(och))
			return PTR_ERR(och);

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och == NULL) {
			fd->fd_lease_och = och;
			och = NULL;
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (och != NULL) {
			/* impossible now that only excl is supported for now */
			ll_lease_close(och, inode, &lease_broken);
			rc = -EBUSY;
		}
		return rc;
	}
	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;

		/* report the mode of a still-valid (uncancelled) lease */
		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;

			lock = ldlm_handle2lock(&och->och_lease_handle);
			if (lock != NULL) {
				lock_res_and_lock(lock);
				if (!ldlm_is_cancel(lock))
					rc = och->och_flags &
						(FMODE_READ | FMODE_WRITE);
				unlock_res_and_lock(lock);
				ldlm_lock_put(lock);
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		return rc;
	}
	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;

		OBD_ALLOC_PTR(hui);
		if (hui == NULL)
			return -ENOMEM;

		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
			OBD_FREE_PTR(hui);
			return -EFAULT;
		}

		rc = ll_hsm_import(inode, file, hui);

		OBD_FREE_PTR(hui);
		return rc;
	}
	default: {
		int err;

		/* give dynamically registered ioctl handlers a chance first */
		if (LLIOC_STOP ==
		     ll_iocontrol_call(inode, file, cmd, arg, &err))
			return err;

		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void *)arg);
	}
	}
}
2486
2487
2488loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2489{
2490	struct inode *inode = file->f_dentry->d_inode;
2491	loff_t retval, eof = 0;
2492
2493	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2494			   (origin == SEEK_CUR) ? file->f_pos : 0);
2495	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2496	       inode->i_ino, inode->i_generation, inode, retval, retval,
2497	       origin);
2498	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2499
2500	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2501		retval = ll_glimpse_size(inode);
2502		if (retval != 0)
2503			return retval;
2504		eof = i_size_read(inode);
2505	}
2506
2507	retval = generic_file_llseek_size(file, offset, origin,
2508					  ll_file_maxbytes(inode), eof);
2509	return retval;
2510}
2511
2512int ll_flush(struct file *file, fl_owner_t id)
2513{
2514	struct inode *inode = file->f_dentry->d_inode;
2515	struct ll_inode_info *lli = ll_i2info(inode);
2516	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2517	int rc, err;
2518
2519	LASSERT(!S_ISDIR(inode->i_mode));
2520
2521	/* catch async errors that were recorded back when async writeback
2522	 * failed for pages in this mapping. */
2523	rc = lli->lli_async_rc;
2524	lli->lli_async_rc = 0;
2525	err = lov_read_and_clear_async_rc(lli->lli_clob);
2526	if (rc == 0)
2527		rc = err;
2528
2529	/* The application has been told write failure already.
2530	 * Do not report failure again. */
2531	if (fd->fd_write_failed)
2532		return 0;
2533	return rc ? -EIO : 0;
2534}
2535
2536/**
2537 * Called to make sure a portion of file has been written out.
2538 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2539 *
2540 * Return how many pages have been written.
2541 */
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct obd_capa *capa = NULL;
	struct cl_fsync_io *fio;
	int result;

	/* reject unknown fsync modes up front */
	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
		return -EINVAL;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	/* OSS write capability for the sync RPCs */
	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);

	io = ccc_env_thread_io(env);
	io->ci_obj = cl_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;

	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_capa = capa;
	fio->fi_start = start;
	fio->fi_end = end;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;

	/* run a CIT_FSYNC io; on success the result is the number of
	 * pages written out */
	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;
	if (result == 0)
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	capa_put(capa);

	return result;
}
2588
2589/*
2590 * When dentry is provided (the 'else' case), *file->f_dentry may be
2591 * null and dentry must be used directly rather than pulled from
2592 * *file->f_dentry as is done otherwise.
2593 */
2594
/*
 * fsync(2)/fdatasync(2) entry point: flush dirty pages in [start, end],
 * fold in any recorded async writeback errors, sync metadata with the
 * MDS and, for datasync on regular files, force a full OST sync.
 */
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	struct obd_capa *oc;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	/* write out the requested range before talking to the servers */
	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	/* sync metadata on the MDS */
	oc = ll_mdscapa_get(inode);
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
		      &req);
	capa_put(oc);
	if (!rc)
		rc = err;
	if (!err)
		ptlrpc_req_finished(req);

	if (datasync && S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		/* push the data to the OSTs; remember the outcome so that
		 * ll_flush() does not report the same failure again */
		err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
				CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2648
/*
 * fcntl()/flock() locking: enqueue an LDLM flock lock on the MDT and,
 * when the server grants it, mirror the lock into the local VFS lock
 * tables (posix_lock_file_wait/flock_lock_file_wait) so kernel-visible
 * lock state stays consistent.  If the local step fails, the server
 * lock is released again by enqueueing it with mode LCK_NL.
 */
int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	int flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	/* translate the VFS lock description into an LDLM flock policy */
	if (file_lock->fl_flags & FL_FLOCK) {
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
		/* flocks are whole-file locks */
		flock.l_flock.end = OFFSET_MAX;
		/* For flocks owner is determined by the local file descriptor*/
		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
	} else if (file_lock->fl_flags & FL_POSIX) {
		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
		flock.l_flock.start = file_lock->fl_start;
		flock.l_flock.end = file_lock->fl_end;
	} else {
		return -EINVAL;
	}
	flock.l_flock.pid = file_lock->fl_pid;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	/* map the fcntl lock type onto an LDLM lock mode */
	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
			file_lock->fl_type);
		return -ENOTSUPP;
	}

	/* map the fcntl command onto LDLM enqueue flags */
	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
	       "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
	       flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* mirror the server's decision into the local VFS lock tables */
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2  = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = posix_lock_file_wait(file, file_lock);

	/* local bookkeeping failed: drop the server lock again */
	if (rc2 && file_lock->fl_type != F_UNLCK) {
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2777
/*
 * flock/lock entry point for "-o noflock" mounts: file locking is
 * disabled, so always report the operation as unsupported.
 */
int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	return -ENOSYS;
}
2782
2783/**
2784 * test if some locks matching bits and l_req_mode are acquired
2785 * - bits can be in different locks
2786 * - if found clear the common lock bits in *bits
2787 * - the bits not found, are kept in *bits
2788 * \param inode [IN]
2789 * \param bits [IN] searched lock bits [IN]
2790 * \param l_req_mode [IN] searched lock mode
2791 * \retval boolean, true iff all bits are found
2792 */
2793int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2794{
2795	struct lustre_handle lockh;
2796	ldlm_policy_data_t policy;
2797	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2798				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2799	struct lu_fid *fid;
2800	__u64 flags;
2801	int i;
2802
2803	if (!inode)
2804	       return 0;
2805
2806	fid = &ll_i2info(inode)->lli_fid;
2807	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2808	       ldlm_lockname[mode]);
2809
2810	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2811	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2812		policy.l_inodebits.bits = *bits & (1 << i);
2813		if (policy.l_inodebits.bits == 0)
2814			continue;
2815
2816		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2817				  &policy, mode, &lockh)) {
2818			struct ldlm_lock *lock;
2819
2820			lock = ldlm_handle2lock(&lockh);
2821			if (lock) {
2822				*bits &=
2823				      ~(lock->l_policy_data.l_inodebits.bits);
2824				LDLM_LOCK_PUT(lock);
2825			} else {
2826				*bits &= ~policy.l_inodebits.bits;
2827			}
2828		}
2829	}
2830	return *bits == 0;
2831}
2832
2833ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2834			    struct lustre_handle *lockh, __u64 flags,
2835			    ldlm_mode_t mode)
2836{
2837	ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2838	struct lu_fid *fid;
2839	ldlm_mode_t rc;
2840
2841	fid = &ll_i2info(inode)->lli_fid;
2842	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2843
2844	rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2845			   fid, LDLM_IBITS, &policy, mode, lockh);
2846
2847	return rc;
2848}
2849
2850static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2851{
2852	/* Already unlinked. Just update nlink and return success */
2853	if (rc == -ENOENT) {
2854		clear_nlink(inode);
2855		/* This path cannot be hit for regular files unless in
2856		 * case of obscure races, so no need to validate size.
2857		 */
2858		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2859			return 0;
2860	} else if (rc != 0) {
2861		CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2862		       ll_get_fsname(inode->i_sb, NULL, 0),
2863		       PFID(ll_inode2fid(inode)), rc);
2864	}
2865
2866	return rc;
2867}
2868
/*
 * Revalidate the attributes of @dentry's inode against the MDS.
 *
 * If the server supports OBD_CONNECT_ATTRFID, this is done with an
 * intent lock request by FID (no name-based lookup); otherwise, when no
 * matching MDS lock is already held for @ibits, a plain getattr RPC is
 * issued and the inode refreshed from the reply.
 */
int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
			     __u64 ibits)
{
	struct inode *inode = dentry->d_inode;
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 *      But under CMD case, it caused some lock issues, should be fixed
	 *      with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
					     dentry->d_inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			GOTO (out, rc);
		}

		rc = ll_revalidate_it_finish(req, &oit, dentry);
		if (rc != 0) {
			ll_intent_release(&oit);
			GOTO(out, rc);
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!dentry->d_inode->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, dentry);
	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
		obd_valid valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		/* for regular files also fetch striping EA data */
		if (S_ISREG(inode->i_mode)) {
			rc = ll_get_max_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		/* refresh the in-core inode from the getattr reply */
		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2964
2965int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2966			   __u64 ibits)
2967{
2968	struct inode *inode = dentry->d_inode;
2969	int rc;
2970
2971	rc = __ll_inode_revalidate_it(dentry, it, ibits);
2972	if (rc != 0)
2973		return rc;
2974
2975	/* if object isn't regular file, don't validate size */
2976	if (!S_ISREG(inode->i_mode)) {
2977		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2978		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2979		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2980	} else {
2981		/* In case of restore, the MDT has the right size and has
2982		 * already send it back without granting the layout lock,
2983		 * inode is up-to-date so glimpse is useless.
2984		 * Also to glimpse we need the layout, in case of a running
2985		 * restore the MDT holds the layout lock so the glimpse will
2986		 * block up to the end of restore (getattr will block)
2987		 */
2988		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2989			rc = ll_glimpse_size(inode);
2990	}
2991	return rc;
2992}
2993
2994int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2995		  struct lookup_intent *it, struct kstat *stat)
2996{
2997	struct inode *inode = de->d_inode;
2998	struct ll_sb_info *sbi = ll_i2sbi(inode);
2999	struct ll_inode_info *lli = ll_i2info(inode);
3000	int res = 0;
3001
3002	res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3003					     MDS_INODELOCK_LOOKUP);
3004	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3005
3006	if (res)
3007		return res;
3008
3009	stat->dev = inode->i_sb->s_dev;
3010	if (ll_need_32bit_api(sbi))
3011		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3012	else
3013		stat->ino = inode->i_ino;
3014	stat->mode = inode->i_mode;
3015	stat->nlink = inode->i_nlink;
3016	stat->uid = inode->i_uid;
3017	stat->gid = inode->i_gid;
3018	stat->rdev = inode->i_rdev;
3019	stat->atime = inode->i_atime;
3020	stat->mtime = inode->i_mtime;
3021	stat->ctime = inode->i_ctime;
3022	stat->blksize = 1 << inode->i_blkbits;
3023
3024	stat->size = i_size_read(inode);
3025	stat->blocks = inode->i_blocks;
3026
3027	return 0;
3028}
/* getattr(2) entry point: revalidate with a plain IT_GETATTR intent. */
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
	struct lookup_intent it = { .it_op = IT_GETATTR };

	return ll_getattr_it(mnt, de, &it, stat);
}
3035
3036int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3037		__u64 start, __u64 len)
3038{
3039	int rc;
3040	size_t num_bytes;
3041	struct ll_user_fiemap *fiemap;
3042	unsigned int extent_count = fieinfo->fi_extents_max;
3043
3044	num_bytes = sizeof(*fiemap) + (extent_count *
3045				       sizeof(struct ll_fiemap_extent));
3046	OBD_ALLOC_LARGE(fiemap, num_bytes);
3047
3048	if (fiemap == NULL)
3049		return -ENOMEM;
3050
3051	fiemap->fm_flags = fieinfo->fi_flags;
3052	fiemap->fm_extent_count = fieinfo->fi_extents_max;
3053	fiemap->fm_start = start;
3054	fiemap->fm_length = len;
3055	memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3056	       sizeof(struct ll_fiemap_extent));
3057
3058	rc = ll_do_fiemap(inode, fiemap, num_bytes);
3059
3060	fieinfo->fi_flags = fiemap->fm_flags;
3061	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3062	memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3063	       fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3064
3065	OBD_FREE_LARGE(fiemap, num_bytes);
3066	return rc;
3067}
3068
3069struct posix_acl * ll_get_acl(struct inode *inode, int type)
3070{
3071	struct ll_inode_info *lli = ll_i2info(inode);
3072	struct posix_acl *acl = NULL;
3073
3074	spin_lock(&lli->lli_lock);
3075	/* VFS' acl_permission_check->check_acl will release the refcount */
3076	acl = posix_acl_dup(lli->lli_posix_acl);
3077	spin_unlock(&lli->lli_lock);
3078
3079	return acl;
3080}
3081
3082
3083int ll_inode_permission(struct inode *inode, int mask)
3084{
3085	int rc = 0;
3086
3087#ifdef MAY_NOT_BLOCK
3088	if (mask & MAY_NOT_BLOCK)
3089		return -ECHILD;
3090#endif
3091
3092       /* as root inode are NOT getting validated in lookup operation,
3093	* need to do it before permission check. */
3094
3095	if (inode == inode->i_sb->s_root->d_inode) {
3096		struct lookup_intent it = { .it_op = IT_LOOKUP };
3097
3098		rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3099					      MDS_INODELOCK_LOOKUP);
3100		if (rc)
3101			return rc;
3102	}
3103
3104	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3105	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3106
3107	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3108		return lustre_check_remote_perm(inode, mask);
3109
3110	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3111	rc = generic_permission(inode, mask);
3112
3113	return rc;
3114}
3115
3116/* -o localflock - only provides locally consistent flock locks */
3117struct file_operations ll_file_operations = {
3118	.read	   = ll_file_read,
3119	.aio_read = ll_file_aio_read,
3120	.write	  = ll_file_write,
3121	.aio_write = ll_file_aio_write,
3122	.unlocked_ioctl = ll_file_ioctl,
3123	.open	   = ll_file_open,
3124	.release	= ll_file_release,
3125	.mmap	   = ll_file_mmap,
3126	.llseek	 = ll_file_seek,
3127	.splice_read    = ll_file_splice_read,
3128	.fsync	  = ll_fsync,
3129	.flush	  = ll_flush
3130};
3131
3132struct file_operations ll_file_operations_flock = {
3133	.read	   = ll_file_read,
3134	.aio_read    = ll_file_aio_read,
3135	.write	  = ll_file_write,
3136	.aio_write   = ll_file_aio_write,
3137	.unlocked_ioctl = ll_file_ioctl,
3138	.open	   = ll_file_open,
3139	.release	= ll_file_release,
3140	.mmap	   = ll_file_mmap,
3141	.llseek	 = ll_file_seek,
3142	.splice_read    = ll_file_splice_read,
3143	.fsync	  = ll_fsync,
3144	.flush	  = ll_flush,
3145	.flock	  = ll_file_flock,
3146	.lock	   = ll_file_flock
3147};
3148
3149/* These are for -o noflock - to return ENOSYS on flock calls */
3150struct file_operations ll_file_operations_noflock = {
3151	.read	   = ll_file_read,
3152	.aio_read    = ll_file_aio_read,
3153	.write	  = ll_file_write,
3154	.aio_write   = ll_file_aio_write,
3155	.unlocked_ioctl = ll_file_ioctl,
3156	.open	   = ll_file_open,
3157	.release	= ll_file_release,
3158	.mmap	   = ll_file_mmap,
3159	.llseek	 = ll_file_seek,
3160	.splice_read    = ll_file_splice_read,
3161	.fsync	  = ll_fsync,
3162	.flush	  = ll_flush,
3163	.flock	  = ll_file_noflock,
3164	.lock	   = ll_file_noflock
3165};
3166
/* inode operations for regular files on a Lustre mount */
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.listxattr	= ll_listxattr,
	.removexattr	= ll_removexattr,
	.fiemap		= ll_fiemap,
	.get_acl	= ll_get_acl,
};
3178
3179/* dynamic ioctl number support routins */
3180static struct llioc_ctl_data {
3181	struct rw_semaphore	ioc_sem;
3182	struct list_head	      ioc_head;
3183} llioc = {
3184	__RWSEM_INITIALIZER(llioc.ioc_sem),
3185	LIST_HEAD_INIT(llioc.ioc_head)
3186};
3187
3188
3189struct llioc_data {
3190	struct list_head	      iocd_list;
3191	unsigned int	    iocd_size;
3192	llioc_callback_t	iocd_cb;
3193	unsigned int	    iocd_count;
3194	unsigned int	    iocd_cmd[0];
3195};
3196
3197void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3198{
3199	unsigned int size;
3200	struct llioc_data *in_data = NULL;
3201
3202	if (cb == NULL || cmd == NULL ||
3203	    count > LLIOC_MAX_CMD || count < 0)
3204		return NULL;
3205
3206	size = sizeof(*in_data) + count * sizeof(unsigned int);
3207	OBD_ALLOC(in_data, size);
3208	if (in_data == NULL)
3209		return NULL;
3210
3211	memset(in_data, 0, sizeof(*in_data));
3212	in_data->iocd_size = size;
3213	in_data->iocd_cb = cb;
3214	in_data->iocd_count = count;
3215	memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3216
3217	down_write(&llioc.ioc_sem);
3218	list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3219	up_write(&llioc.ioc_sem);
3220
3221	return in_data;
3222}
3223
3224void ll_iocontrol_unregister(void *magic)
3225{
3226	struct llioc_data *tmp;
3227
3228	if (magic == NULL)
3229		return;
3230
3231	down_write(&llioc.ioc_sem);
3232	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3233		if (tmp == magic) {
3234			unsigned int size = tmp->iocd_size;
3235
3236			list_del(&tmp->iocd_list);
3237			up_write(&llioc.ioc_sem);
3238
3239			OBD_FREE(tmp, size);
3240			return;
3241		}
3242	}
3243	up_write(&llioc.ioc_sem);
3244
3245	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3246}
3247
/* exported so other (presumably Lustre) modules can register handlers */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
3250
3251enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3252			unsigned int cmd, unsigned long arg, int *rcp)
3253{
3254	enum llioc_iter ret = LLIOC_CONT;
3255	struct llioc_data *data;
3256	int rc = -EINVAL, i;
3257
3258	down_read(&llioc.ioc_sem);
3259	list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3260		for (i = 0; i < data->iocd_count; i++) {
3261			if (cmd != data->iocd_cmd[i])
3262				continue;
3263
3264			ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3265			break;
3266		}
3267
3268		if (ret == LLIOC_STOP)
3269			break;
3270	}
3271	up_read(&llioc.ioc_sem);
3272
3273	if (rcp)
3274		*rcp = rc;
3275	return ret;
3276}
3277
3278int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3279{
3280	struct ll_inode_info *lli = ll_i2info(inode);
3281	struct cl_env_nest nest;
3282	struct lu_env *env;
3283	int result;
3284
3285	if (lli->lli_clob == NULL)
3286		return 0;
3287
3288	env = cl_env_nested_get(&nest);
3289	if (IS_ERR(env))
3290		return PTR_ERR(env);
3291
3292	result = cl_conf_set(env, lli->lli_clob, conf);
3293	cl_env_nested_put(&nest, env);
3294
3295	if (conf->coc_opc == OBJECT_CONF_SET) {
3296		struct ldlm_lock *lock = conf->coc_lock;
3297
3298		LASSERT(lock != NULL);
3299		LASSERT(ldlm_has_layout(lock));
3300		if (result == 0) {
3301			/* it can only be allowed to match after layout is
3302			 * applied to inode otherwise false layout would be
3303			 * seen. Applying layout shoud happen before dropping
3304			 * the intent lock. */
3305			ldlm_lock_allow_match(lock);
3306		}
3307	}
3308	return result;
3309}
3310
/*
 * Fetch layout from MDT with getxattr request, if it's not ready yet.
 *
 * On success the fetched LOV EA is attached to @lock as its LVB data
 * (replacing any previous buffer).  Returns 0 on success or if the LVB
 * is already ready, negative errno on failure.
 */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)

{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obd_capa *oc;
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* layout already attached to the lock and marked ready: nothing to do */
	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
		return 0;

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	oc = ll_mdscapa_get(inode);
	rc = ll_get_max_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
				lmmsize, 0, &req);
	capa_put(oc);
	if (rc < 0)
		return rc;

	/* sanity-check the reply: body present, EA fits the buffer we asked for */
	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL || body->eadatasize > lmmsize)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	/* copy out of the rpc reply buffer so the LVB outlives the request */
	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	/* swap in the new LVB under the resource lock, freeing the old one */
	lock_res_and_lock(lock);
	if (lock->l_lvb_data != NULL)
		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);

	lock->l_lvb_data = lvbdata;
	lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

out:
	ptlrpc_req_finished(req);
	return rc;
}
3375
3376/**
3377 * Apply the layout to the inode. Layout lock is held and will be released
3378 * in this function.
3379 */
3380static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3381				struct inode *inode, __u32 *gen, bool reconf)
3382{
3383	struct ll_inode_info *lli = ll_i2info(inode);
3384	struct ll_sb_info    *sbi = ll_i2sbi(inode);
3385	struct ldlm_lock *lock;
3386	struct lustre_md md = { NULL };
3387	struct cl_object_conf conf;
3388	int rc = 0;
3389	bool lvb_ready;
3390	bool wait_layout = false;
3391
3392	LASSERT(lustre_handle_is_used(lockh));
3393
3394	lock = ldlm_handle2lock(lockh);
3395	LASSERT(lock != NULL);
3396	LASSERT(ldlm_has_layout(lock));
3397
3398	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3399		   inode, PFID(&lli->lli_fid), reconf);
3400
3401	/* in case this is a caching lock and reinstate with new inode */
3402	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3403
3404	lock_res_and_lock(lock);
3405	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3406	unlock_res_and_lock(lock);
3407	/* checking lvb_ready is racy but this is okay. The worst case is
3408	 * that multi processes may configure the file on the same time. */
3409	if (lvb_ready || !reconf) {
3410		rc = -ENODATA;
3411		if (lvb_ready) {
3412			/* layout_gen must be valid if layout lock is not
3413			 * cancelled and stripe has already set */
3414			*gen = lli->lli_layout_gen;
3415			rc = 0;
3416		}
3417		GOTO(out, rc);
3418	}
3419
3420	rc = ll_layout_fetch(inode, lock);
3421	if (rc < 0)
3422		GOTO(out, rc);
3423
3424	/* for layout lock, lmm is returned in lock's lvb.
3425	 * lvb_data is immutable if the lock is held so it's safe to access it
3426	 * without res lock. See the description in ldlm_lock_decref_internal()
3427	 * for the condition to free lvb_data of layout lock */
3428	if (lock->l_lvb_data != NULL) {
3429		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3430				  lock->l_lvb_data, lock->l_lvb_len);
3431		if (rc >= 0) {
3432			*gen = LL_LAYOUT_GEN_EMPTY;
3433			if (md.lsm != NULL)
3434				*gen = md.lsm->lsm_layout_gen;
3435			rc = 0;
3436		} else {
3437			CERROR("%s: file "DFID" unpackmd error: %d\n",
3438				ll_get_fsname(inode->i_sb, NULL, 0),
3439				PFID(&lli->lli_fid), rc);
3440		}
3441	}
3442	if (rc < 0)
3443		GOTO(out, rc);
3444
3445	/* set layout to file. Unlikely this will fail as old layout was
3446	 * surely eliminated */
3447	memset(&conf, 0, sizeof(conf));
3448	conf.coc_opc = OBJECT_CONF_SET;
3449	conf.coc_inode = inode;
3450	conf.coc_lock = lock;
3451	conf.u.coc_md = &md;
3452	rc = ll_layout_conf(inode, &conf);
3453
3454	if (md.lsm != NULL)
3455		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3456
3457	/* refresh layout failed, need to wait */
3458	wait_layout = rc == -EBUSY;
3459
3460out:
3461	LDLM_LOCK_PUT(lock);
3462	ldlm_lock_decref(lockh, mode);
3463
3464	/* wait for IO to complete if it's still being used. */
3465	if (wait_layout) {
3466		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3467			ll_get_fsname(inode->i_sb, NULL, 0),
3468			inode, PFID(&lli->lli_fid));
3469
3470		memset(&conf, 0, sizeof(conf));
3471		conf.coc_opc = OBJECT_CONF_WAIT;
3472		conf.coc_inode = inode;
3473		rc = ll_layout_conf(inode, &conf);
3474		if (rc == 0)
3475			rc = -EAGAIN;
3476
3477		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3478			PFID(&lli->lli_fid), rc);
3479	}
3480	return rc;
3481}
3482
3483/**
3484 * This function checks if there exists a LAYOUT lock on the client side,
3485 * or enqueues it if it doesn't have one in cache.
3486 *
3487 * This function will not hold layout lock so it may be revoked any time after
3488 * this function returns. Any operations depend on layout should be redone
3489 * in that case.
3490 *
3491 * This function should be called before lov_io_init() to get an uptodate
3492 * layout version, the caller should save the version number and after IO
3493 * is finished, this function should be called again to verify that layout
3494 * is not changed during IO time.
3495 */
3496int ll_layout_refresh(struct inode *inode, __u32 *gen)
3497{
3498	struct ll_inode_info  *lli = ll_i2info(inode);
3499	struct ll_sb_info     *sbi = ll_i2sbi(inode);
3500	struct md_op_data     *op_data;
3501	struct lookup_intent   it;
3502	struct lustre_handle   lockh;
3503	ldlm_mode_t	       mode;
3504	struct ldlm_enqueue_info einfo = {
3505		.ei_type = LDLM_IBITS,
3506		.ei_mode = LCK_CR,
3507		.ei_cb_bl = ll_md_blocking_ast,
3508		.ei_cb_cp = ldlm_completion_ast,
3509	};
3510	int rc;
3511
3512	*gen = lli->lli_layout_gen;
3513	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3514		return 0;
3515
3516	/* sanity checks */
3517	LASSERT(fid_is_sane(ll_inode2fid(inode)));
3518	LASSERT(S_ISREG(inode->i_mode));
3519
3520	/* mostly layout lock is caching on the local side, so try to match
3521	 * it before grabbing layout lock mutex. */
3522	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3523			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3524	if (mode != 0) { /* hit cached lock */
3525		rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3526		if (rc == 0)
3527			return 0;
3528
3529		/* better hold lli_layout_mutex to try again otherwise
3530		 * it will have starvation problem. */
3531	}
3532
3533	/* take layout lock mutex to enqueue layout lock exclusively. */
3534	mutex_lock(&lli->lli_layout_mutex);
3535
3536again:
3537	/* try again. Maybe somebody else has done this. */
3538	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3539			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3540	if (mode != 0) { /* hit cached lock */
3541		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3542		if (rc == -EAGAIN)
3543			goto again;
3544
3545		mutex_unlock(&lli->lli_layout_mutex);
3546		return rc;
3547	}
3548
3549	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3550			0, 0, LUSTRE_OPC_ANY, NULL);
3551	if (IS_ERR(op_data)) {
3552		mutex_unlock(&lli->lli_layout_mutex);
3553		return PTR_ERR(op_data);
3554	}
3555
3556	/* have to enqueue one */
3557	memset(&it, 0, sizeof(it));
3558	it.it_op = IT_LAYOUT;
3559	lockh.cookie = 0ULL;
3560
3561	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3562			ll_get_fsname(inode->i_sb, NULL, 0), inode,
3563			PFID(&lli->lli_fid));
3564
3565	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3566			NULL, 0, NULL, 0);
3567	if (it.d.lustre.it_data != NULL)
3568		ptlrpc_req_finished(it.d.lustre.it_data);
3569	it.d.lustre.it_data = NULL;
3570
3571	ll_finish_md_op_data(op_data);
3572
3573	mode = it.d.lustre.it_lock_mode;
3574	it.d.lustre.it_lock_mode = 0;
3575	ll_intent_drop_lock(&it);
3576
3577	if (rc == 0) {
3578		/* set lock data in case this is a new lock */
3579		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3580		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3581		if (rc == -EAGAIN)
3582			goto again;
3583	}
3584	mutex_unlock(&lli->lli_layout_mutex);
3585
3586	return rc;
3587}
3588
3589/**
3590 *  This function send a restore request to the MDT
3591 */
3592int ll_layout_restore(struct inode *inode)
3593{
3594	struct hsm_user_request	*hur;
3595	int			 len, rc;
3596
3597	len = sizeof(struct hsm_user_request) +
3598	      sizeof(struct hsm_user_item);
3599	OBD_ALLOC(hur, len);
3600	if (hur == NULL)
3601		return -ENOMEM;
3602
3603	hur->hur_request.hr_action = HUA_RESTORE;
3604	hur->hur_request.hr_archive_id = 0;
3605	hur->hur_request.hr_flags = 0;
3606	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3607	       sizeof(hur->hur_user_item[0].hui_fid));
3608	hur->hur_user_item[0].hui_extent.length = -1;
3609	hur->hur_request.hr_itemcount = 1;
3610	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3611			   len, hur, NULL);
3612	OBD_FREE(hur, len);
3613	return rc;
3614}
3615