
file.c revision a720b790627c2e840f7eb58cf53fefc0428cc758
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include <lustre_dlm.h>
45#include <lustre_lite.h>
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include <lustre/ll_fiemap.h>
50
51#include "cl_object.h"
52
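/* Allocate a ll_file_data structure from its slab cache and initialize the
 * per-file-descriptor state that llite keeps for every opened file. */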
53struct ll_file_data *ll_file_data_get(void)
54{
55	struct ll_file_data *fd;
56
57	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
58	if (fd == NULL)
59		return NULL;
60	fd->fd_write_failed = false;
61	return fd;
62}
63
64static void ll_file_data_put(struct ll_file_data *fd)
65{
66	if (fd != NULL)
67		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
68}
69
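/* Copy the inode's current attributes (mode, times, size, blocks, flags) and
 * ioepoch into @op_data so they can be sent to the MDS along with @fh. */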
70void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71			  struct lustre_handle *fh)
72{
73	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74	op_data->op_attr.ia_mode = inode->i_mode;
75	op_data->op_attr.ia_atime = inode->i_atime;
76	op_data->op_attr.ia_mtime = inode->i_mtime;
77	op_data->op_attr.ia_ctime = inode->i_ctime;
78	op_data->op_attr.ia_size = i_size_read(inode);
79	op_data->op_attr_blocks = inode->i_blocks;
80	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81					ll_inode_to_ext_flags(inode->i_flags);
82	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
83	if (fh)
84		op_data->op_handle = *fh;
85	op_data->op_capa1 = ll_mdscapa_get(inode);
86
87	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88		op_data->op_bias |= MDS_DATA_MODIFIED;
89}
90
91/**
92 * Closes the IO epoch and packs all the attributes into @op_data for
93 * the CLOSE rpc.
94 */
95static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96			     struct obd_client_handle *och)
97{
98	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99					ATTR_MTIME | ATTR_MTIME_SET |
100					ATTR_CTIME | ATTR_CTIME_SET;
101
102	if (!(och->och_flags & FMODE_WRITE))
103		goto out;
104
105	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
107	else
108		ll_ioepoch_close(inode, op_data, &och, 0);
109
110out:
111	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112	ll_prep_md_op_data(op_data, inode, NULL, NULL,
113			   0, 0, LUSTRE_OPC_ANY, NULL);
114}
115
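/* Send an MDS close RPC for the open handle @och.  Handles Size-on-MDS
 * updates (-EAGAIN), clears the DATA_MODIFIED flag on success and, for an
 * HSM release (@data_version != NULL), checks that the MDT actually
 * released the file.  @och is freed unless it must wait for DONE_WRITING. */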
116static int ll_close_inode_openhandle(struct obd_export *md_exp,
117				     struct inode *inode,
118				     struct obd_client_handle *och,
119				     const __u64 *data_version)
120{
121	struct obd_export *exp = ll_i2mdexp(inode);
122	struct md_op_data *op_data;
123	struct ptlrpc_request *req = NULL;
124	struct obd_device *obd = class_exp2obd(exp);
125	int epoch_close = 1;
126	int rc;
127
128	if (obd == NULL) {
129		/*
130		 * XXX: in case of LMV, is it correct to access
131		 * ->exp_handle?
132		 */
133		CERROR("Invalid MDC connection handle "LPX64"\n",
134		       ll_i2mdexp(inode)->exp_handle.h_cookie);
135		GOTO(out, rc = 0);
136	}
137
138	OBD_ALLOC_PTR(op_data);
139	if (op_data == NULL)
140		GOTO(out, rc = -ENOMEM); /* XXX We leak openhandle and request here. */
141
142	ll_prepare_close(inode, op_data, och);
143	if (data_version != NULL) {
144		/* Passing in data_version implies release. */
145		op_data->op_bias |= MDS_HSM_RELEASE;
146		op_data->op_data_version = *data_version;
147		op_data->op_lease_handle = och->och_lease_handle;
148		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
149	}
150	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
151	rc = md_close(md_exp, op_data, och->och_mod, &req);
152	if (rc == -EAGAIN) {
153		/* This close must have closed the epoch. */
154		LASSERT(epoch_close);
155		/* MDS has instructed us to obtain the Size-on-MDS attribute
156		 * from the OSTs and send a setattr back to the MDS. */
157		rc = ll_som_update(inode, op_data);
158		if (rc) {
159			CERROR("inode %lu mdc Size-on-MDS update failed: "
160			       "rc = %d\n", inode->i_ino, rc);
161			rc = 0;
162		}
163	} else if (rc) {
164		CERROR("inode %lu mdc close failed: rc = %d\n",
165		       inode->i_ino, rc);
166	}
167
168	/* The DATA_MODIFIED flag was successfully sent on close; clear the data
169	 * modification flag. */
170	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
171		struct ll_inode_info *lli = ll_i2info(inode);
172
173		spin_lock(&lli->lli_lock);
174		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
175		spin_unlock(&lli->lli_lock);
176	}
177
178	if (rc == 0) {
179		rc = ll_objects_destroy(req, inode);
180		if (rc)
181			CERROR("inode %lu ll_objects destroy: rc = %d\n",
182			       inode->i_ino, rc);
183	}
184	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
185		struct mdt_body *body;
186		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187		if (!(body->valid & OBD_MD_FLRELEASED))
188			rc = -EBUSY;
189	}
190
191	ll_finish_md_op_data(op_data);
192
193out:
194	if (exp_connect_som(exp) && !epoch_close &&
195	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
196		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
197	} else {
198		md_clear_open_replay_data(md_exp, och);
199		/* Free @och if it is not waiting for DONE_WRITING. */
200		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
201		OBD_FREE_PTR(och);
202	}
203	if (req) /* This is close request */
204		ptlrpc_req_finished(req);
205	return rc;
206}
207
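/* Drop the MDS open handle for the given open @flags (read, write or exec)
 * once no file descriptors are using it any more. */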
208int ll_md_real_close(struct inode *inode, int flags)
209{
210	struct ll_inode_info *lli = ll_i2info(inode);
211	struct obd_client_handle **och_p;
212	struct obd_client_handle *och;
213	__u64 *och_usecount;
214	int rc = 0;
215
216	if (flags & FMODE_WRITE) {
217		och_p = &lli->lli_mds_write_och;
218		och_usecount = &lli->lli_open_fd_write_count;
219	} else if (flags & FMODE_EXEC) {
220		och_p = &lli->lli_mds_exec_och;
221		och_usecount = &lli->lli_open_fd_exec_count;
222	} else {
223		LASSERT(flags & FMODE_READ);
224		och_p = &lli->lli_mds_read_och;
225		och_usecount = &lli->lli_open_fd_read_count;
226	}
227
228	mutex_lock(&lli->lli_och_mutex);
229	if (*och_usecount) { /* There are still users of this handle, so
230				skip freeing it. */
231		mutex_unlock(&lli->lli_och_mutex);
232		return 0;
233	}
234	och = *och_p;
235	*och_p = NULL;
236	mutex_unlock(&lli->lli_och_mutex);
237
238	if (och) { /* There might be a race and somebody may have freed this
239		      och already */
240		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
241					       inode, och, NULL);
242	}
243
244	return rc;
245}
246
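/* Per-descriptor close: drop a group lock or lease if one is still held,
 * decrement the open counters and, unless a matching OPEN lock lets us skip
 * it, tell the MDS about the close via ll_md_real_close(). */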
247int ll_md_close(struct obd_export *md_exp, struct inode *inode,
248		struct file *file)
249{
250	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
251	struct ll_inode_info *lli = ll_i2info(inode);
252	int rc = 0;
253
254	/* clear group lock, if present */
255	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
257
258	if (fd->fd_lease_och != NULL) {
259		bool lease_broken;
260
261		/* Usually the lease is not released when the
262		 * application crashes, so we need to release it here. */
263		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265			PFID(&lli->lli_fid), rc, lease_broken);
266
267		fd->fd_lease_och = NULL;
268	}
269
270	if (fd->fd_och != NULL) {
271		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
272		fd->fd_och = NULL;
273		GOTO(out, rc);
274	}
275
276	/* Let's see if we have a good enough OPEN lock on the file and if
277	   we can skip talking to the MDS */
278	if (file->f_dentry->d_inode) { /* Can this ever be false? */
279		int lockmode;
280		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
281		struct lustre_handle lockh;
282		struct inode *inode = file->f_dentry->d_inode;
283		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
284
285		mutex_lock(&lli->lli_och_mutex);
286		if (fd->fd_omode & FMODE_WRITE) {
287			lockmode = LCK_CW;
288			LASSERT(lli->lli_open_fd_write_count);
289			lli->lli_open_fd_write_count--;
290		} else if (fd->fd_omode & FMODE_EXEC) {
291			lockmode = LCK_PR;
292			LASSERT(lli->lli_open_fd_exec_count);
293			lli->lli_open_fd_exec_count--;
294		} else {
295			lockmode = LCK_CR;
296			LASSERT(lli->lli_open_fd_read_count);
297			lli->lli_open_fd_read_count--;
298		}
299		mutex_unlock(&lli->lli_och_mutex);
300
301		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
302				   LDLM_IBITS, &policy, lockmode,
303				   &lockh)) {
304			rc = ll_md_real_close(file->f_dentry->d_inode,
305					      fd->fd_omode);
306		}
307	} else {
308		CERROR("Releasing a file %p with negative dentry %p. Name %s\n",
309		       file, file->f_dentry, file->f_dentry->d_name.name);
310	}
311
312out:
313	LUSTRE_FPRIVATE(file) = NULL;
314	ll_file_data_put(fd);
315	ll_capa_close(inode);
316
317	return rc;
318}
319
320/* While this returns an error code, the caller (fput()) ignores it, so we
321 * need to make every effort to clean up all of our state here.  Also,
322 * applications rarely check close errors, and even if an error is returned
323 * they will not retry the close call.
324 */
325int ll_file_release(struct inode *inode, struct file *file)
326{
327	struct ll_file_data *fd;
328	struct ll_sb_info *sbi = ll_i2sbi(inode);
329	struct ll_inode_info *lli = ll_i2info(inode);
330	int rc;
331
332	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
333	       inode->i_generation, inode);
334
335#ifdef CONFIG_FS_POSIX_ACL
336	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
337	    inode == inode->i_sb->s_root->d_inode) {
338		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
339
340		LASSERT(fd != NULL);
341		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
342			fd->fd_flags &= ~LL_FILE_RMTACL;
343			rct_del(&sbi->ll_rct, current_pid());
344			et_search_free(&sbi->ll_et, current_pid());
345		}
346	}
347#endif
348
349	if (inode->i_sb->s_root != file->f_dentry)
350		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351	fd = LUSTRE_FPRIVATE(file);
352	LASSERT(fd != NULL);
353
354	/* The last ref on @file, maybe not from the owner pid of statahead.
355	 * Different processes can open the same dir; "ll_opendir_key" means
356	 * it is this process that should stop the statahead thread. */
357	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
358	    lli->lli_opendir_pid != 0)
359		ll_stop_statahead(inode, lli->lli_opendir_key);
360
361	if (inode->i_sb->s_root == file->f_dentry) {
362		LUSTRE_FPRIVATE(file) = NULL;
363		ll_file_data_put(fd);
364		return 0;
365	}
366
367	if (!S_ISDIR(inode->i_mode)) {
368		lov_read_and_clear_async_rc(lli->lli_clob);
369		lli->lli_async_rc = 0;
370	}
371
372	rc = ll_md_close(sbi->ll_md_exp, inode, file);
373
374	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
375		libcfs_debug_dumplog();
376
377	return rc;
378}
379
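/* Send the open intent for this dentry to the MDS via md_intent_lock() and
 * update the inode from the reply. */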
380static int ll_intent_file_open(struct file *file, void *lmm,
381			       int lmmsize, struct lookup_intent *itp)
382{
383	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
384	struct dentry *parent = file->f_dentry->d_parent;
385	const char *name = file->f_dentry->d_name.name;
386	const int len = file->f_dentry->d_name.len;
387	struct md_op_data *op_data;
388	struct ptlrpc_request *req;
389	__u32 opc = LUSTRE_OPC_ANY;
390	int rc;
391
392	if (!parent)
393		return -ENOENT;
394
395	/* Usually we come here only for NFSD, and we want the open lock.
396	   But we can also get here with pre-2.6.15 patchless kernels, and in
397	   that case that lock is also OK */
398	/* We can also get here if there was a cached open handle in revalidate_it
399	 * but it disappeared while we were getting from there to ll_file_open.
400	 * But this means this file was closed and immediately opened, which
401	 * makes it a good candidate for using the OPEN lock */
402	/* If lmmsize & lmm are not 0, we are just setting stripe info
403	 * parameters. No need for the open lock */
404	if (lmm == NULL && lmmsize == 0) {
405		itp->it_flags |= MDS_OPEN_LOCK;
406		if (itp->it_flags & FMODE_WRITE)
407			opc = LUSTRE_OPC_CREATE;
408	}
409
410	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
411				      file->f_dentry->d_inode, name, len,
412				      O_RDWR, opc, NULL);
413	if (IS_ERR(op_data))
414		return PTR_ERR(op_data);
415
416	itp->it_flags |= MDS_OPEN_BY_FID;
417	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
418			    0 /*unused */, &req, ll_md_blocking_ast, 0);
419	ll_finish_md_op_data(op_data);
420	if (rc == -ESTALE) {
421		/* reason to keep its own exit path - don't flood the log
422		 * with -ESTALE error messages.
423		 */
424		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
425		     it_open_error(DISP_OPEN_OPEN, itp))
426			GOTO(out, rc);
427		ll_release_openhandle(file->f_dentry, itp);
428		GOTO(out, rc);
429	}
430
431	if (it_disposition(itp, DISP_LOOKUP_NEG))
432		GOTO(out, rc = -ENOENT);
433
434	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
435		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
436		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
437		GOTO(out, rc);
438	}
439
440	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
441	if (!rc && itp->d.lustre.it_lock_mode)
442		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
443				 itp, NULL);
444
445out:
446	ptlrpc_req_finished(itp->d.lustre.it_data);
447	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
448	ll_intent_drop_lock(itp);
449
450	return rc;
451}
452
453/**
454 * Assign an obtained @ioepoch to the client's inode. No lock is needed; the
455 * MDS does not believe attributes if multiple ioepoch holders exist. The MDS
456 * also skips attributes for a previous ioepoch if a new one is opened.
457 */
458void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
459{
460	if (ioepoch && lli->lli_ioepoch != ioepoch) {
461		lli->lli_ioepoch = ioepoch;
462		CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
463		       ioepoch, PFID(&lli->lli_fid));
464	}
465}
466
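/* Fill @och from the MDT reply carried by the intent and register it for
 * open replay. */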
467static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
468		       struct obd_client_handle *och)
469{
470	struct ptlrpc_request *req = it->d.lustre.it_data;
471	struct mdt_body *body;
472
473	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
474	och->och_fh = body->handle;
475	och->och_fid = body->fid1;
476	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
477	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
478	och->och_flags = it->it_flags;
479
480	return md_set_open_replay_data(md_exp, och, req);
481}
482
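/* Finish the client-side part of an open: record the ioepoch from the reply
 * (if an open handle was obtained), attach @fd to the file and set up
 * readahead state. */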
483int ll_local_open(struct file *file, struct lookup_intent *it,
484		  struct ll_file_data *fd, struct obd_client_handle *och)
485{
486	struct inode *inode = file->f_dentry->d_inode;
487	struct ll_inode_info *lli = ll_i2info(inode);
488
489	LASSERT(!LUSTRE_FPRIVATE(file));
490
491	LASSERT(fd != NULL);
492
493	if (och) {
494		struct ptlrpc_request *req = it->d.lustre.it_data;
495		struct mdt_body *body;
496		int rc;
497
498		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
499		if (rc != 0)
500			return rc;
501
502		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
503		ll_ioepoch_open(lli, body->ioepoch);
504	}
505
506	LUSTRE_FPRIVATE(file) = fd;
507	ll_readahead_init(inode, &fd->fd_ras);
508	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
509	return 0;
510}
511
512/* Open a file, and (for the very first open) create objects on the OSTs at
513 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
514 * creation or open until ll_lov_setstripe() ioctl is called.
515 *
516 * If we already have the stripe MD locally then we don't request it in
517 * md_open(), by passing a lmm_size = 0.
518 *
519 * It is up to the application to ensure no other processes open this file
520 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
521 * used.  We might be able to avoid races of that sort by getting lli_open_sem
522 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
523 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
524 */
525int ll_file_open(struct inode *inode, struct file *file)
526{
527	struct ll_inode_info *lli = ll_i2info(inode);
528	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
529					  .it_flags = file->f_flags };
530	struct obd_client_handle **och_p = NULL;
531	__u64 *och_usecount = NULL;
532	struct ll_file_data *fd;
533	int rc = 0, opendir_set = 0;
534
535	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
536	       inode->i_generation, inode, file->f_flags);
537
538	it = file->private_data; /* XXX: compat macro */
539	file->private_data = NULL; /* prevent ll_local_open assertion */
540
541	fd = ll_file_data_get();
542	if (fd == NULL)
543		GOTO(out_openerr, rc = -ENOMEM);
544
545	fd->fd_file = file;
546	if (S_ISDIR(inode->i_mode)) {
547		spin_lock(&lli->lli_sa_lock);
548		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
549		    lli->lli_opendir_pid == 0) {
550			lli->lli_opendir_key = fd;
551			lli->lli_opendir_pid = current_pid();
552			opendir_set = 1;
553		}
554		spin_unlock(&lli->lli_sa_lock);
555	}
556
557	if (inode->i_sb->s_root == file->f_dentry) {
558		LUSTRE_FPRIVATE(file) = fd;
559		return 0;
560	}
561
562	if (!it || !it->d.lustre.it_disposition) {
563		/* Convert f_flags into access mode. We cannot use file->f_mode,
564		 * because everything but the O_ACCMODE mask was stripped from
565		 * there */
566		if ((oit.it_flags + 1) & O_ACCMODE)
567			oit.it_flags++;
568		if (file->f_flags & O_TRUNC)
569			oit.it_flags |= FMODE_WRITE;
570
571		/* The kernel only calls f_op->open in dentry_open.  filp_open calls
572		 * dentry_open after a call to open_namei that checks permissions.
573		 * Only nfsd_open calls dentry_open directly without checking
574		 * permissions, and because of that the code below is safe. */
575		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
576			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
577
578		/* We do not want O_EXCL here, presumably we opened the file
579		 * already? XXX - NFS implications? */
580		oit.it_flags &= ~O_EXCL;
581
582		/* bug20584, if "it_flags" contains O_CREAT, the file will be
583		 * created if necessary, then "IT_CREAT" should be set to keep
584		 * consistent with it */
585		if (oit.it_flags & O_CREAT)
586			oit.it_op |= IT_CREAT;
587
588		it = &oit;
589	}
590
591restart:
592	/* Let's see if we have file open on MDS already. */
593	if (it->it_flags & FMODE_WRITE) {
594		och_p = &lli->lli_mds_write_och;
595		och_usecount = &lli->lli_open_fd_write_count;
596	} else if (it->it_flags & FMODE_EXEC) {
597		och_p = &lli->lli_mds_exec_och;
598		och_usecount = &lli->lli_open_fd_exec_count;
599	} else {
600		och_p = &lli->lli_mds_read_och;
601		och_usecount = &lli->lli_open_fd_read_count;
602	}
603
604	mutex_lock(&lli->lli_och_mutex);
605	if (*och_p) { /* Open handle is present */
606		if (it_disposition(it, DISP_OPEN_OPEN)) {
607			/* Well, there's an extra open request that we do not need;
608			   let's close it somehow. This will decref the request. */
609			rc = it_open_error(DISP_OPEN_OPEN, it);
610			if (rc) {
611				mutex_unlock(&lli->lli_och_mutex);
612				GOTO(out_openerr, rc);
613			}
614
615			ll_release_openhandle(file->f_dentry, it);
616		}
617		(*och_usecount)++;
618
619		rc = ll_local_open(file, it, fd, NULL);
620		if (rc) {
621			(*och_usecount)--;
622			mutex_unlock(&lli->lli_och_mutex);
623			GOTO(out_openerr, rc);
624		}
625	} else {
626		LASSERT(*och_usecount == 0);
627		if (!it->d.lustre.it_disposition) {
628			/* We cannot just request a lock handle now; the new ELC
629			   code means that one of the other OPEN locks for this
630			   file could be cancelled, and since the blocking AST
631			   handler would attempt to grab och_mutex as well, that
632			   would result in a deadlock */
633			mutex_unlock(&lli->lli_och_mutex);
634			it->it_create_mode |= M_CHECK_STALE;
635			rc = ll_intent_file_open(file, NULL, 0, it);
636			it->it_create_mode &= ~M_CHECK_STALE;
637			if (rc)
638				GOTO(out_openerr, rc);
639
640			goto restart;
641		}
642		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
643		if (!*och_p)
644			GOTO(out_och_free, rc = -ENOMEM);
645
646		(*och_usecount)++;
647
648		/* md_intent_lock() didn't get a request ref if there was an
649		 * open error, so don't do cleanup on the request here
650		 * (bug 3430) */
651		/* XXX (green): Shouldn't we bail out on any error here, not
652		 * just an open error? */
653		rc = it_open_error(DISP_OPEN_OPEN, it);
654		if (rc)
655			GOTO(out_och_free, rc);
656
657		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
658
659		rc = ll_local_open(file, it, fd, *och_p);
660		if (rc)
661			GOTO(out_och_free, rc);
662	}
663	mutex_unlock(&lli->lli_och_mutex);
664	fd = NULL;
665
666	/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
667	   a different kind of OPEN lock for this same inode gets cancelled
668	   by ldlm_cancel_lru */
669	if (!S_ISREG(inode->i_mode))
670		GOTO(out_och_free, rc);
671
672	ll_capa_open(inode);
673
674	if (!lli->lli_has_smd) {
675		if (file->f_flags & O_LOV_DELAY_CREATE ||
676		    !(file->f_mode & FMODE_WRITE)) {
677			CDEBUG(D_INODE, "object creation was delayed\n");
678			GOTO(out_och_free, rc);
679		}
680	}
681	file->f_flags &= ~O_LOV_DELAY_CREATE;
682	GOTO(out_och_free, rc);
683
684out_och_free:
685	if (rc) {
686		if (och_p && *och_p) {
687			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
688			*och_p = NULL; /* OBD_FREE writes some magic there */
689			(*och_usecount)--;
690		}
691		mutex_unlock(&lli->lli_och_mutex);
692
693out_openerr:
694		if (opendir_set != 0)
695			ll_stop_statahead(inode, lli->lli_opendir_key);
696		if (fd != NULL)
697			ll_file_data_put(fd);
698	} else {
699		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
700	}
701
702	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
703		ptlrpc_req_finished(it->d.lustre.it_data);
704		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
705	}
706
707	return rc;
708}
709
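/* Blocking AST for lease locks: cancel the lease lock asynchronously when a
 * conflicting request arrives; nothing to do at cancel time. */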
710static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
711			struct ldlm_lock_desc *desc, void *data, int flag)
712{
713	int rc;
714	struct lustre_handle lockh;
715
716	switch (flag) {
717	case LDLM_CB_BLOCKING:
718		ldlm_lock2handle(lock, &lockh);
719		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
720		if (rc < 0) {
721			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
722			return rc;
723		}
724		break;
725	case LDLM_CB_CANCELING:
726		/* do nothing */
727		break;
728	}
729	return 0;
730}
731
732/**
733 * Acquire a lease and open the file.
734 */
735struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
736					fmode_t fmode, __u64 open_flags)
737{
738	struct lookup_intent it = { .it_op = IT_OPEN };
739	struct ll_sb_info *sbi = ll_i2sbi(inode);
740	struct md_op_data *op_data;
741	struct ptlrpc_request *req;
742	struct lustre_handle old_handle = { 0 };
743	struct obd_client_handle *och = NULL;
744	int rc;
745	int rc2;
746
747	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
748		return ERR_PTR(-EINVAL);
749
750	if (file != NULL) {
751		struct ll_inode_info *lli = ll_i2info(inode);
752		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753		struct obd_client_handle **och_p;
754		__u64 *och_usecount;
755
756		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
757			return ERR_PTR(-EPERM);
758
759		/* Get the openhandle of the file */
760		rc = -EBUSY;
761		mutex_lock(&lli->lli_och_mutex);
762		if (fd->fd_lease_och != NULL) {
763			mutex_unlock(&lli->lli_och_mutex);
764			return ERR_PTR(rc);
765		}
766
767		if (fd->fd_och == NULL) {
768			if (file->f_mode & FMODE_WRITE) {
769				LASSERT(lli->lli_mds_write_och != NULL);
770				och_p = &lli->lli_mds_write_och;
771				och_usecount = &lli->lli_open_fd_write_count;
772			} else {
773				LASSERT(lli->lli_mds_read_och != NULL);
774				och_p = &lli->lli_mds_read_och;
775				och_usecount = &lli->lli_open_fd_read_count;
776			}
777			if (*och_usecount == 1) {
778				fd->fd_och = *och_p;
779				*och_p = NULL;
780				*och_usecount = 0;
781				rc = 0;
782			}
783		}
784		mutex_unlock(&lli->lli_och_mutex);
785		if (rc < 0) /* more than 1 opener */
786			return ERR_PTR(rc);
787
788		LASSERT(fd->fd_och != NULL);
789		old_handle = fd->fd_och->och_fh;
790	}
791
792	OBD_ALLOC_PTR(och);
793	if (och == NULL)
794		return ERR_PTR(-ENOMEM);
795
796	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
797					LUSTRE_OPC_ANY, NULL);
798	if (IS_ERR(op_data))
799		GOTO(out, rc = PTR_ERR(op_data));
800
801	/* To tell the MDT this openhandle is from the same owner */
802	op_data->op_handle = old_handle;
803
804	it.it_flags = fmode | open_flags;
805	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
806	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
807				ll_md_blocking_lease_ast,
808	/* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
809	 * it can be cancelled, which may mislead applications into thinking the
810	 * lease is broken;
811	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
812	 * open in ll_md_blocking_ast(). Otherwise, as ll_md_blocking_lease_ast()
813	 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
814				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
815	ll_finish_md_op_data(op_data);
816	if (req != NULL) {
817		ptlrpc_req_finished(req);
818		it_clear_disposition(&it, DISP_ENQ_COMPLETE);
819	}
820	if (rc < 0)
821		GOTO(out_release_it, rc);
822
823	if (it_disposition(&it, DISP_LOOKUP_NEG))
824		GOTO(out_release_it, rc = -ENOENT);
825
826	rc = it_open_error(DISP_OPEN_OPEN, &it);
827	if (rc)
828		GOTO(out_release_it, rc);
829
830	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
831	ll_och_fill(sbi->ll_md_exp, &it, och);
832
833	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
834		GOTO(out_close, rc = -EOPNOTSUPP);
835
836	/* lease already obtained, handle the lease lock */
837	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
838	if (it.d.lustre.it_lock_mode == 0 ||
839	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
840		/* an open lock must be returned for a lease */
841		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
842			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
843			it.d.lustre.it_lock_bits);
844		GOTO(out_close, rc = -EPROTO);
845	}
846
847	ll_intent_release(&it);
848	return och;
849
850out_close:
851	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
852	if (rc2)
853		CERROR("Close openhandle returned %d\n", rc2);
854
855	/* cancel open lock */
856	if (it.d.lustre.it_lock_mode != 0) {
857		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
858						it.d.lustre.it_lock_mode);
859		it.d.lustre.it_lock_mode = 0;
860	}
861out_release_it:
862	ll_intent_release(&it);
863out:
864	OBD_FREE_PTR(och);
865	return ERR_PTR(rc);
866}
867EXPORT_SYMBOL(ll_lease_open);
868
869/**
870 * Release the lease and close the file.
871 * It will check whether the lease has ever been broken.
872 */
873int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
874			bool *lease_broken)
875{
876	struct ldlm_lock *lock;
877	bool cancelled = true;
878	int rc;
879
880	lock = ldlm_handle2lock(&och->och_lease_handle);
881	if (lock != NULL) {
882		lock_res_and_lock(lock);
883		cancelled = ldlm_is_cancel(lock);
884		unlock_res_and_lock(lock);
885		ldlm_lock_put(lock);
886	}
887
888	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
889		PFID(&ll_i2info(inode)->lli_fid), cancelled);
890
891	if (!cancelled)
892		ldlm_cli_cancel(&och->och_lease_handle, 0);
893	if (lease_broken != NULL)
894		*lease_broken = cancelled;
895
896	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
897				       NULL);
898	return rc;
899}
900EXPORT_SYMBOL(ll_lease_close);
901
902/* Fills the obdo with the attributes for the lsm */
903static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
904			  struct obd_capa *capa, struct obdo *obdo,
905			  __u64 ioepoch, int sync)
906{
907	struct ptlrpc_request_set *set;
908	struct obd_info	    oinfo = { { { 0 } } };
909	int			rc;
910
911	LASSERT(lsm != NULL);
912
913	oinfo.oi_md = lsm;
914	oinfo.oi_oa = obdo;
915	oinfo.oi_oa->o_oi = lsm->lsm_oi;
916	oinfo.oi_oa->o_mode = S_IFREG;
917	oinfo.oi_oa->o_ioepoch = ioepoch;
918	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
919			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
920			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
921			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
922			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
923			       OBD_MD_FLDATAVERSION;
924	oinfo.oi_capa = capa;
925	if (sync) {
926		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
927		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
928	}
929
930	set = ptlrpc_prep_set();
931	if (set == NULL) {
932		CERROR("can't allocate ptlrpc set\n");
933		rc = -ENOMEM;
934	} else {
935		rc = obd_getattr_async(exp, &oinfo, set);
936		if (rc == 0)
937			rc = ptlrpc_set_wait(set);
938		ptlrpc_set_destroy(set);
939	}
940	if (rc == 0)
941		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
942					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
943					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
944					 OBD_MD_FLDATAVERSION);
945	return rc;
946}
947
948/**
949 * Performs the getattr on the inode and updates its fields.
950 * If @sync != 0, perform the getattr under the server-side lock.
951 */
952int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
953		     __u64 ioepoch, int sync)
954{
955	struct obd_capa      *capa = ll_mdscapa_get(inode);
956	struct lov_stripe_md *lsm;
957	int rc;
958
959	lsm = ccc_inode_lsm_get(inode);
960	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
961			    capa, obdo, ioepoch, sync);
962	capa_put(capa);
963	if (rc == 0) {
964		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
965
966		obdo_refresh_inode(inode, obdo, obdo->o_valid);
967		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
968		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
969		       (unsigned long long)inode->i_blocks,
970		       (unsigned long)ll_inode_blksize(inode));
971	}
972	ccc_inode_lsm_put(inode, lsm);
973	return rc;
974}
975
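/* Merge the size, blocks and timestamps cached from the MDS with the
 * attributes obtained from the OSTs and store the result in the inode. */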
976int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
977{
978	struct ll_inode_info *lli = ll_i2info(inode);
979	struct cl_object *obj = lli->lli_clob;
980	struct cl_attr *attr = ccc_env_thread_attr(env);
981	struct ost_lvb lvb;
982	int rc = 0;
983
984	ll_inode_size_lock(inode);
985	/* merge the timestamps most recently obtained from the MDS with
986	   timestamps obtained from the OSTs */
987	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
988	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
989	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
990	inode_init_lvb(inode, &lvb);
991
992	cl_object_attr_lock(obj);
993	rc = cl_object_attr_get(env, obj, attr);
994	cl_object_attr_unlock(obj);
995
996	if (rc == 0) {
997		if (lvb.lvb_atime < attr->cat_atime)
998			lvb.lvb_atime = attr->cat_atime;
999		if (lvb.lvb_ctime < attr->cat_ctime)
1000			lvb.lvb_ctime = attr->cat_ctime;
1001		if (lvb.lvb_mtime < attr->cat_mtime)
1002			lvb.lvb_mtime = attr->cat_mtime;
1003
1004		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1005				PFID(&lli->lli_fid), attr->cat_size);
1006		cl_isize_write_nolock(inode, attr->cat_size);
1007
1008		inode->i_blocks = attr->cat_blocks;
1009
1010		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1011		LTIME_S(inode->i_atime) = lvb.lvb_atime;
1012		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1013	}
1014	ll_inode_size_unlock(inode);
1015
1016	return rc;
1017}
1018
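/* Fill a stat structure with the size, blocks and timestamps obtained from
 * the OSTs for the given striping. */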
1019int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1020		     lstat_t *st)
1021{
1022	struct obdo obdo = { 0 };
1023	int rc;
1024
1025	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1026	if (rc == 0) {
1027		st->st_size   = obdo.o_size;
1028		st->st_blocks = obdo.o_blocks;
1029		st->st_mtime  = obdo.o_mtime;
1030		st->st_atime  = obdo.o_atime;
1031		st->st_ctime  = obdo.o_ctime;
1032	}
1033	return rc;
1034}
1035
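/* Initialize a cl_io from the file's open flags: non-blocking, append and
 * sync behaviour for writes, and the lock requirement mode. */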
1036void ll_io_init(struct cl_io *io, const struct file *file, int write)
1037{
1038	struct inode *inode = file->f_dentry->d_inode;
1039
1040	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1041	if (write) {
1042		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1043		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1044				      file->f_flags & O_DIRECT ||
1045				      IS_SYNC(inode);
1046	}
1047	io->ci_obj     = ll_i2info(inode)->lli_clob;
1048	io->ci_lockreq = CILR_MAYBE;
1049	if (ll_file_nolock(file)) {
1050		io->ci_lockreq = CILR_NEVER;
1051		io->ci_no_srvlock = 1;
1052	} else if (file->f_flags & O_APPEND) {
1053		io->ci_lockreq = CILR_MANDATORY;
1054	}
1055}
1056
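/* Common entry point for reads and writes: set up and run a cl_io loop for
 * the requested range, restart it when the lower layers ask for it, and
 * account the bytes transferred in the llite stats. */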
1057static ssize_t
1058ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1059		   struct file *file, enum cl_io_type iot,
1060		   loff_t *ppos, size_t count)
1061{
1062	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1063	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1064	struct cl_io	 *io;
1065	ssize_t	       result;
1066
1067restart:
1068	io = ccc_env_thread_io(env);
1069	ll_io_init(io, file, iot == CIT_WRITE);
1070
1071	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1072		struct vvp_io *vio = vvp_env_io(env);
1073		struct ccc_io *cio = ccc_env_io(env);
1074		int write_mutex_locked = 0;
1075
1076		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1077		vio->cui_io_subtype = args->via_io_subtype;
1078
1079		switch (vio->cui_io_subtype) {
1080		case IO_NORMAL:
1081			cio->cui_iov = args->u.normal.via_iov;
1082			cio->cui_nrsegs = args->u.normal.via_nrsegs;
1083			cio->cui_tot_nrsegs = cio->cui_nrsegs;
1084			cio->cui_iocb = args->u.normal.via_iocb;
1085			if ((iot == CIT_WRITE) &&
1086			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1087				if (mutex_lock_interruptible(&lli->
1088							       lli_write_mutex))
1089					GOTO(out, result = -ERESTARTSYS);
1090				write_mutex_locked = 1;
1091			} else if (iot == CIT_READ) {
1092				down_read(&lli->lli_trunc_sem);
1093			}
1094			break;
1095		case IO_SENDFILE:
1096			vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1097			vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1098			break;
1099		case IO_SPLICE:
1100			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1101			vio->u.splice.cui_flags = args->u.splice.via_flags;
1102			break;
1103		default:
1104			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1105			LBUG();
1106		}
1107		result = cl_io_loop(env, io);
1108		if (write_mutex_locked)
1109			mutex_unlock(&lli->lli_write_mutex);
1110		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1111			up_read(&lli->lli_trunc_sem);
1112	} else {
1113		/* cl_io_rw_init() handled IO */
1114		result = io->ci_result;
1115	}
1116
1117	if (io->ci_nob > 0) {
1118		result = io->ci_nob;
1119		*ppos = io->u.ci_wr.wr.crw_pos;
1120	}
1121	GOTO(out, result);
1122out:
1123	cl_io_fini(env, io);
1124	/* If any bytes have been read/written (result != 0), we just return a
1125	 * short read/write instead of restarting the io. */
1126	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1127		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1128		       iot == CIT_READ ? "read" : "write",
1129		       file->f_dentry->d_name.name, *ppos, count);
1130		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1131		goto restart;
1132	}
1133
1134	if (iot == CIT_READ) {
1135		if (result >= 0)
1136			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1137					   LPROC_LL_READ_BYTES, result);
1138	} else if (iot == CIT_WRITE) {
1139		if (result >= 0) {
1140			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1141					   LPROC_LL_WRITE_BYTES, result);
1142			fd->fd_write_failed = false;
1143		} else if (result != -ERESTARTSYS) {
1144			fd->fd_write_failed = true;
1145		}
1146	}
1147
1148	return result;
1149}
1150
1151static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1152				unsigned long nr_segs, loff_t pos)
1153{
1154	struct lu_env      *env;
1155	struct vvp_io_args *args;
1156	size_t	      count = 0;
1157	ssize_t	     result;
1158	int		 refcheck;
1159
1160	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1161	if (result)
1162		return result;
1163
1164	env = cl_env_get(&refcheck);
1165	if (IS_ERR(env))
1166		return PTR_ERR(env);
1167
1168	args = vvp_env_args(env, IO_NORMAL);
1169	args->u.normal.via_iov = (struct iovec *)iov;
1170	args->u.normal.via_nrsegs = nr_segs;
1171	args->u.normal.via_iocb = iocb;
1172
1173	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1174				    &iocb->ki_pos, count);
1175	cl_env_put(env, &refcheck);
1176	return result;
1177}
1178
1179static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1180			    loff_t *ppos)
1181{
1182	struct lu_env *env;
1183	struct iovec  *local_iov;
1184	struct kiocb  *kiocb;
1185	ssize_t	result;
1186	int	    refcheck;
1187
1188	env = cl_env_get(&refcheck);
1189	if (IS_ERR(env))
1190		return PTR_ERR(env);
1191
1192	local_iov = &vvp_env_info(env)->vti_local_iov;
1193	kiocb = &vvp_env_info(env)->vti_kiocb;
1194	local_iov->iov_base = (void __user *)buf;
1195	local_iov->iov_len = count;
1196	init_sync_kiocb(kiocb, file);
1197	kiocb->ki_pos = *ppos;
1198	kiocb->ki_nbytes = count;
1199
1200	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1201	*ppos = kiocb->ki_pos;
1202
1203	cl_env_put(env, &refcheck);
1204	return result;
1205}
1206
1207/*
1208 * Write to a file (through the page cache).
1209 */
1210static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1211				 unsigned long nr_segs, loff_t pos)
1212{
1213	struct lu_env      *env;
1214	struct vvp_io_args *args;
1215	size_t	      count = 0;
1216	ssize_t	     result;
1217	int		 refcheck;
1218
1219	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
1220	if (result)
1221		return result;
1222
1223	env = cl_env_get(&refcheck);
1224	if (IS_ERR(env))
1225		return PTR_ERR(env);
1226
1227	args = vvp_env_args(env, IO_NORMAL);
1228	args->u.normal.via_iov = (struct iovec *)iov;
1229	args->u.normal.via_nrsegs = nr_segs;
1230	args->u.normal.via_iocb = iocb;
1231
1232	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1233				  &iocb->ki_pos, count);
1234	cl_env_put(env, &refcheck);
1235	return result;
1236}
1237
1238static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1239			     loff_t *ppos)
1240{
1241	struct lu_env *env;
1242	struct iovec  *local_iov;
1243	struct kiocb  *kiocb;
1244	ssize_t	result;
1245	int	    refcheck;
1246
1247	env = cl_env_get(&refcheck);
1248	if (IS_ERR(env))
1249		return PTR_ERR(env);
1250
1251	local_iov = &vvp_env_info(env)->vti_local_iov;
1252	kiocb = &vvp_env_info(env)->vti_kiocb;
1253	local_iov->iov_base = (void __user *)buf;
1254	local_iov->iov_len = count;
1255	init_sync_kiocb(kiocb, file);
1256	kiocb->ki_pos = *ppos;
1257	kiocb->ki_nbytes = count;
1258
1259	result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1260	*ppos = kiocb->ki_pos;
1261
1262	cl_env_put(env, &refcheck);
1263	return result;
1264}
1265
1266
1267
1268/*
1269 * Send file content (through the pagecache) somewhere with a helper
1270 */
1271static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1272				   struct pipe_inode_info *pipe, size_t count,
1273				   unsigned int flags)
1274{
1275	struct lu_env      *env;
1276	struct vvp_io_args *args;
1277	ssize_t	     result;
1278	int		 refcheck;
1279
1280	env = cl_env_get(&refcheck);
1281	if (IS_ERR(env))
1282		return PTR_ERR(env);
1283
1284	args = vvp_env_args(env, IO_SPLICE);
1285	args->u.splice.via_pipe = pipe;
1286	args->u.splice.via_flags = flags;
1287
1288	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1289	cl_env_put(env, &refcheck);
1290	return result;
1291}
1292
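/* Ask the OSTs to recreate the objects described by the file's striping
 * (used by the recreate-object handlers below). */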
1293static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1294			   obd_count ost_idx)
1295{
1296	struct obd_export *exp = ll_i2dtexp(inode);
1297	struct obd_trans_info oti = { 0 };
1298	struct obdo *oa = NULL;
1299	int lsm_size;
1300	int rc = 0;
1301	struct lov_stripe_md *lsm = NULL, *lsm2;
1302
1303	OBDO_ALLOC(oa);
1304	if (oa == NULL)
1305		return -ENOMEM;
1306
1307	lsm = ccc_inode_lsm_get(inode);
1308	if (!lsm_has_objects(lsm))
1309		GOTO(out, rc = -ENOENT);
1310
1311	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1312		   (lsm->lsm_stripe_count));
1313
1314	OBD_ALLOC_LARGE(lsm2, lsm_size);
1315	if (lsm2 == NULL)
1316		GOTO(out, rc = -ENOMEM);
1317
1318	oa->o_oi = *oi;
1319	oa->o_nlink = ost_idx;
1320	oa->o_flags |= OBD_FL_RECREATE_OBJS;
1321	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1322	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1323				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1324	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1325	memcpy(lsm2, lsm, lsm_size);
1326	ll_inode_size_lock(inode);
1327	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1328	ll_inode_size_unlock(inode);
1329
1330	OBD_FREE_LARGE(lsm2, lsm_size);
1331	GOTO(out, rc);
1332out:
1333	ccc_inode_lsm_put(inode, lsm);
1334	OBDO_FREE(oa);
1335	return rc;
1336}
1337
1338static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1339{
1340	struct ll_recreate_obj ucreat;
1341	struct ost_id		oi;
1342
1343	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1344		return -EPERM;
1345
1346	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1347			   sizeof(ucreat)))
1348		return -EFAULT;
1349
1350	ostid_set_seq_mdt0(&oi);
1351	ostid_set_id(&oi, ucreat.lrc_id);
1352	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1353}
1354
1355static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1356{
1357	struct lu_fid	fid;
1358	struct ost_id	oi;
1359	obd_count	ost_idx;
1360
1361	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1362		return -EPERM;
1363
1364	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1365		return -EFAULT;
1366
1367	fid_to_ostid(&fid, &oi);
1368	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1369	return ll_lov_recreate(inode, &oi, ost_idx);
1370}
1371
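/* Set the striping for a file that does not yet have one by opening it with
 * the supplied lov_user_md as the intent EA; fails with -EEXIST if a layout
 * is already present. */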
1372int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1373			     int flags, struct lov_user_md *lum, int lum_size)
1374{
1375	struct lov_stripe_md *lsm = NULL;
1376	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1377	int rc = 0;
1378
1379	lsm = ccc_inode_lsm_get(inode);
1380	if (lsm != NULL) {
1381		ccc_inode_lsm_put(inode, lsm);
1382		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1383		       inode->i_ino);
1384		return -EEXIST;
1385	}
1386
1387	ll_inode_size_lock(inode);
1388	rc = ll_intent_file_open(file, lum, lum_size, &oit);
1389	if (rc)
1390		GOTO(out, rc);
1391	rc = oit.d.lustre.it_status;
1392	if (rc < 0)
1393		GOTO(out_req_free, rc);
1394
1395	ll_release_openhandle(file->f_dentry, &oit);
1396
1397 out:
1398	ll_inode_size_unlock(inode);
1399	ll_intent_release(&oit);
1400	ccc_inode_lsm_put(inode, lsm);
1401	return rc;
1402out_req_free:
1403	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1404	goto out;
1405}
1406
1407int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1408			     struct lov_mds_md **lmmp, int *lmm_size,
1409			     struct ptlrpc_request **request)
1410{
1411	struct ll_sb_info *sbi = ll_i2sbi(inode);
1412	struct mdt_body  *body;
1413	struct lov_mds_md *lmm = NULL;
1414	struct ptlrpc_request *req = NULL;
1415	struct md_op_data *op_data;
1416	int rc, lmmsize;
1417
1418	rc = ll_get_max_mdsize(sbi, &lmmsize);
1419	if (rc)
1420		return rc;
1421
1422	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1423				     strlen(filename), lmmsize,
1424				     LUSTRE_OPC_ANY, NULL);
1425	if (IS_ERR(op_data))
1426		return PTR_ERR(op_data);
1427
1428	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1429	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1430	ll_finish_md_op_data(op_data);
1431	if (rc < 0) {
1432		CDEBUG(D_INFO, "md_getattr_name failed "
1433		       "on %s: rc %d\n", filename, rc);
1434		GOTO(out, rc);
1435	}
1436
1437	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1438	LASSERT(body != NULL); /* checked by mdc_getattr_name */
1439
1440	lmmsize = body->eadatasize;
1441
1442	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1443			lmmsize == 0) {
1444		GOTO(out, rc = -ENODATA);
1445	}
1446
1447	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1448	LASSERT(lmm != NULL);
1449
1450	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1451	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1452		GOTO(out, rc = -EPROTO);
1453	}
1454
1455	/*
1456	 * This is coming from the MDS, so is probably in
1457	 * little endian.  We convert it to host endian before
1458	 * passing it to userspace.
1459	 */
1460	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1461		int stripe_count;
1462
1463		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1464		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1465			stripe_count = 0;
1466
1467		/* if the function is called for a directory, we should
1468		 * avoid swabbing non-existent lsm objects */
1469		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1470			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1471			if (S_ISREG(body->mode))
1472				lustre_swab_lov_user_md_objects(
1473				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1474				 stripe_count);
1475		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1476			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1477			if (S_ISREG(body->mode))
1478				lustre_swab_lov_user_md_objects(
1479				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1480				 stripe_count);
1481		}
1482	}
1483
1484out:
1485	*lmmp = lmm;
1486	*lmm_size = lmmsize;
1487	*request = req;
1488	return rc;
1489}
1490
1491static int ll_lov_setea(struct inode *inode, struct file *file,
1492			    unsigned long arg)
1493{
1494	int			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1495	struct lov_user_md	*lump;
1496	int			 lum_size = sizeof(struct lov_user_md) +
1497					    sizeof(struct lov_user_ost_data);
1498	int			 rc;
1499
1500	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1501		return -EPERM;
1502
1503	OBD_ALLOC_LARGE(lump, lum_size);
1504	if (lump == NULL)
1505		return -ENOMEM;
1506
1507	if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1508		OBD_FREE_LARGE(lump, lum_size);
1509		return -EFAULT;
1510	}
1511
1512	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1513
1514	OBD_FREE_LARGE(lump, lum_size);
1515	return rc;
1516}
1517
1518static int ll_lov_setstripe(struct inode *inode, struct file *file,
1519			    unsigned long arg)
1520{
1521	struct lov_user_md_v3	 lumv3;
1522	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
1523	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
1524	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
1525	int			 lum_size, rc;
1526	int			 flags = FMODE_WRITE;
1527
1528	/* first try with v1 which is smaller than v3 */
1529	lum_size = sizeof(struct lov_user_md_v1);
1530	if (copy_from_user(lumv1, lumv1p, lum_size))
1531		return -EFAULT;
1532
1533	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1534		lum_size = sizeof(struct lov_user_md_v3);
1535		if (copy_from_user(&lumv3, lumv3p, lum_size))
1536			return -EFAULT;
1537	}
1538
1539	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1540	if (rc == 0) {
1541		struct lov_stripe_md *lsm;
1542		__u32 gen;
1543
1544		put_user(0, &lumv1p->lmm_stripe_count);
1545
1546		ll_layout_refresh(inode, &gen);
1547		lsm = ccc_inode_lsm_get(inode);
1548		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1549				   0, lsm, (void *)arg);
1550		ccc_inode_lsm_put(inode, lsm);
1551	}
1552	return rc;
1553}
1554
1555static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1556{
1557	struct lov_stripe_md *lsm;
1558	int rc = -ENODATA;
1559
1560	lsm = ccc_inode_lsm_get(inode);
1561	if (lsm != NULL)
1562		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1563				   lsm, (void *)arg);
1564	ccc_inode_lsm_put(inode, lsm);
1565	return rc;
1566}
1567
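/* Take a client group lock (gid in @arg) on the file's cl_object and record
 * it in the file descriptor; only one group lock may be held per descriptor. */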
1568int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1569{
1570	struct ll_inode_info   *lli = ll_i2info(inode);
1571	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1572	struct ccc_grouplock    grouplock;
1573	int		     rc;
1574
1575	if (ll_file_nolock(file))
1576		return -EOPNOTSUPP;
1577
1578	spin_lock(&lli->lli_lock);
1579	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1580		CWARN("group lock already existed with gid %lu\n",
1581		      fd->fd_grouplock.cg_gid);
1582		spin_unlock(&lli->lli_lock);
1583		return -EINVAL;
1584	}
1585	LASSERT(fd->fd_grouplock.cg_lock == NULL);
1586	spin_unlock(&lli->lli_lock);
1587
1588	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1589			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
1590	if (rc)
1591		return rc;
1592
1593	spin_lock(&lli->lli_lock);
1594	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1595		spin_unlock(&lli->lli_lock);
1596		CERROR("another thread just won the race\n");
1597		cl_put_grouplock(&grouplock);
1598		return -EINVAL;
1599	}
1600
1601	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1602	fd->fd_grouplock = grouplock;
1603	spin_unlock(&lli->lli_lock);
1604
1605	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1606	return 0;
1607}
1608
1609int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1610{
1611	struct ll_inode_info   *lli = ll_i2info(inode);
1612	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1613	struct ccc_grouplock    grouplock;
1614
1615	spin_lock(&lli->lli_lock);
1616	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1617		spin_unlock(&lli->lli_lock);
1618		CWARN("no group lock held\n");
1619		return -EINVAL;
1620	}
1621	LASSERT(fd->fd_grouplock.cg_lock != NULL);
1622
1623	if (fd->fd_grouplock.cg_gid != arg) {
1624		CWARN("group lock %lu doesn't match current id %lu\n",
1625		       arg, fd->fd_grouplock.cg_gid);
1626		spin_unlock(&lli->lli_lock);
1627		return -EINVAL;
1628	}
1629
1630	grouplock = fd->fd_grouplock;
1631	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1632	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1633	spin_unlock(&lli->lli_lock);
1634
1635	cl_put_grouplock(&grouplock);
1636	CDEBUG(D_INFO, "group lock %lu released\n", arg);
1637	return 0;
1638}
1639
1640/**
1641 * Close inode open handle
1642 *
1643 * \param dentry [in]     dentry which contains the inode
1644 * \param it     [in,out] intent which contains open info and result
1645 *
1646 * \retval 0     success
1647 * \retval <0    failure
1648 */
1649int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1650{
1651	struct inode *inode = dentry->d_inode;
1652	struct obd_client_handle *och;
1653	int rc;
1654
1655	LASSERT(inode);
1656
1657	/* Root ? Do nothing. */
1658	if (dentry->d_inode->i_sb->s_root == dentry)
1659		return 0;
1660
1661	/* No open handle to close? Move away */
1662	if (!it_disposition(it, DISP_OPEN_OPEN))
1663		return 0;
1664
1665	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1666
1667	OBD_ALLOC(och, sizeof(*och));
1668	if (!och)
1669		GOTO(out, rc = -ENOMEM);
1670
1671	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1672
1673	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1674				       inode, och, NULL);
1675out:
1676	/* this one is in place of ll_file_open */
1677	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1678		ptlrpc_req_finished(it->d.lustre.it_data);
1679		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1680	}
1681	return rc;
1682}
1683
1684/**
1685 * Get the size for the inode for which the FIEMAP mapping is requested.
1686 * Make the FIEMAP get_info call and return the result.
1687 */
1688int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1689	      int num_bytes)
1690{
1691	struct obd_export *exp = ll_i2dtexp(inode);
1692	struct lov_stripe_md *lsm = NULL;
1693	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1694	int vallen = num_bytes;
1695	int rc;
1696
1697	/* Checks for fiemap flags */
1698	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1699		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1700		return -EBADR;
1701	}
1702
1703	/* Check for FIEMAP_FLAG_SYNC */
1704	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1705		rc = filemap_fdatawrite(inode->i_mapping);
1706		if (rc)
1707			return rc;
1708	}
1709
1710	lsm = ccc_inode_lsm_get(inode);
1711	if (lsm == NULL)
1712		return -ENOENT;
1713
1714	/* If the stripe_count > 1 and the application does not understand
1715	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1716	 */
1717	if (lsm->lsm_stripe_count > 1 &&
1718	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1719		GOTO(out, rc = -EOPNOTSUPP);
1720
1721	fm_key.oa.o_oi = lsm->lsm_oi;
1722	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1723
1724	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1725	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1726	/* If filesize is 0, then there would be no objects for mapping */
1727	if (fm_key.oa.o_size == 0) {
1728		fiemap->fm_mapped_extents = 0;
1729		GOTO(out, rc = 0);
1730	}
1731
1732	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1733
1734	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1735			  fiemap, lsm);
1736	if (rc)
1737		CERROR("obd_get_info failed: rc = %d\n", rc);
1738
1739out:
1740	ccc_inode_lsm_put(inode, lsm);
1741	return rc;
1742}
1743
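/* Handle the FID-to-path ioctl: copy in the user request, ask the MDC via
 * OBD_IOC_FID2PATH, and copy the resulting path back to user space. */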
1744int ll_fid2path(struct inode *inode, void *arg)
1745{
1746	struct obd_export	*exp = ll_i2mdexp(inode);
1747	struct getinfo_fid2path	*gfout, *gfin;
1748	int			 outsize, rc;
1749
1750	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1751	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1752		return -EPERM;
1753
1754	/* Need to get the buflen */
1755	OBD_ALLOC_PTR(gfin);
1756	if (gfin == NULL)
1757		return -ENOMEM;
1758	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1759		OBD_FREE_PTR(gfin);
1760		return -EFAULT;
1761	}
1762
1763	outsize = sizeof(*gfout) + gfin->gf_pathlen;
1764	OBD_ALLOC(gfout, outsize);
1765	if (gfout == NULL) {
1766		OBD_FREE_PTR(gfin);
1767		return -ENOMEM;
1768	}
1769	memcpy(gfout, gfin, sizeof(*gfout));
1770	OBD_FREE_PTR(gfin);
1771
1772	/* Call mdc_iocontrol */
1773	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1774	if (rc)
1775		GOTO(gf_free, rc);
1776
1777	if (copy_to_user(arg, gfout, outsize))
1778		rc = -EFAULT;
1779
1780gf_free:
1781	OBD_FREE(gfout, outsize);
1782	return rc;
1783}
1784
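/* Handle the FIEMAP ioctl: size and copy in the user's fiemap request, fill
 * it via ll_do_fiemap(), and copy the mapped extents back out. */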
1785static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1786{
1787	struct ll_user_fiemap *fiemap_s;
1788	size_t num_bytes, ret_bytes;
1789	unsigned int extent_count;
1790	int rc = 0;
1791
1792	/* Get the extent count so we can calculate the size of
1793	 * required fiemap buffer */
1794	if (get_user(extent_count,
1795	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1796		return -EFAULT;
1797	num_bytes = sizeof(*fiemap_s) + (extent_count *
1798					 sizeof(struct ll_fiemap_extent));
1799
1800	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1801	if (fiemap_s == NULL)
1802		return -ENOMEM;
1803
1804	/* get the fiemap value */
1805	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1806			   sizeof(*fiemap_s)))
1807		GOTO(error, rc = -EFAULT);
1808
1809	/* If fm_extent_count is non-zero, read the first extent since
1810	 * it is used to calculate end_offset and device from previous
1811	 * fiemap call. */
1812	if (extent_count) {
1813		if (copy_from_user(&fiemap_s->fm_extents[0],
1814		    (char __user *)arg + sizeof(*fiemap_s),
1815		    sizeof(struct ll_fiemap_extent)))
1816			GOTO(error, rc = -EFAULT);
1817	}
1818
1819	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1820	if (rc)
1821		GOTO(error, rc);
1822
1823	ret_bytes = sizeof(struct ll_user_fiemap);
1824
1825	if (extent_count != 0)
1826		ret_bytes += (fiemap_s->fm_mapped_extents *
1827				 sizeof(struct ll_fiemap_extent));
1828
1829	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1830		rc = -EFAULT;
1831
1832error:
1833	OBD_FREE_LARGE(fiemap_s, num_bytes);
1834	return rc;
1835}
1836
1837/*
1838 * Read the data_version for the inode.
1839 *
1840 * This value is computed using the stripe object version on the OST.
1841 * The version is computed using server-side locking.
1842 *
1843 * @param extent_lock  Take extent lock. Not needed if a process is already
1844 *		       holding the OST object group locks.
1845 */
1846int ll_data_version(struct inode *inode, __u64 *data_version,
1847		    int extent_lock)
1848{
1849	struct lov_stripe_md	*lsm = NULL;
1850	struct ll_sb_info	*sbi = ll_i2sbi(inode);
1851	struct obdo		*obdo = NULL;
1852	int			 rc;
1853
1854	/* If there is no stripe, we consider the version to be 0. */
1855	lsm = ccc_inode_lsm_get(inode);
1856	if (!lsm_has_objects(lsm)) {
1857		*data_version = 0;
1858		CDEBUG(D_INODE, "No object for inode\n");
1859		GOTO(out, rc = 0);
1860	}
1861
1862	OBD_ALLOC_PTR(obdo);
1863	if (obdo == NULL)
1864		GOTO(out, rc = -ENOMEM);
1865
1866	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1867	if (rc == 0) {
1868		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1869			rc = -EOPNOTSUPP;
1870		else
1871			*data_version = obdo->o_data_version;
1872	}
1873
1874	OBD_FREE_PTR(obdo);
1875out:
1876	ccc_inode_lsm_put(inode, lsm);
1877	return rc;
1878}
1879
1880/*
1881 * Trigger a HSM release request for the provided inode.
1882 */
1883int ll_hsm_release(struct inode *inode)
1884{
1885	struct cl_env_nest nest;
1886	struct lu_env *env;
1887	struct obd_client_handle *och = NULL;
1888	__u64 data_version = 0;
1889	int rc;
1890
1891
1892	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1893	       ll_get_fsname(inode->i_sb, NULL, 0),
1894	       PFID(&ll_i2info(inode)->lli_fid));
1895
1896	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1897	if (IS_ERR(och))
1898		GOTO(out, rc = PTR_ERR(och));
1899
1900	/* Grab latest data_version and [am]time values */
1901	rc = ll_data_version(inode, &data_version, 1);
1902	if (rc != 0)
1903		GOTO(out, rc);
1904
1905	env = cl_env_nested_get(&nest);
1906	if (IS_ERR(env))
1907		GOTO(out, rc = PTR_ERR(env));
1908
1909	ll_merge_lvb(env, inode);
1910	cl_env_nested_put(&nest, env);
1911
1912	/* Release the file.
1913	 * NB: the lease lock handle is released in mdc_hsm_release_pack()
1914	 * because we still need it to pack l_remote_handle to the MDT. */
1915	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1916				       &data_version);
1917	och = NULL;
1918
1919
1920out:
1921	if (och != NULL && !IS_ERR(och)) /* close the file */
1922		ll_lease_close(och, inode, NULL);
1923
1924	return rc;
1925}
1926
1927struct ll_swap_stack {
1928	struct iattr		 ia1, ia2;
1929	__u64			 dv1, dv2;
1930	struct inode		*inode1, *inode2;
1931	bool			 check_dv1, check_dv2;
1932};
1933
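/*
 * Swap the layouts of two files (LL_IOC_LOV_SWAP_LAYOUTS). Both inodes must
 * be writable and on the same filesystem. If a group lock id is supplied,
 * dirty caches are flushed by taking group locks on both files; if data
 * version checks are requested, the versions are verified just before the
 * swap is sent to the MDT, and mtime/atime may be restored afterwards.
 */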
1934static int ll_swap_layouts(struct file *file1, struct file *file2,
1935			   struct lustre_swap_layouts *lsl)
1936{
1937	struct mdc_swap_layouts	 msl;
1938	struct md_op_data	*op_data;
1939	__u32			 gid;
1940	__u64			 dv;
1941	struct ll_swap_stack	*llss = NULL;
1942	int			 rc;
1943
1944	OBD_ALLOC_PTR(llss);
1945	if (llss == NULL)
1946		return -ENOMEM;
1947
1948	llss->inode1 = file1->f_dentry->d_inode;
1949	llss->inode2 = file2->f_dentry->d_inode;
1950
1951	if (!S_ISREG(llss->inode2->i_mode))
1952		GOTO(free, rc = -EINVAL);
1953
1954	if (inode_permission(llss->inode1, MAY_WRITE) ||
1955	    inode_permission(llss->inode2, MAY_WRITE))
1956		GOTO(free, rc = -EPERM);
1957
1958	if (llss->inode2->i_sb != llss->inode1->i_sb)
1959		GOTO(free, rc = -EXDEV);
1960
1961	/* we use 2 bools because they are easier to swap than 2 bits */
1962	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1963		llss->check_dv1 = true;
1964
1965	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1966		llss->check_dv2 = true;
1967
1968	/* we cannot use lsl->sl_dvX directly because we may swap them */
1969	llss->dv1 = lsl->sl_dv1;
1970	llss->dv2 = lsl->sl_dv2;
1971
1972	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1973	if (rc == 0) /* same file, done! */
1974		GOTO(free, rc = 0);
1975
1976	if (rc < 0) { /* sequentialize it */
1977		swap(llss->inode1, llss->inode2);
1978		swap(file1, file2);
1979		swap(llss->dv1, llss->dv2);
1980		swap(llss->check_dv1, llss->check_dv2);
1981	}
1982
1983	gid = lsl->sl_gid;
1984	if (gid != 0) { /* application asks to flush dirty cache */
1985		rc = ll_get_grouplock(llss->inode1, file1, gid);
1986		if (rc < 0)
1987			GOTO(free, rc);
1988
1989		rc = ll_get_grouplock(llss->inode2, file2, gid);
1990		if (rc < 0) {
1991			ll_put_grouplock(llss->inode1, file1, gid);
1992			GOTO(free, rc);
1993		}
1994	}
1995
1996	/* to be able to restore mtime and atime after the swap,
1997	 * we need to save them first */
1998	if (lsl->sl_flags &
1999	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2000		llss->ia1.ia_mtime = llss->inode1->i_mtime;
2001		llss->ia1.ia_atime = llss->inode1->i_atime;
2002		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2003		llss->ia2.ia_mtime = llss->inode2->i_mtime;
2004		llss->ia2.ia_atime = llss->inode2->i_atime;
2005		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2006	}
2007
2008	/* ultimate check: before swapping the layouts we check whether
2009	 * the data version has changed (if requested) */
2010	if (llss->check_dv1) {
2011		rc = ll_data_version(llss->inode1, &dv, 0);
2012		if (rc)
2013			GOTO(putgl, rc);
2014		if (dv != llss->dv1)
2015			GOTO(putgl, rc = -EAGAIN);
2016	}
2017
2018	if (llss->check_dv2) {
2019		rc = ll_data_version(llss->inode2, &dv, 0);
2020		if (rc)
2021			GOTO(putgl, rc);
2022		if (dv != llss->dv2)
2023			GOTO(putgl, rc = -EAGAIN);
2024	}
2025
2026	/* struct md_op_data is used to send the swap args to the MDT;
2027	 * only the flags are missing, so we pass struct mdc_swap_layouts
2028	 * through md_op_data->op_data */
2029	/* flags from user space have to be converted before they are sent to
2030	 * the server; no flag is sent today, they are only used on the client */
2031	msl.msl_flags = 0;
2032	rc = -ENOMEM;
2033	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2034				     0, LUSTRE_OPC_ANY, &msl);
2035	if (IS_ERR(op_data))
2036		GOTO(free, rc = PTR_ERR(op_data));
2037
2038	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2039			   sizeof(*op_data), op_data, NULL);
2040	ll_finish_md_op_data(op_data);
2041
2042putgl:
2043	if (gid != 0) {
2044		ll_put_grouplock(llss->inode2, file2, gid);
2045		ll_put_grouplock(llss->inode1, file1, gid);
2046	}
2047
2048	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2049	if (rc != 0)
2050		GOTO(free, rc);
2051
2052	/* clear useless flags */
2053	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2054		llss->ia1.ia_valid &= ~ATTR_MTIME;
2055		llss->ia2.ia_valid &= ~ATTR_MTIME;
2056	}
2057
2058	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2059		llss->ia1.ia_valid &= ~ATTR_ATIME;
2060		llss->ia2.ia_valid &= ~ATTR_ATIME;
2061	}
2062
2063	/* update time if requested */
2064	rc = 0;
2065	if (llss->ia2.ia_valid != 0) {
2066		mutex_lock(&llss->inode1->i_mutex);
2067		rc = ll_setattr(file1->f_dentry, &llss->ia2);
2068		mutex_unlock(&llss->inode1->i_mutex);
2069	}
2070
2071	if (llss->ia1.ia_valid != 0) {
2072		int rc1;
2073
2074		mutex_lock(&llss->inode2->i_mutex);
2075		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2076		mutex_unlock(&llss->inode2->i_mutex);
2077		if (rc == 0)
2078			rc = rc1;
2079	}
2080
2081free:
2082	if (llss != NULL)
2083		OBD_FREE_PTR(llss);
2084
2085	return rc;
2086}
2087
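/*
 * Set or clear HSM flags on an inode by sending a struct hsm_state_set to
 * the MDT. Flags outside HSM_USER_MASK may only be changed by a process
 * with CFS_CAP_SYS_ADMIN.
 */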
2088static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2089{
2090	struct md_op_data	*op_data;
2091	int			 rc;
2092
2093	/* Non-root users are forbidden from setting or clearing flags that
2094	 * are NOT defined in HSM_USER_MASK. */
2095	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2096	    !cfs_capable(CFS_CAP_SYS_ADMIN))
2097		return -EPERM;
2098
2099	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2100				     LUSTRE_OPC_ANY, hss);
2101	if (IS_ERR(op_data))
2102		return PTR_ERR(op_data);
2103
2104	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2105			   sizeof(*op_data), op_data, NULL);
2106
2107	ll_finish_md_op_data(op_data);
2108
2109	return rc;
2110}
2111
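/*
 * Finish an HSM import (LL_IOC_HSM_IMPORT): mark the file as archived,
 * existing and released, then restore the mode, uid/gid, size and timestamps
 * recorded in struct hsm_user_import via ll_setattr_raw().
 */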
2112static int ll_hsm_import(struct inode *inode, struct file *file,
2113			 struct hsm_user_import *hui)
2114{
2115	struct hsm_state_set	*hss = NULL;
2116	struct iattr		*attr = NULL;
2117	int			 rc;
2118
2119
2120	if (!S_ISREG(inode->i_mode))
2121		return -EINVAL;
2122
2123	/* set HSM flags */
2124	OBD_ALLOC_PTR(hss);
2125	if (hss == NULL)
2126		GOTO(out, rc = -ENOMEM);
2127
2128	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2129	hss->hss_archive_id = hui->hui_archive_id;
2130	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2131	rc = ll_hsm_state_set(inode, hss);
2132	if (rc != 0)
2133		GOTO(out, rc);
2134
2135	OBD_ALLOC_PTR(attr);
2136	if (attr == NULL)
2137		GOTO(out, rc = -ENOMEM);
2138
2139	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2140	attr->ia_mode |= S_IFREG;
2141	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2142	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2143	attr->ia_size = hui->hui_size;
2144	attr->ia_mtime.tv_sec = hui->hui_mtime;
2145	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2146	attr->ia_atime.tv_sec = hui->hui_atime;
2147	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2148
2149	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2150			 ATTR_UID | ATTR_GID |
2151			 ATTR_MTIME | ATTR_MTIME_SET |
2152			 ATTR_ATIME | ATTR_ATIME_SET;
2153
2154	rc = ll_setattr_raw(file->f_dentry, attr, true);
2155	if (rc == -ENODATA)
2156		rc = 0;
2157
2158out:
2159	if (hss != NULL)
2160		OBD_FREE_PTR(hss);
2161
2162	if (attr != NULL)
2163		OBD_FREE_PTR(attr);
2164
2165	return rc;
2166}
2167
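/*
 * Main ioctl entry point for regular files: dispatch the Lustre-specific
 * commands (striping, layout swap, group locks, HSM state, leases, FID/path
 * translation, data version, ...) and fall back to the dynamically
 * registered handlers or a generic obd_iocontrol() for anything unknown.
 */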
2168long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2169{
2170	struct inode		*inode = file->f_dentry->d_inode;
2171	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
2172	int			 flags, rc;
2173
2174	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2175	       inode->i_generation, inode, cmd);
2176	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2177
2178	/* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
2179	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2180		return -ENOTTY;
2181
2182	switch(cmd) {
2183	case LL_IOC_GETFLAGS:
2184		/* Get the current value of the file flags */
2185		return put_user(fd->fd_flags, (int *)arg);
2186	case LL_IOC_SETFLAGS:
2187	case LL_IOC_CLRFLAGS:
2188		/* Set or clear specific file flags */
2189		/* XXX This probably needs checks to ensure the flags are
2190		 *     not abused, and to handle any flag side effects.
2191		 */
2192		if (get_user(flags, (int *) arg))
2193			return -EFAULT;
2194
2195		if (cmd == LL_IOC_SETFLAGS) {
2196			if ((flags & LL_FILE_IGNORE_LOCK) &&
2197			    !(file->f_flags & O_DIRECT)) {
2198				CERROR("%s: unable to disable locking on "
2199				       "non-O_DIRECT file\n", current->comm);
2200				return -EINVAL;
2201			}
2202
2203			fd->fd_flags |= flags;
2204		} else {
2205			fd->fd_flags &= ~flags;
2206		}
2207		return 0;
2208	case LL_IOC_LOV_SETSTRIPE:
2209		return ll_lov_setstripe(inode, file, arg);
2210	case LL_IOC_LOV_SETEA:
2211		return ll_lov_setea(inode, file, arg);
2212	case LL_IOC_LOV_SWAP_LAYOUTS: {
2213		struct file *file2;
2214		struct lustre_swap_layouts lsl;
2215
2216		if (copy_from_user(&lsl, (char *)arg,
2217				       sizeof(struct lustre_swap_layouts)))
2218			return -EFAULT;
2219
2220		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2221			return -EPERM;
2222
2223		file2 = fget(lsl.sl_fd);
2224		if (file2 == NULL)
2225			return -EBADF;
2226
2227		rc = -EPERM;
2228		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2229			rc = ll_swap_layouts(file, file2, &lsl);
2230		fput(file2);
2231		return rc;
2232	}
2233	case LL_IOC_LOV_GETSTRIPE:
2234		return ll_lov_getstripe(inode, arg);
2235	case LL_IOC_RECREATE_OBJ:
2236		return ll_lov_recreate_obj(inode, arg);
2237	case LL_IOC_RECREATE_FID:
2238		return ll_lov_recreate_fid(inode, arg);
2239	case FSFILT_IOC_FIEMAP:
2240		return ll_ioctl_fiemap(inode, arg);
2241	case FSFILT_IOC_GETFLAGS:
2242	case FSFILT_IOC_SETFLAGS:
2243		return ll_iocontrol(inode, file, cmd, arg);
2244	case FSFILT_IOC_GETVERSION_OLD:
2245	case FSFILT_IOC_GETVERSION:
2246		return put_user(inode->i_generation, (int *)arg);
2247	case LL_IOC_GROUP_LOCK:
2248		return ll_get_grouplock(inode, file, arg);
2249	case LL_IOC_GROUP_UNLOCK:
2250		return ll_put_grouplock(inode, file, arg);
2251	case IOC_OBD_STATFS:
2252		return ll_obd_statfs(inode, (void *)arg);
2253
2254	/* We need to special case any other ioctls we want to handle,
2255	 * to send them to the MDS/OST as appropriate and to properly
2256	 * network encode the arg field.
2257	case FSFILT_IOC_SETVERSION_OLD:
2258	case FSFILT_IOC_SETVERSION:
2259	*/
2260	case LL_IOC_FLUSHCTX:
2261		return ll_flush_ctx(inode);
2262	case LL_IOC_PATH2FID: {
2263		if (copy_to_user((void *)arg, ll_inode2fid(inode),
2264				 sizeof(struct lu_fid)))
2265			return -EFAULT;
2266
2267		return 0;
2268	}
2269	case OBD_IOC_FID2PATH:
2270		return ll_fid2path(inode, (void *)arg);
2271	case LL_IOC_DATA_VERSION: {
2272		struct ioc_data_version	idv;
2273		int			rc;
2274
2275		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2276			return -EFAULT;
2277
2278		rc = ll_data_version(inode, &idv.idv_version,
2279				!(idv.idv_flags & LL_DV_NOFLUSH));
2280
2281		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2282			return -EFAULT;
2283
2284		return rc;
2285	}
2286
2287	case LL_IOC_GET_MDTIDX: {
2288		int mdtidx;
2289
2290		mdtidx = ll_get_mdt_idx(inode);
2291		if (mdtidx < 0)
2292			return mdtidx;
2293
2294		if (put_user((int)mdtidx, (int*)arg))
2295			return -EFAULT;
2296
2297		return 0;
2298	}
2299	case OBD_IOC_GETDTNAME:
2300	case OBD_IOC_GETMDNAME:
2301		return ll_get_obd_name(inode, cmd, arg);
2302	case LL_IOC_HSM_STATE_GET: {
2303		struct md_op_data	*op_data;
2304		struct hsm_user_state	*hus;
2305		int			 rc;
2306
2307		OBD_ALLOC_PTR(hus);
2308		if (hus == NULL)
2309			return -ENOMEM;
2310
2311		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2312					     LUSTRE_OPC_ANY, hus);
2313		if (IS_ERR(op_data)) {
2314			OBD_FREE_PTR(hus);
2315			return PTR_ERR(op_data);
2316		}
2317
2318		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2319				   op_data, NULL);
2320
2321		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2322			rc = -EFAULT;
2323
2324		ll_finish_md_op_data(op_data);
2325		OBD_FREE_PTR(hus);
2326		return rc;
2327	}
2328	case LL_IOC_HSM_STATE_SET: {
2329		struct hsm_state_set	*hss;
2330		int			 rc;
2331
2332		OBD_ALLOC_PTR(hss);
2333		if (hss == NULL)
2334			return -ENOMEM;
2335
2336		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2337			OBD_FREE_PTR(hss);
2338			return -EFAULT;
2339		}
2340
2341		rc = ll_hsm_state_set(inode, hss);
2342
2343		OBD_FREE_PTR(hss);
2344		return rc;
2345	}
2346	case LL_IOC_HSM_ACTION: {
2347		struct md_op_data		*op_data;
2348		struct hsm_current_action	*hca;
2349		int				 rc;
2350
2351		OBD_ALLOC_PTR(hca);
2352		if (hca == NULL)
2353			return -ENOMEM;
2354
2355		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2356					     LUSTRE_OPC_ANY, hca);
2357		if (IS_ERR(op_data)) {
2358			OBD_FREE_PTR(hca);
2359			return PTR_ERR(op_data);
2360		}
2361
2362		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2363				   op_data, NULL);
2364
2365		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2366			rc = -EFAULT;
2367
2368		ll_finish_md_op_data(op_data);
2369		OBD_FREE_PTR(hca);
2370		return rc;
2371	}
2372	case LL_IOC_SET_LEASE: {
2373		struct ll_inode_info *lli = ll_i2info(inode);
2374		struct obd_client_handle *och = NULL;
2375		bool lease_broken;
2376		fmode_t mode = 0;
2377
2378		switch (arg) {
2379		case F_WRLCK:
2380			if (!(file->f_mode & FMODE_WRITE))
2381				return -EPERM;
2382			mode = FMODE_WRITE;
2383			break;
2384		case F_RDLCK:
2385			if (!(file->f_mode & FMODE_READ))
2386				return -EPERM;
2387			mode = FMODE_READ;
2388			break;
2389		case F_UNLCK:
2390			mutex_lock(&lli->lli_och_mutex);
2391			if (fd->fd_lease_och != NULL) {
2392				och = fd->fd_lease_och;
2393				fd->fd_lease_och = NULL;
2394			}
2395			mutex_unlock(&lli->lli_och_mutex);
2396
2397			if (och != NULL) {
2398				mode = och->och_flags &
2399				       (FMODE_READ|FMODE_WRITE);
2400				rc = ll_lease_close(och, inode, &lease_broken);
2401				if (rc == 0 && lease_broken)
2402					mode = 0;
2403			} else {
2404				rc = -ENOLCK;
2405			}
2406
2407			/* return the type of lease or error */
2408			return rc < 0 ? rc : (int)mode;
2409		default:
2410			return -EINVAL;
2411		}
2412
2413		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2414
2415		/* apply for lease */
2416		och = ll_lease_open(inode, file, mode, 0);
2417		if (IS_ERR(och))
2418			return PTR_ERR(och);
2419
2420		rc = 0;
2421		mutex_lock(&lli->lli_och_mutex);
2422		if (fd->fd_lease_och == NULL) {
2423			fd->fd_lease_och = och;
2424			och = NULL;
2425		}
2426		mutex_unlock(&lli->lli_och_mutex);
2427		if (och != NULL) {
2428			/* should be impossible; only excl leases are supported for now */
2429			ll_lease_close(och, inode, &lease_broken);
2430			rc = -EBUSY;
2431		}
2432		return rc;
2433	}
2434	case LL_IOC_GET_LEASE: {
2435		struct ll_inode_info *lli = ll_i2info(inode);
2436		struct ldlm_lock *lock = NULL;
2437
2438		rc = 0;
2439		mutex_lock(&lli->lli_och_mutex);
2440		if (fd->fd_lease_och != NULL) {
2441			struct obd_client_handle *och = fd->fd_lease_och;
2442
2443			lock = ldlm_handle2lock(&och->och_lease_handle);
2444			if (lock != NULL) {
2445				lock_res_and_lock(lock);
2446				if (!ldlm_is_cancel(lock))
2447					rc = och->och_flags &
2448						(FMODE_READ | FMODE_WRITE);
2449				unlock_res_and_lock(lock);
2450				ldlm_lock_put(lock);
2451			}
2452		}
2453		mutex_unlock(&lli->lli_och_mutex);
2454		return rc;
2455	}
2456	case LL_IOC_HSM_IMPORT: {
2457		struct hsm_user_import *hui;
2458
2459		OBD_ALLOC_PTR(hui);
2460		if (hui == NULL)
2461			return -ENOMEM;
2462
2463		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2464			OBD_FREE_PTR(hui);
2465			return -EFAULT;
2466		}
2467
2468		rc = ll_hsm_import(inode, file, hui);
2469
2470		OBD_FREE_PTR(hui);
2471		return rc;
2472	}
2473	default: {
2474		int err;
2475
2476		if (LLIOC_STOP ==
2477		     ll_iocontrol_call(inode, file, cmd, arg, &err))
2478			return err;
2479
2480		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2481				     (void *)arg);
2482	}
2483	}
2484}
2485
2486
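/*
 * llseek implementation. For SEEK_END/SEEK_HOLE/SEEK_DATA the file size is
 * refreshed with a glimpse before delegating to generic_file_llseek_size()
 * with the filesystem's maximum file size as the limit.
 */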
2487loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2488{
2489	struct inode *inode = file->f_dentry->d_inode;
2490	loff_t retval, eof = 0;
2491
2492	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2493			   (origin == SEEK_CUR) ? file->f_pos : 0);
2494	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2495	       inode->i_ino, inode->i_generation, inode, retval, retval,
2496	       origin);
2497	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2498
2499	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2500		retval = ll_glimpse_size(inode);
2501		if (retval != 0)
2502			return retval;
2503		eof = i_size_read(inode);
2504	}
2505
2506	retval = generic_file_llseek_size(file, offset, origin,
2507					  ll_file_maxbytes(inode), eof);
2508	return retval;
2509}
2510
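/*
 * Called on close(): report any asynchronous write error recorded for this
 * inode, unless the application has already been told about the failure
 * (fd_write_failed).
 */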
2511int ll_flush(struct file *file, fl_owner_t id)
2512{
2513	struct inode *inode = file->f_dentry->d_inode;
2514	struct ll_inode_info *lli = ll_i2info(inode);
2515	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2516	int rc, err;
2517
2518	LASSERT(!S_ISDIR(inode->i_mode));
2519
2520	/* catch async errors that were recorded back when async writeback
2521	 * failed for pages in this mapping. */
2522	rc = lli->lli_async_rc;
2523	lli->lli_async_rc = 0;
2524	err = lov_read_and_clear_async_rc(lli->lli_clob);
2525	if (rc == 0)
2526		rc = err;
2527
2528	/* The application has been told write failure already.
2529	 * Do not report failure again. */
2530	if (fd->fd_write_failed)
2531		return 0;
2532	return rc ? -EIO : 0;
2533}
2534
2535/**
2536 * Called to make sure a portion of the file has been written out.
2537 * Depending on @mode, it may also send OST_SYNC RPCs to the OSTs.
2538 *
2539 * Return how many pages have been written.
2540 */
2541int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2542		       enum cl_fsync_mode mode, int ignore_layout)
2543{
2544	struct cl_env_nest nest;
2545	struct lu_env *env;
2546	struct cl_io *io;
2547	struct obd_capa *capa = NULL;
2548	struct cl_fsync_io *fio;
2549	int result;
2550
2551	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2552	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2553		return -EINVAL;
2554
2555	env = cl_env_nested_get(&nest);
2556	if (IS_ERR(env))
2557		return PTR_ERR(env);
2558
2559	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2560
2561	io = ccc_env_thread_io(env);
2562	io->ci_obj = cl_i2info(inode)->lli_clob;
2563	io->ci_ignore_layout = ignore_layout;
2564
2565	/* initialize parameters for sync */
2566	fio = &io->u.ci_fsync;
2567	fio->fi_capa = capa;
2568	fio->fi_start = start;
2569	fio->fi_end = end;
2570	fio->fi_fid = ll_inode2fid(inode);
2571	fio->fi_mode = mode;
2572	fio->fi_nr_written = 0;
2573
2574	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2575		result = cl_io_loop(env, io);
2576	else
2577		result = io->ci_result;
2578	if (result == 0)
2579		result = fio->fi_nr_written;
2580	cl_io_fini(env, io);
2581	cl_env_nested_put(&nest, env);
2582
2583	capa_put(capa);
2584
2585	return result;
2586}
2587
2588/*
2589 * When dentry is provided (the 'else' case), *file->f_dentry may be
2590 * null and dentry must be used directly rather than pulled from
2591 * *file->f_dentry as is done otherwise.
2592 */
2593
2594int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2595{
2596	struct dentry *dentry = file->f_dentry;
2597	struct inode *inode = dentry->d_inode;
2598	struct ll_inode_info *lli = ll_i2info(inode);
2599	struct ptlrpc_request *req;
2600	struct obd_capa *oc;
2601	int rc, err;
2602
2603	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2604	       inode->i_generation, inode);
2605	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2606
2607	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2608	mutex_lock(&inode->i_mutex);
2609
2610	/* catch async errors that were recorded back when async writeback
2611	 * failed for pages in this mapping. */
2612	if (!S_ISDIR(inode->i_mode)) {
2613		err = lli->lli_async_rc;
2614		lli->lli_async_rc = 0;
2615		if (rc == 0)
2616			rc = err;
2617		err = lov_read_and_clear_async_rc(lli->lli_clob);
2618		if (rc == 0)
2619			rc = err;
2620	}
2621
2622	oc = ll_mdscapa_get(inode);
2623	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2624		      &req);
2625	capa_put(oc);
2626	if (!rc)
2627		rc = err;
2628	if (!err)
2629		ptlrpc_req_finished(req);
2630
2631	if (datasync && S_ISREG(inode->i_mode)) {
2632		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2633
2634		err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2635				CL_FSYNC_ALL, 0);
2636		if (rc == 0 && err < 0)
2637			rc = err;
2638		if (rc < 0)
2639			fd->fd_write_failed = true;
2640		else
2641			fd->fd_write_failed = false;
2642	}
2643
2644	mutex_unlock(&inode->i_mutex);
2645	return rc;
2646}
2647
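/*
 * Implement fcntl()/flock() locks by enqueueing an LDLM_FLOCK lock on the
 * MDT. F_UNLCK is sent as an LCK_NL enqueue; on success the lock is also
 * registered locally via flock_lock_file_wait()/posix_lock_file_wait(), and
 * if that local step fails the server lock is released again.
 */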
2648int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2649{
2650	struct inode *inode = file->f_dentry->d_inode;
2651	struct ll_sb_info *sbi = ll_i2sbi(inode);
2652	struct ldlm_enqueue_info einfo = {
2653		.ei_type	= LDLM_FLOCK,
2654		.ei_cb_cp	= ldlm_flock_completion_ast,
2655		.ei_cbdata	= file_lock,
2656	};
2657	struct md_op_data *op_data;
2658	struct lustre_handle lockh = {0};
2659	ldlm_policy_data_t flock = {{0}};
2660	int flags = 0;
2661	int rc;
2662	int rc2 = 0;
2663
2664	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2665	       inode->i_ino, file_lock);
2666
2667	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2668
2669	if (file_lock->fl_flags & FL_FLOCK) {
2670		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2671		/* flocks are whole-file locks */
2672		flock.l_flock.end = OFFSET_MAX;
2673		/* For flocks, owner is determined by the local file descriptor */
2674		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2675	} else if (file_lock->fl_flags & FL_POSIX) {
2676		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2677		flock.l_flock.start = file_lock->fl_start;
2678		flock.l_flock.end = file_lock->fl_end;
2679	} else {
2680		return -EINVAL;
2681	}
2682	flock.l_flock.pid = file_lock->fl_pid;
2683
2684	/* Somewhat ugly workaround for svc lockd.
2685	 * lockd installs a custom fl_lmops->lm_compare_owner that checks
2686	 * that the fl_owner is the same (which it always is on the local node,
2687	 * I guess, between lockd processes) and then compares the pid.
2688	 * As such we assign the pid to the owner field to make it all work;
2689	 * a conflict with normal locks is unlikely since the pid space and
2690	 * the pointer space for current->files do not intersect */
2691	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2692		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2693
2694	switch (file_lock->fl_type) {
2695	case F_RDLCK:
2696		einfo.ei_mode = LCK_PR;
2697		break;
2698	case F_UNLCK:
2699		/* An unlock request may or may not have any relation to
2700		 * existing locks so we may not be able to pass a lock handle
2701		 * via a normal ldlm_lock_cancel() request. The request may even
2702		 * unlock a byte range in the middle of an existing lock. In
2703		 * order to process an unlock request we need all of the same
2704		 * information that is given with a normal read or write record
2705		 * lock request. To avoid creating another ldlm unlock (cancel)
2706		 * message we'll treat a LCK_NL flock request as an unlock. */
2707		einfo.ei_mode = LCK_NL;
2708		break;
2709	case F_WRLCK:
2710		einfo.ei_mode = LCK_PW;
2711		break;
2712	default:
2713		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2714			file_lock->fl_type);
2715		return -ENOTSUPP;
2716	}
2717
2718	switch (cmd) {
2719	case F_SETLKW:
2720#ifdef F_SETLKW64
2721	case F_SETLKW64:
2722#endif
2723		flags = 0;
2724		break;
2725	case F_SETLK:
2726#ifdef F_SETLK64
2727	case F_SETLK64:
2728#endif
2729		flags = LDLM_FL_BLOCK_NOWAIT;
2730		break;
2731	case F_GETLK:
2732#ifdef F_GETLK64
2733	case F_GETLK64:
2734#endif
2735		flags = LDLM_FL_TEST_LOCK;
2736		/* Save the old mode so that if the mode in the lock changes we
2737		 * can decrement the appropriate reader or writer refcount. */
2738		file_lock->fl_type = einfo.ei_mode;
2739		break;
2740	default:
2741		CERROR("unknown fcntl lock command: %d\n", cmd);
2742		return -EINVAL;
2743	}
2744
2745	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2746				     LUSTRE_OPC_ANY, NULL);
2747	if (IS_ERR(op_data))
2748		return PTR_ERR(op_data);
2749
2750	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2751	       "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2752	       flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2753
2754	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2755			op_data, &lockh, &flock, 0, NULL /* req */, flags);
2756
2757	if ((file_lock->fl_flags & FL_FLOCK) &&
2758	    (rc == 0 || file_lock->fl_type == F_UNLCK))
2759		rc2  = flock_lock_file_wait(file, file_lock);
2760	if ((file_lock->fl_flags & FL_POSIX) &&
2761	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2762	    !(flags & LDLM_FL_TEST_LOCK))
2763		rc2  = posix_lock_file_wait(file, file_lock);
2764
2765	if (rc2 && file_lock->fl_type != F_UNLCK) {
2766		einfo.ei_mode = LCK_NL;
2767		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2768			op_data, &lockh, &flock, 0, NULL /* req */, flags);
2769		rc = rc2;
2770	}
2771
2772	ll_finish_md_op_data(op_data);
2773
2774	return rc;
2775}
2776
2777int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2778{
2779	return -ENOSYS;
2780}
2781
2782/**
2783 * Test whether locks matching bits and l_req_mode are acquired:
2784 * - the bits may be spread across different locks
2785 * - if found, clear the common lock bits in *bits
2786 * - the bits not found are kept in *bits
2787 * \param inode [IN]
2788 * \param bits [IN] searched lock bits
2789 * \param l_req_mode [IN] searched lock mode
2790 * \retval boolean, true iff all bits are found
2791 */
2792int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2793{
2794	struct lustre_handle lockh;
2795	ldlm_policy_data_t policy;
2796	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2797				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2798	struct lu_fid *fid;
2799	__u64 flags;
2800	int i;
2801
2802	if (!inode)
2803		return 0;
2804
2805	fid = &ll_i2info(inode)->lli_fid;
2806	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2807	       ldlm_lockname[mode]);
2808
2809	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2810	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2811		policy.l_inodebits.bits = *bits & (1 << i);
2812		if (policy.l_inodebits.bits == 0)
2813			continue;
2814
2815		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2816				  &policy, mode, &lockh)) {
2817			struct ldlm_lock *lock;
2818
2819			lock = ldlm_handle2lock(&lockh);
2820			if (lock) {
2821				*bits &=
2822				      ~(lock->l_policy_data.l_inodebits.bits);
2823				LDLM_LOCK_PUT(lock);
2824			} else {
2825				*bits &= ~policy.l_inodebits.bits;
2826			}
2827		}
2828	}
2829	return *bits == 0;
2830}
2831
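/*
 * Try to match an already granted MDS inodebits lock covering @bits on this
 * inode. On success the lock handle is stored in @lockh and the matched mode
 * is returned; 0 means no matching lock was found.
 */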
2832ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2833			    struct lustre_handle *lockh, __u64 flags,
2834			    ldlm_mode_t mode)
2835{
2836	ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2837	struct lu_fid *fid;
2838	ldlm_mode_t rc;
2839
2840	fid = &ll_i2info(inode)->lli_fid;
2841	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2842
2843	rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2844			   fid, LDLM_IBITS, &policy, mode, lockh);
2845
2846	return rc;
2847}
2848
2849static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2850{
2851	/* Already unlinked. Just update nlink and return success */
2852	if (rc == -ENOENT) {
2853		clear_nlink(inode);
2854		/* This path cannot be hit for regular files except in the
2855		 * case of obscure races, so there is no need to validate size.
2856		 */
2857		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2858			return 0;
2859	} else if (rc != 0) {
2860		CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2861		       ll_get_fsname(inode->i_sb, NULL, 0),
2862		       PFID(ll_inode2fid(inode)), rc);
2863	}
2864
2865	return rc;
2866}
2867
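/*
 * Revalidate inode attributes from the MDS. If the server supports
 * OBD_CONNECT_ATTRFID, a getattr-by-FID intent lock is used; otherwise,
 * unless a matching ibits lock is already cached, a plain md_getattr()
 * is issued and the reply is applied with ll_prep_inode().
 */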
2868int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2869			     __u64 ibits)
2870{
2871	struct inode *inode = dentry->d_inode;
2872	struct ptlrpc_request *req = NULL;
2873	struct obd_export *exp;
2874	int rc = 0;
2875
2876	LASSERT(inode != NULL);
2877
2878	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2879	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2880
2881	exp = ll_i2mdexp(inode);
2882
2883	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPCs.
2884	 *      But in the CMD case it caused some lock issues; this should be
2885	 *      fixed with the new CMD ibits lock. See bug 12718 */
2886	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2887		struct lookup_intent oit = { .it_op = IT_GETATTR };
2888		struct md_op_data *op_data;
2889
2890		if (ibits == MDS_INODELOCK_LOOKUP)
2891			oit.it_op = IT_LOOKUP;
2892
2893		/* Call getattr by fid, so do not provide name at all. */
2894		op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2895					     dentry->d_inode, NULL, 0, 0,
2896					     LUSTRE_OPC_ANY, NULL);
2897		if (IS_ERR(op_data))
2898			return PTR_ERR(op_data);
2899
2900		oit.it_create_mode |= M_CHECK_STALE;
2901		rc = md_intent_lock(exp, op_data, NULL, 0,
2902				    /* we are not interested in name
2903				       based lookup */
2904				    &oit, 0, &req,
2905				    ll_md_blocking_ast, 0);
2906		ll_finish_md_op_data(op_data);
2907		oit.it_create_mode &= ~M_CHECK_STALE;
2908		if (rc < 0) {
2909			rc = ll_inode_revalidate_fini(inode, rc);
2910			GOTO(out, rc);
2911		}
2912
2913		rc = ll_revalidate_it_finish(req, &oit, dentry);
2914		if (rc != 0) {
2915			ll_intent_release(&oit);
2916			GOTO(out, rc);
2917		}
2918
2919		/* Unlinked? Unhash dentry, so it is not picked up later by
2920		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2921		   here to preserve get_cwd functionality on 2.6.
2922		   Bug 10503 */
2923		if (!dentry->d_inode->i_nlink)
2924			d_lustre_invalidate(dentry, 0);
2925
2926		ll_lookup_finish_locks(&oit, dentry);
2927	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2928		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2929		obd_valid valid = OBD_MD_FLGETATTR;
2930		struct md_op_data *op_data;
2931		int ealen = 0;
2932
2933		if (S_ISREG(inode->i_mode)) {
2934			rc = ll_get_max_mdsize(sbi, &ealen);
2935			if (rc)
2936				return rc;
2937			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2938		}
2939
2940		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2941					     0, ealen, LUSTRE_OPC_ANY,
2942					     NULL);
2943		if (IS_ERR(op_data))
2944			return PTR_ERR(op_data);
2945
2946		op_data->op_valid = valid;
2947		/* When OBD_CONNECT_ATTRFID is not supported, we can't find a
2948		 * capa for this inode, because we only keep the capas of dirs
2949		 * fresh. */
2950		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2951		ll_finish_md_op_data(op_data);
2952		if (rc) {
2953			rc = ll_inode_revalidate_fini(inode, rc);
2954			return rc;
2955		}
2956
2957		rc = ll_prep_inode(&inode, req, NULL, NULL);
2958	}
2959out:
2960	ptlrpc_req_finished(req);
2961	return rc;
2962}
2963
2964int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2965			   __u64 ibits)
2966{
2967	struct inode *inode = dentry->d_inode;
2968	int rc;
2969
2970	rc = __ll_inode_revalidate_it(dentry, it, ibits);
2971	if (rc != 0)
2972		return rc;
2973
2974	/* if object isn't regular file, don't validate size */
2975	if (!S_ISREG(inode->i_mode)) {
2976		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2977		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2978		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2979	} else {
2980		/* In case of restore, the MDT has the right size and has
2981		 * already sent it back without granting the layout lock;
2982		 * the inode is up to date, so a glimpse is useless.
2983		 * Also, to glimpse we need the layout; in case of a running
2984		 * restore the MDT holds the layout lock, so the glimpse will
2985		 * block until the end of the restore (getattr will block)
2986		 */
2987		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2988			rc = ll_glimpse_size(inode);
2989	}
2990	return rc;
2991}
2992
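/*
 * getattr implementation: revalidate the UPDATE|LOOKUP ibits from the MDS
 * and fill in *stat from the inode, mapping the FID to a 32-bit inode
 * number when the 32-bit API is required.
 */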
2993int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2994		  struct lookup_intent *it, struct kstat *stat)
2995{
2996	struct inode *inode = de->d_inode;
2997	struct ll_sb_info *sbi = ll_i2sbi(inode);
2998	struct ll_inode_info *lli = ll_i2info(inode);
2999	int res = 0;
3000
3001	res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3002					     MDS_INODELOCK_LOOKUP);
3003	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3004
3005	if (res)
3006		return res;
3007
3008	stat->dev = inode->i_sb->s_dev;
3009	if (ll_need_32bit_api(sbi))
3010		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3011	else
3012		stat->ino = inode->i_ino;
3013	stat->mode = inode->i_mode;
3014	stat->nlink = inode->i_nlink;
3015	stat->uid = inode->i_uid;
3016	stat->gid = inode->i_gid;
3017	stat->rdev = inode->i_rdev;
3018	stat->atime = inode->i_atime;
3019	stat->mtime = inode->i_mtime;
3020	stat->ctime = inode->i_ctime;
3021	stat->blksize = 1 << inode->i_blkbits;
3022
3023	stat->size = i_size_read(inode);
3024	stat->blocks = inode->i_blocks;
3025
3026	return 0;
3027}
3028int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3029{
3030	struct lookup_intent it = { .it_op = IT_GETATTR };
3031
3032	return ll_getattr_it(mnt, de, &it, stat);
3033}
3034
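/*
 * ->fiemap() method: translate struct fiemap_extent_info into the Lustre
 * ll_user_fiemap format, run ll_do_fiemap(), and copy the mapped extents
 * back into the caller's extent array.
 */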
3035int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3036		__u64 start, __u64 len)
3037{
3038	int rc;
3039	size_t num_bytes;
3040	struct ll_user_fiemap *fiemap;
3041	unsigned int extent_count = fieinfo->fi_extents_max;
3042
3043	num_bytes = sizeof(*fiemap) + (extent_count *
3044				       sizeof(struct ll_fiemap_extent));
3045	OBD_ALLOC_LARGE(fiemap, num_bytes);
3046
3047	if (fiemap == NULL)
3048		return -ENOMEM;
3049
3050	fiemap->fm_flags = fieinfo->fi_flags;
3051	fiemap->fm_extent_count = fieinfo->fi_extents_max;
3052	fiemap->fm_start = start;
3053	fiemap->fm_length = len;
3054	memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3055	       sizeof(struct ll_fiemap_extent));
3056
3057	rc = ll_do_fiemap(inode, fiemap, num_bytes);
3058
3059	fieinfo->fi_flags = fiemap->fm_flags;
3060	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3061	memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3062	       fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3063
3064	OBD_FREE_LARGE(fiemap, num_bytes);
3065	return rc;
3066}
3067
3068struct posix_acl * ll_get_acl(struct inode *inode, int type)
3069{
3070	struct ll_inode_info *lli = ll_i2info(inode);
3071	struct posix_acl *acl = NULL;
3072
3073	spin_lock(&lli->lli_lock);
3074	/* VFS' acl_permission_check->check_acl will release the refcount */
3075	acl = posix_acl_dup(lli->lli_posix_acl);
3076	spin_unlock(&lli->lli_lock);
3077
3078	return acl;
3079}
3080
3081
3082int ll_inode_permission(struct inode *inode, int mask)
3083{
3084	int rc = 0;
3085
3086#ifdef MAY_NOT_BLOCK
3087	if (mask & MAY_NOT_BLOCK)
3088		return -ECHILD;
3089#endif
3090
3091	/* As the root inode is NOT validated by the lookup operation,
3092	 * we need to do it before the permission check. */
3093
3094	if (inode == inode->i_sb->s_root->d_inode) {
3095		struct lookup_intent it = { .it_op = IT_LOOKUP };
3096
3097		rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3098					      MDS_INODELOCK_LOOKUP);
3099		if (rc)
3100			return rc;
3101	}
3102
3103	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3104	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3105
3106	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3107		return lustre_check_remote_perm(inode, mask);
3108
3109	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3110	rc = generic_permission(inode, mask);
3111
3112	return rc;
3113}
3114
3115/* -o localflock - only provides locally consistent flock locks */
3116struct file_operations ll_file_operations = {
3117	.read	   = ll_file_read,
3118	.aio_read = ll_file_aio_read,
3119	.write	  = ll_file_write,
3120	.aio_write = ll_file_aio_write,
3121	.unlocked_ioctl = ll_file_ioctl,
3122	.open	   = ll_file_open,
3123	.release	= ll_file_release,
3124	.mmap	   = ll_file_mmap,
3125	.llseek	 = ll_file_seek,
3126	.splice_read    = ll_file_splice_read,
3127	.fsync	  = ll_fsync,
3128	.flush	  = ll_flush
3129};
3130
3131struct file_operations ll_file_operations_flock = {
3132	.read	   = ll_file_read,
3133	.aio_read    = ll_file_aio_read,
3134	.write	  = ll_file_write,
3135	.aio_write   = ll_file_aio_write,
3136	.unlocked_ioctl = ll_file_ioctl,
3137	.open	   = ll_file_open,
3138	.release	= ll_file_release,
3139	.mmap	   = ll_file_mmap,
3140	.llseek	 = ll_file_seek,
3141	.splice_read    = ll_file_splice_read,
3142	.fsync	  = ll_fsync,
3143	.flush	  = ll_flush,
3144	.flock	  = ll_file_flock,
3145	.lock	   = ll_file_flock
3146};
3147
3148/* These are for -o noflock - to return ENOSYS on flock calls */
3149struct file_operations ll_file_operations_noflock = {
3150	.read	   = ll_file_read,
3151	.aio_read    = ll_file_aio_read,
3152	.write	  = ll_file_write,
3153	.aio_write   = ll_file_aio_write,
3154	.unlocked_ioctl = ll_file_ioctl,
3155	.open	   = ll_file_open,
3156	.release	= ll_file_release,
3157	.mmap	   = ll_file_mmap,
3158	.llseek	 = ll_file_seek,
3159	.splice_read    = ll_file_splice_read,
3160	.fsync	  = ll_fsync,
3161	.flush	  = ll_flush,
3162	.flock	  = ll_file_noflock,
3163	.lock	   = ll_file_noflock
3164};
3165
3166struct inode_operations ll_file_inode_operations = {
3167	.setattr	= ll_setattr,
3168	.getattr	= ll_getattr,
3169	.permission	= ll_inode_permission,
3170	.setxattr	= ll_setxattr,
3171	.getxattr	= ll_getxattr,
3172	.listxattr	= ll_listxattr,
3173	.removexattr	= ll_removexattr,
3174	.fiemap		= ll_fiemap,
3175	.get_acl	= ll_get_acl,
3176};
3177
3178/* dynamic ioctl number support routines */
3179static struct llioc_ctl_data {
3180	struct rw_semaphore	ioc_sem;
3181	struct list_head	      ioc_head;
3182} llioc = {
3183	__RWSEM_INITIALIZER(llioc.ioc_sem),
3184	LIST_HEAD_INIT(llioc.ioc_head)
3185};
3186
3187
3188struct llioc_data {
3189	struct list_head	      iocd_list;
3190	unsigned int	    iocd_size;
3191	llioc_callback_t	iocd_cb;
3192	unsigned int	    iocd_count;
3193	unsigned int	    iocd_cmd[0];
3194};
3195
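/*
 * Register a dynamic ioctl handler: @cb will be called for each of the
 * @count command numbers in @cmd. Returns an opaque cookie to pass to
 * ll_iocontrol_unregister(), or NULL on error.
 */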
3196void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3197{
3198	unsigned int size;
3199	struct llioc_data *in_data = NULL;
3200
3201	if (cb == NULL || cmd == NULL ||
3202	    count > LLIOC_MAX_CMD || count < 0)
3203		return NULL;
3204
3205	size = sizeof(*in_data) + count * sizeof(unsigned int);
3206	OBD_ALLOC(in_data, size);
3207	if (in_data == NULL)
3208		return NULL;
3209
3210	memset(in_data, 0, sizeof(*in_data));
3211	in_data->iocd_size = size;
3212	in_data->iocd_cb = cb;
3213	in_data->iocd_count = count;
3214	memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3215
3216	down_write(&llioc.ioc_sem);
3217	list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3218	up_write(&llioc.ioc_sem);
3219
3220	return in_data;
3221}
3222
3223void ll_iocontrol_unregister(void *magic)
3224{
3225	struct llioc_data *tmp;
3226
3227	if (magic == NULL)
3228		return;
3229
3230	down_write(&llioc.ioc_sem);
3231	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3232		if (tmp == magic) {
3233			unsigned int size = tmp->iocd_size;
3234
3235			list_del(&tmp->iocd_list);
3236			up_write(&llioc.ioc_sem);
3237
3238			OBD_FREE(tmp, size);
3239			return;
3240		}
3241	}
3242	up_write(&llioc.ioc_sem);
3243
3244	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3245}
3246
3247EXPORT_SYMBOL(ll_iocontrol_register);
3248EXPORT_SYMBOL(ll_iocontrol_unregister);
3249
3250enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3251			unsigned int cmd, unsigned long arg, int *rcp)
3252{
3253	enum llioc_iter ret = LLIOC_CONT;
3254	struct llioc_data *data;
3255	int rc = -EINVAL, i;
3256
3257	down_read(&llioc.ioc_sem);
3258	list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3259		for (i = 0; i < data->iocd_count; i++) {
3260			if (cmd != data->iocd_cmd[i])
3261				continue;
3262
3263			ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3264			break;
3265		}
3266
3267		if (ret == LLIOC_STOP)
3268			break;
3269	}
3270	up_read(&llioc.ioc_sem);
3271
3272	if (rcp)
3273		*rcp = rc;
3274	return ret;
3275}
3276
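/*
 * Push a layout configuration into the cl_object of this inode. For
 * OBJECT_CONF_SET the layout lock is only allowed to match once the layout
 * has been applied, so that a false layout is never seen.
 */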
3277int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3278{
3279	struct ll_inode_info *lli = ll_i2info(inode);
3280	struct cl_env_nest nest;
3281	struct lu_env *env;
3282	int result;
3283
3284	if (lli->lli_clob == NULL)
3285		return 0;
3286
3287	env = cl_env_nested_get(&nest);
3288	if (IS_ERR(env))
3289		return PTR_ERR(env);
3290
3291	result = cl_conf_set(env, lli->lli_clob, conf);
3292	cl_env_nested_put(&nest, env);
3293
3294	if (conf->coc_opc == OBJECT_CONF_SET) {
3295		struct ldlm_lock *lock = conf->coc_lock;
3296
3297		LASSERT(lock != NULL);
3298		LASSERT(ldlm_has_layout(lock));
3299		if (result == 0) {
3300			/* it can only be allowed to match after the layout is
3301			 * applied to the inode, otherwise a false layout would
3302			 * be seen. Applying the layout should happen before
3303			 * dropping the intent lock. */
3304			ldlm_lock_allow_match(lock);
3305		}
3306	}
3307	return result;
3308}
3309
3310/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3311static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3312
3313{
3314	struct ll_sb_info *sbi = ll_i2sbi(inode);
3315	struct obd_capa *oc;
3316	struct ptlrpc_request *req;
3317	struct mdt_body *body;
3318	void *lvbdata;
3319	void *lmm;
3320	int lmmsize;
3321	int rc;
3322
3323	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3324	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3325	       lock->l_lvb_data, lock->l_lvb_len);
3326
3327	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3328		return 0;
3329
3330	/* if the layout lock was granted right away, the layout is returned
3331	 * within the DLM_LVB of the DLM reply; otherwise, if the lock was ever
3332	 * blocked and then granted via a completion AST, we have to fetch the
3333	 * layout here. Please note that we can't use the LVB buffer in the
3334	 * completion AST because it is not large enough */
3335	oc = ll_mdscapa_get(inode);
3336	rc = ll_get_max_mdsize(sbi, &lmmsize);
3337	if (rc == 0)
3338		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3339				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3340				lmmsize, 0, &req);
3341	capa_put(oc);
3342	if (rc < 0)
3343		return rc;
3344
3345	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3346	if (body == NULL || body->eadatasize > lmmsize)
3347		GOTO(out, rc = -EPROTO);
3348
3349	lmmsize = body->eadatasize;
3350	if (lmmsize == 0) /* empty layout */
3351		GOTO(out, rc = 0);
3352
3353	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3354	if (lmm == NULL)
3355		GOTO(out, rc = -EFAULT);
3356
3357	OBD_ALLOC_LARGE(lvbdata, lmmsize);
3358	if (lvbdata == NULL)
3359		GOTO(out, rc = -ENOMEM);
3360
3361	memcpy(lvbdata, lmm, lmmsize);
3362	lock_res_and_lock(lock);
3363	if (lock->l_lvb_data != NULL)
3364		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3365
3366	lock->l_lvb_data = lvbdata;
3367	lock->l_lvb_len = lmmsize;
3368	unlock_res_and_lock(lock);
3369
3370out:
3371	ptlrpc_req_finished(req);
3372	return rc;
3373}
3374
3375/**
3376 * Apply the layout to the inode. Layout lock is held and will be released
3377 * in this function.
3378 */
3379static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3380				struct inode *inode, __u32 *gen, bool reconf)
3381{
3382	struct ll_inode_info *lli = ll_i2info(inode);
3383	struct ll_sb_info    *sbi = ll_i2sbi(inode);
3384	struct ldlm_lock *lock;
3385	struct lustre_md md = { NULL };
3386	struct cl_object_conf conf;
3387	int rc = 0;
3388	bool lvb_ready;
3389	bool wait_layout = false;
3390
3391	LASSERT(lustre_handle_is_used(lockh));
3392
3393	lock = ldlm_handle2lock(lockh);
3394	LASSERT(lock != NULL);
3395	LASSERT(ldlm_has_layout(lock));
3396
3397	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3398		   inode, PFID(&lli->lli_fid), reconf);
3399
3400	/* in case this is a cached lock, reinstate it with the new inode */
3401	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3402
3403	lock_res_and_lock(lock);
3404	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3405	unlock_res_and_lock(lock);
3406	/* checking lvb_ready is racy but this is okay. The worst case is
3407	 * that multiple processes may configure the file at the same time. */
3408	if (lvb_ready || !reconf) {
3409		rc = -ENODATA;
3410		if (lvb_ready) {
3411			/* layout_gen must be valid if the layout lock is not
3412			 * cancelled and the stripe has already been set */
3413			*gen = lli->lli_layout_gen;
3414			rc = 0;
3415		}
3416		GOTO(out, rc);
3417	}
3418
3419	rc = ll_layout_fetch(inode, lock);
3420	if (rc < 0)
3421		GOTO(out, rc);
3422
3423	/* for the layout lock, the lmm is returned in the lock's lvb.
3424	 * lvb_data is immutable while the lock is held, so it's safe to access
3425	 * it without the res lock. See ldlm_lock_decref_internal() for the
3426	 * condition to free the lvb_data of a layout lock */
3427	if (lock->l_lvb_data != NULL) {
3428		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3429				  lock->l_lvb_data, lock->l_lvb_len);
3430		if (rc >= 0) {
3431			*gen = LL_LAYOUT_GEN_EMPTY;
3432			if (md.lsm != NULL)
3433				*gen = md.lsm->lsm_layout_gen;
3434			rc = 0;
3435		} else {
3436			CERROR("%s: file "DFID" unpackmd error: %d\n",
3437				ll_get_fsname(inode->i_sb, NULL, 0),
3438				PFID(&lli->lli_fid), rc);
3439		}
3440	}
3441	if (rc < 0)
3442		GOTO(out, rc);
3443
3444	/* set the layout for the file. This is unlikely to fail as the old
3445	 * layout has surely been eliminated */
3446	memset(&conf, 0, sizeof(conf));
3447	conf.coc_opc = OBJECT_CONF_SET;
3448	conf.coc_inode = inode;
3449	conf.coc_lock = lock;
3450	conf.u.coc_md = &md;
3451	rc = ll_layout_conf(inode, &conf);
3452
3453	if (md.lsm != NULL)
3454		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3455
3456	/* refresh layout failed, need to wait */
3457	wait_layout = rc == -EBUSY;
3458
3459out:
3460	LDLM_LOCK_PUT(lock);
3461	ldlm_lock_decref(lockh, mode);
3462
3463	/* wait for IO to complete if it's still being used. */
3464	if (wait_layout) {
3465		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3466			ll_get_fsname(inode->i_sb, NULL, 0),
3467			inode, PFID(&lli->lli_fid));
3468
3469		memset(&conf, 0, sizeof(conf));
3470		conf.coc_opc = OBJECT_CONF_WAIT;
3471		conf.coc_inode = inode;
3472		rc = ll_layout_conf(inode, &conf);
3473		if (rc == 0)
3474			rc = -EAGAIN;
3475
3476		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3477			PFID(&lli->lli_fid), rc);
3478	}
3479	return rc;
3480}
3481
3482/**
3483 * This function checks whether a LAYOUT lock exists on the client side,
3484 * and enqueues one if none is cached.
3485 *
3486 * This function does not hold the layout lock, so the lock may be revoked any
3487 * time after this function returns. Any operation that depends on the layout
3488 * should be redone in that case.
3489 *
3490 * This function should be called before lov_io_init() to get an up-to-date
3491 * layout version; the caller should save the version number and, after IO
3492 * is finished, call this function again to verify that the layout has not
3493 * changed during the IO.
3494 */
3495int ll_layout_refresh(struct inode *inode, __u32 *gen)
3496{
3497	struct ll_inode_info  *lli = ll_i2info(inode);
3498	struct ll_sb_info     *sbi = ll_i2sbi(inode);
3499	struct md_op_data     *op_data;
3500	struct lookup_intent   it;
3501	struct lustre_handle   lockh;
3502	ldlm_mode_t	       mode;
3503	struct ldlm_enqueue_info einfo = {
3504		.ei_type = LDLM_IBITS,
3505		.ei_mode = LCK_CR,
3506		.ei_cb_bl = ll_md_blocking_ast,
3507		.ei_cb_cp = ldlm_completion_ast,
3508	};
3509	int rc;
3510
3511	*gen = lli->lli_layout_gen;
3512	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3513		return 0;
3514
3515	/* sanity checks */
3516	LASSERT(fid_is_sane(ll_inode2fid(inode)));
3517	LASSERT(S_ISREG(inode->i_mode));
3518
3519	/* the layout lock is mostly cached on the local side, so try to match
3520	 * it before grabbing the layout lock mutex. */
3521	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3522			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3523	if (mode != 0) { /* hit cached lock */
3524		rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3525		if (rc == 0)
3526			return 0;
3527
3528		/* better to hold lli_layout_mutex and try again, otherwise
3529		 * there will be a starvation problem. */
3530	}
3531
3532	/* take layout lock mutex to enqueue layout lock exclusively. */
3533	mutex_lock(&lli->lli_layout_mutex);
3534
3535again:
3536	/* try again. Maybe somebody else has done this. */
3537	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3538			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3539	if (mode != 0) { /* hit cached lock */
3540		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3541		if (rc == -EAGAIN)
3542			goto again;
3543
3544		mutex_unlock(&lli->lli_layout_mutex);
3545		return rc;
3546	}
3547
3548	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3549			0, 0, LUSTRE_OPC_ANY, NULL);
3550	if (IS_ERR(op_data)) {
3551		mutex_unlock(&lli->lli_layout_mutex);
3552		return PTR_ERR(op_data);
3553	}
3554
3555	/* have to enqueue one */
3556	memset(&it, 0, sizeof(it));
3557	it.it_op = IT_LAYOUT;
3558	lockh.cookie = 0ULL;
3559
3560	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3561			ll_get_fsname(inode->i_sb, NULL, 0), inode,
3562			PFID(&lli->lli_fid));
3563
3564	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3565			NULL, 0, NULL, 0);
3566	if (it.d.lustre.it_data != NULL)
3567		ptlrpc_req_finished(it.d.lustre.it_data);
3568	it.d.lustre.it_data = NULL;
3569
3570	ll_finish_md_op_data(op_data);
3571
3572	mode = it.d.lustre.it_lock_mode;
3573	it.d.lustre.it_lock_mode = 0;
3574	ll_intent_drop_lock(&it);
3575
3576	if (rc == 0) {
3577		/* set lock data in case this is a new lock */
3578		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3579		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3580		if (rc == -EAGAIN)
3581			goto again;
3582	}
3583	mutex_unlock(&lli->lli_layout_mutex);
3584
3585	return rc;
3586}
3587
3588/**
3589 * This function sends a restore request to the MDT.
3590 */
3591int ll_layout_restore(struct inode *inode)
3592{
3593	struct hsm_user_request	*hur;
3594	int			 len, rc;
3595
3596	len = sizeof(struct hsm_user_request) +
3597	      sizeof(struct hsm_user_item);
3598	OBD_ALLOC(hur, len);
3599	if (hur == NULL)
3600		return -ENOMEM;
3601
3602	hur->hur_request.hr_action = HUA_RESTORE;
3603	hur->hur_request.hr_archive_id = 0;
3604	hur->hur_request.hr_flags = 0;
3605	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3606	       sizeof(hur->hur_user_item[0].hui_fid));
3607	hur->hur_user_item[0].hui_extent.length = -1;
3608	hur->hur_request.hr_itemcount = 1;
3609	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3610			   len, hur, NULL);
3611	OBD_FREE(hur, len);
3612	return rc;
3613}
3614