/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/llite/file.c
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include "../include/lustre_dlm.h"
45#include "../include/lustre_lite.h"
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include "../include/lustre/ll_fiemap.h"
50
51#include "../include/cl_object.h"
52
53static int
54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55
56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57			  bool *lease_broken);
58
59static enum llioc_iter
60ll_iocontrol_call(struct inode *inode, struct file *file,
61		  unsigned int cmd, unsigned long arg, int *rcp);
62
63static struct ll_file_data *ll_file_data_get(void)
64{
65	struct ll_file_data *fd;
66
67	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
68	if (fd == NULL)
69		return NULL;
70	fd->fd_write_failed = false;
71	return fd;
72}
73
74static void ll_file_data_put(struct ll_file_data *fd)
75{
76	if (fd != NULL)
77		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
78}
79
80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81			  struct lustre_handle *fh)
82{
83	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84	op_data->op_attr.ia_mode = inode->i_mode;
85	op_data->op_attr.ia_atime = inode->i_atime;
86	op_data->op_attr.ia_mtime = inode->i_mtime;
87	op_data->op_attr.ia_ctime = inode->i_ctime;
88	op_data->op_attr.ia_size = i_size_read(inode);
89	op_data->op_attr_blocks = inode->i_blocks;
90	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91					ll_inode_to_ext_flags(inode->i_flags);
92	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
93	if (fh)
94		op_data->op_handle = *fh;
95	op_data->op_capa1 = ll_mdscapa_get(inode);
96
97	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
98		op_data->op_bias |= MDS_DATA_MODIFIED;
99}
100
101/**
102 * Closes the IO epoch and packs all the attributes into @op_data for
103 * the CLOSE rpc.
104 */
105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106			     struct obd_client_handle *och)
107{
108	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109					ATTR_MTIME | ATTR_MTIME_SET |
110					ATTR_CTIME | ATTR_CTIME_SET;
111
112	if (!(och->och_flags & FMODE_WRITE))
113		goto out;
114
115	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
117	else
118		ll_ioepoch_close(inode, op_data, &och, 0);
119
120out:
121	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122	ll_prep_md_op_data(op_data, inode, NULL, NULL,
123			   0, 0, LUSTRE_OPC_ANY, NULL);
124}
125
126static int ll_close_inode_openhandle(struct obd_export *md_exp,
127				     struct inode *inode,
128				     struct obd_client_handle *och,
129				     const __u64 *data_version)
130{
131	struct obd_export *exp = ll_i2mdexp(inode);
132	struct md_op_data *op_data;
133	struct ptlrpc_request *req = NULL;
134	struct obd_device *obd = class_exp2obd(exp);
135	int epoch_close = 1;
136	int rc;
137
138	if (obd == NULL) {
139		/*
140		 * XXX: in case of LMV, is this correct to access
141		 * ->exp_handle?
142		 */
143		CERROR("Invalid MDC connection handle %#llx\n",
144		       ll_i2mdexp(inode)->exp_handle.h_cookie);
145		rc = 0;
146		goto out;
147	}
148
149	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
150	if (!op_data) {
151		/* XXX We leak openhandle and request here. */
152		rc = -ENOMEM;
153		goto out;
154	}
155
156	ll_prepare_close(inode, op_data, och);
157	if (data_version != NULL) {
158		/* Pass in data_version implies release. */
159		op_data->op_bias |= MDS_HSM_RELEASE;
160		op_data->op_data_version = *data_version;
161		op_data->op_lease_handle = och->och_lease_handle;
162		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163	}
164	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
165	rc = md_close(md_exp, op_data, och->och_mod, &req);
166	if (rc == -EAGAIN) {
167		/* This close must have the epoch closed. */
168		LASSERT(epoch_close);
169		/* MDS has instructed us to obtain Size-on-MDS attribute from
170		 * OSTs and send setattr to back to MDS. */
171		rc = ll_som_update(inode, op_data);
172		if (rc) {
173			CERROR("inode %lu mdc Size-on-MDS update failed: "
174			       "rc = %d\n", inode->i_ino, rc);
175			rc = 0;
176		}
177	} else if (rc) {
178		CERROR("inode %lu mdc close failed: rc = %d\n",
179		       inode->i_ino, rc);
180	}
181
182	/* DATA_MODIFIED flag was successfully sent on close, cancel data
183	 * modification flag. */
184	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
185		struct ll_inode_info *lli = ll_i2info(inode);
186
187		spin_lock(&lli->lli_lock);
188		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
189		spin_unlock(&lli->lli_lock);
190	}
191
192	if (rc == 0) {
193		rc = ll_objects_destroy(req, inode);
194		if (rc)
195			CERROR("inode %lu ll_objects destroy: rc = %d\n",
196			       inode->i_ino, rc);
197	}
198	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
199		struct mdt_body *body;
200		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
201		if (!(body->valid & OBD_MD_FLRELEASED))
202			rc = -EBUSY;
203	}
204
205	ll_finish_md_op_data(op_data);
206
207out:
208	if (exp_connect_som(exp) && !epoch_close &&
209	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
211	} else {
212		md_clear_open_replay_data(md_exp, och);
213		/* Free @och if it is not waiting for DONE_WRITING. */
214		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
215		OBD_FREE_PTR(och);
216	}
217	if (req) /* This is close request */
218		ptlrpc_req_finished(req);
219	return rc;
220}
221
222int ll_md_real_close(struct inode *inode, fmode_t fmode)
223{
224	struct ll_inode_info *lli = ll_i2info(inode);
225	struct obd_client_handle **och_p;
226	struct obd_client_handle *och;
227	__u64 *och_usecount;
228	int rc = 0;
229
230	if (fmode & FMODE_WRITE) {
231		och_p = &lli->lli_mds_write_och;
232		och_usecount = &lli->lli_open_fd_write_count;
233	} else if (fmode & FMODE_EXEC) {
234		och_p = &lli->lli_mds_exec_och;
235		och_usecount = &lli->lli_open_fd_exec_count;
236	} else {
237		LASSERT(fmode & FMODE_READ);
238		och_p = &lli->lli_mds_read_och;
239		och_usecount = &lli->lli_open_fd_read_count;
240	}
241
242	mutex_lock(&lli->lli_och_mutex);
243	if (*och_usecount > 0) {
244		/* There are still users of this handle, so skip
245		 * freeing it. */
246		mutex_unlock(&lli->lli_och_mutex);
247		return 0;
248	}
249
250	och=*och_p;
251	*och_p = NULL;
252	mutex_unlock(&lli->lli_och_mutex);
253
254	if (och != NULL) {
255		/* There might be a race and this handle may already
256		   be closed. */
257		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
258					       inode, och, NULL);
259	}
260
261	return rc;
262}
263
264static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
265		       struct file *file)
266{
267	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268	struct ll_inode_info *lli = ll_i2info(inode);
269	int rc = 0;
270
271	/* clear group lock, if present */
272	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
273		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
274
275	if (fd->fd_lease_och != NULL) {
276		bool lease_broken;
277
278		/* Usually the lease is not released when the
279		 * application crashed, we need to release here. */
280		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
281		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
282			PFID(&lli->lli_fid), rc, lease_broken);
283
284		fd->fd_lease_och = NULL;
285	}
286
287	if (fd->fd_och != NULL) {
288		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
289		fd->fd_och = NULL;
290		goto out;
291	}
292
293	/* Let's see if we have good enough OPEN lock on the file and if
294	   we can skip talking to MDS */
295	if (file->f_dentry->d_inode) { /* Can this ever be false? */
296		int lockmode;
297		__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
298		struct lustre_handle lockh;
299		struct inode *inode = file->f_dentry->d_inode;
300		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
301
302		mutex_lock(&lli->lli_och_mutex);
303		if (fd->fd_omode & FMODE_WRITE) {
304			lockmode = LCK_CW;
305			LASSERT(lli->lli_open_fd_write_count);
306			lli->lli_open_fd_write_count--;
307		} else if (fd->fd_omode & FMODE_EXEC) {
308			lockmode = LCK_PR;
309			LASSERT(lli->lli_open_fd_exec_count);
310			lli->lli_open_fd_exec_count--;
311		} else {
312			lockmode = LCK_CR;
313			LASSERT(lli->lli_open_fd_read_count);
314			lli->lli_open_fd_read_count--;
315		}
316		mutex_unlock(&lli->lli_och_mutex);
317
318		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
319				   LDLM_IBITS, &policy, lockmode,
320				   &lockh)) {
321			rc = ll_md_real_close(file->f_dentry->d_inode,
322					      fd->fd_omode);
323		}
324	} else {
325		CERROR("Releasing a file %p with negative dentry %p. Name %s",
326		       file, file->f_dentry, file->f_dentry->d_name.name);
327	}
328
329out:
330	LUSTRE_FPRIVATE(file) = NULL;
331	ll_file_data_put(fd);
332	ll_capa_close(inode);
333
334	return rc;
335}
336
337/* While this returns an error code, fput() the caller does not, so we need
338 * to make every effort to clean up all of our state here.  Also, applications
339 * rarely check close errors and even if an error is returned they will not
340 * re-try the close call.
341 */
342int ll_file_release(struct inode *inode, struct file *file)
343{
344	struct ll_file_data *fd;
345	struct ll_sb_info *sbi = ll_i2sbi(inode);
346	struct ll_inode_info *lli = ll_i2info(inode);
347	int rc;
348
349	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
350	       inode->i_generation, inode);
351
352#ifdef CONFIG_FS_POSIX_ACL
353	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
354	    inode == inode->i_sb->s_root->d_inode) {
355		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
356
357		LASSERT(fd != NULL);
358		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
359			fd->fd_flags &= ~LL_FILE_RMTACL;
360			rct_del(&sbi->ll_rct, current_pid());
361			et_search_free(&sbi->ll_et, current_pid());
362		}
363	}
364#endif
365
366	if (inode->i_sb->s_root != file->f_dentry)
367		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
368	fd = LUSTRE_FPRIVATE(file);
369	LASSERT(fd != NULL);
370
371	/* The last ref on @file, maybe not the the owner pid of statahead.
372	 * Different processes can open the same dir, "ll_opendir_key" means:
373	 * it is me that should stop the statahead thread. */
374	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
375	    lli->lli_opendir_pid != 0)
376		ll_stop_statahead(inode, lli->lli_opendir_key);
377
378	if (inode->i_sb->s_root == file->f_dentry) {
379		LUSTRE_FPRIVATE(file) = NULL;
380		ll_file_data_put(fd);
381		return 0;
382	}
383
384	if (!S_ISDIR(inode->i_mode)) {
385		lov_read_and_clear_async_rc(lli->lli_clob);
386		lli->lli_async_rc = 0;
387	}
388
389	rc = ll_md_close(sbi->ll_md_exp, inode, file);
390
391	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
392		libcfs_debug_dumplog();
393
394	return rc;
395}
396
397static int ll_intent_file_open(struct file *file, void *lmm,
398			       int lmmsize, struct lookup_intent *itp)
399{
400	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
401	struct dentry *parent = file->f_dentry->d_parent;
402	const char *name = file->f_dentry->d_name.name;
403	const int len = file->f_dentry->d_name.len;
404	struct md_op_data *op_data;
405	struct ptlrpc_request *req;
406	__u32 opc = LUSTRE_OPC_ANY;
407	int rc;
408
409	if (!parent)
410		return -ENOENT;
411
412	/* Usually we come here only for NFSD, and we want open lock.
413	   But we can also get here with pre 2.6.15 patchless kernels, and in
414	   that case that lock is also ok */
415	/* We can also get here if there was cached open handle in revalidate_it
416	 * but it disappeared while we were getting from there to ll_file_open.
417	 * But this means this file was closed and immediately opened which
418	 * makes a good candidate for using OPEN lock */
419	/* If lmmsize & lmm are not 0, we are just setting stripe info
420	 * parameters. No need for the open lock */
421	if (lmm == NULL && lmmsize == 0) {
422		itp->it_flags |= MDS_OPEN_LOCK;
423		if (itp->it_flags & FMODE_WRITE)
424			opc = LUSTRE_OPC_CREATE;
425	}
426
427	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
428				      file->f_dentry->d_inode, name, len,
429				      O_RDWR, opc, NULL);
430	if (IS_ERR(op_data))
431		return PTR_ERR(op_data);
432
433	itp->it_flags |= MDS_OPEN_BY_FID;
434	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
435			    0 /*unused */, &req, ll_md_blocking_ast, 0);
436	ll_finish_md_op_data(op_data);
437	if (rc == -ESTALE) {
438		/* reason for keep own exit path - don`t flood log
439		* with messages with -ESTALE errors.
440		*/
441		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
442		     it_open_error(DISP_OPEN_OPEN, itp))
443			goto out;
444		ll_release_openhandle(file->f_dentry, itp);
445		goto out;
446	}
447
448	if (it_disposition(itp, DISP_LOOKUP_NEG)) {
449		rc = -ENOENT;
450		goto out;
451	}
452
453	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
454		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
455		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
456		goto out;
457	}
458
459	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
460	if (!rc && itp->d.lustre.it_lock_mode)
461		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
462				 itp, NULL);
463
464out:
465	ptlrpc_req_finished(req);
466	ll_intent_drop_lock(itp);
467
468	return rc;
469}
470
471/**
472 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
473 * not believe attributes if a few ioepoch holders exist. Attributes for
474 * previous ioepoch if new one is opened are also skipped by MDS.
475 */
476void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
477{
478	if (ioepoch && lli->lli_ioepoch != ioepoch) {
479		lli->lli_ioepoch = ioepoch;
480		CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
481		       ioepoch, PFID(&lli->lli_fid));
482	}
483}
484
485static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
486		       struct obd_client_handle *och)
487{
488	struct ptlrpc_request *req = it->d.lustre.it_data;
489	struct mdt_body *body;
490
491	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
492	och->och_fh = body->handle;
493	och->och_fid = body->fid1;
494	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
495	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
496	och->och_flags = it->it_flags;
497
498	return md_set_open_replay_data(md_exp, och, it);
499}
500
501static int ll_local_open(struct file *file, struct lookup_intent *it,
502			 struct ll_file_data *fd, struct obd_client_handle *och)
503{
504	struct inode *inode = file->f_dentry->d_inode;
505	struct ll_inode_info *lli = ll_i2info(inode);
506
507	LASSERT(!LUSTRE_FPRIVATE(file));
508
509	LASSERT(fd != NULL);
510
511	if (och) {
512		struct ptlrpc_request *req = it->d.lustre.it_data;
513		struct mdt_body *body;
514		int rc;
515
516		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
517		if (rc != 0)
518			return rc;
519
520		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
521		ll_ioepoch_open(lli, body->ioepoch);
522	}
523
524	LUSTRE_FPRIVATE(file) = fd;
525	ll_readahead_init(inode, &fd->fd_ras);
526	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
527	return 0;
528}
529
530/* Open a file, and (for the very first open) create objects on the OSTs at
531 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
532 * creation or open until ll_lov_setstripe() ioctl is called.
533 *
534 * If we already have the stripe MD locally then we don't request it in
535 * md_open(), by passing a lmm_size = 0.
536 *
537 * It is up to the application to ensure no other processes open this file
538 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
539 * used.  We might be able to avoid races of that sort by getting lli_open_sem
540 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
541 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
542 */
543int ll_file_open(struct inode *inode, struct file *file)
544{
545	struct ll_inode_info *lli = ll_i2info(inode);
546	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
547					  .it_flags = file->f_flags };
548	struct obd_client_handle **och_p = NULL;
549	__u64 *och_usecount = NULL;
550	struct ll_file_data *fd;
551	int rc = 0, opendir_set = 0;
552
553	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
554	       inode->i_generation, inode, file->f_flags);
555
556	it = file->private_data; /* XXX: compat macro */
557	file->private_data = NULL; /* prevent ll_local_open assertion */
558
559	fd = ll_file_data_get();
560	if (fd == NULL) {
561		rc = -ENOMEM;
562		goto out_openerr;
563	}
564
565	fd->fd_file = file;
566	if (S_ISDIR(inode->i_mode)) {
567		spin_lock(&lli->lli_sa_lock);
568		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
569		    lli->lli_opendir_pid == 0) {
570			lli->lli_opendir_key = fd;
571			lli->lli_opendir_pid = current_pid();
572			opendir_set = 1;
573		}
574		spin_unlock(&lli->lli_sa_lock);
575	}
576
577	if (inode->i_sb->s_root == file->f_dentry) {
578		LUSTRE_FPRIVATE(file) = fd;
579		return 0;
580	}
581
582	if (!it || !it->d.lustre.it_disposition) {
583		/* Convert f_flags into access mode. We cannot use file->f_mode,
584		 * because everything but O_ACCMODE mask was stripped from
585		 * there */
586		if ((oit.it_flags + 1) & O_ACCMODE)
587			oit.it_flags++;
588		if (file->f_flags & O_TRUNC)
589			oit.it_flags |= FMODE_WRITE;
590
591		/* kernel only call f_op->open in dentry_open.  filp_open calls
592		 * dentry_open after call to open_namei that checks permissions.
593		 * Only nfsd_open call dentry_open directly without checking
594		 * permissions and because of that this code below is safe. */
595		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
596			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
597
598		/* We do not want O_EXCL here, presumably we opened the file
599		 * already? XXX - NFS implications? */
600		oit.it_flags &= ~O_EXCL;
601
602		/* bug20584, if "it_flags" contains O_CREAT, the file will be
603		 * created if necessary, then "IT_CREAT" should be set to keep
604		 * consistent with it */
605		if (oit.it_flags & O_CREAT)
606			oit.it_op |= IT_CREAT;
607
608		it = &oit;
609	}
610
611restart:
612	/* Let's see if we have file open on MDS already. */
613	if (it->it_flags & FMODE_WRITE) {
614		och_p = &lli->lli_mds_write_och;
615		och_usecount = &lli->lli_open_fd_write_count;
616	} else if (it->it_flags & FMODE_EXEC) {
617		och_p = &lli->lli_mds_exec_och;
618		och_usecount = &lli->lli_open_fd_exec_count;
619	 } else {
620		och_p = &lli->lli_mds_read_och;
621		och_usecount = &lli->lli_open_fd_read_count;
622	}
623
624	mutex_lock(&lli->lli_och_mutex);
625	if (*och_p) { /* Open handle is present */
626		if (it_disposition(it, DISP_OPEN_OPEN)) {
627			/* Well, there's extra open request that we do not need,
628			   let's close it somehow. This will decref request. */
629			rc = it_open_error(DISP_OPEN_OPEN, it);
630			if (rc) {
631				mutex_unlock(&lli->lli_och_mutex);
632				goto out_openerr;
633			}
634
635			ll_release_openhandle(file->f_dentry, it);
636		}
637		(*och_usecount)++;
638
639		rc = ll_local_open(file, it, fd, NULL);
640		if (rc) {
641			(*och_usecount)--;
642			mutex_unlock(&lli->lli_och_mutex);
643			goto out_openerr;
644		}
645	} else {
646		LASSERT(*och_usecount == 0);
647		if (!it->d.lustre.it_disposition) {
648			/* We cannot just request lock handle now, new ELC code
649			   means that one of other OPEN locks for this file
650			   could be cancelled, and since blocking ast handler
651			   would attempt to grab och_mutex as well, that would
652			   result in a deadlock */
653			mutex_unlock(&lli->lli_och_mutex);
654			it->it_create_mode |= M_CHECK_STALE;
655			rc = ll_intent_file_open(file, NULL, 0, it);
656			it->it_create_mode &= ~M_CHECK_STALE;
657			if (rc)
658				goto out_openerr;
659
660			goto restart;
661		}
662		*och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
663		if (!*och_p) {
664			rc = -ENOMEM;
665			goto out_och_free;
666		}
667
668		(*och_usecount)++;
669
670		/* md_intent_lock() didn't get a request ref if there was an
671		 * open error, so don't do cleanup on the request here
672		 * (bug 3430) */
673		/* XXX (green): Should not we bail out on any error here, not
674		 * just open error? */
675		rc = it_open_error(DISP_OPEN_OPEN, it);
676		if (rc)
677			goto out_och_free;
678
679		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
680
681		rc = ll_local_open(file, it, fd, *och_p);
682		if (rc)
683			goto out_och_free;
684	}
685	mutex_unlock(&lli->lli_och_mutex);
686	fd = NULL;
687
688	/* Must do this outside lli_och_mutex lock to prevent deadlock where
689	   different kind of OPEN lock for this same inode gets cancelled
690	   by ldlm_cancel_lru */
691	if (!S_ISREG(inode->i_mode))
692		goto out_och_free;
693
694	ll_capa_open(inode);
695
696	if (!lli->lli_has_smd &&
697	    (cl_is_lov_delay_create(file->f_flags) ||
698	     (file->f_mode & FMODE_WRITE) == 0)) {
699		CDEBUG(D_INODE, "object creation was delayed\n");
700		goto out_och_free;
701	}
702	cl_lov_delay_create_clear(&file->f_flags);
703	goto out_och_free;
704
705out_och_free:
706	if (rc) {
707		if (och_p && *och_p) {
708			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
709			*och_p = NULL; /* OBD_FREE writes some magic there */
710			(*och_usecount)--;
711		}
712		mutex_unlock(&lli->lli_och_mutex);
713
714out_openerr:
715		if (opendir_set != 0)
716			ll_stop_statahead(inode, lli->lli_opendir_key);
717		if (fd != NULL)
718			ll_file_data_put(fd);
719	} else {
720		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
721	}
722
723	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
724		ptlrpc_req_finished(it->d.lustre.it_data);
725		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
726	}
727
728	return rc;
729}
730
731static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
732			struct ldlm_lock_desc *desc, void *data, int flag)
733{
734	int rc;
735	struct lustre_handle lockh;
736
737	switch (flag) {
738	case LDLM_CB_BLOCKING:
739		ldlm_lock2handle(lock, &lockh);
740		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
741		if (rc < 0) {
742			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
743			return rc;
744		}
745		break;
746	case LDLM_CB_CANCELING:
747		/* do nothing */
748		break;
749	}
750	return 0;
751}
752
753/**
754 * Acquire a lease and open the file.
755 */
756static struct obd_client_handle *
757ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
758	      __u64 open_flags)
759{
760	struct lookup_intent it = { .it_op = IT_OPEN };
761	struct ll_sb_info *sbi = ll_i2sbi(inode);
762	struct md_op_data *op_data;
763	struct ptlrpc_request *req;
764	struct lustre_handle old_handle = { 0 };
765	struct obd_client_handle *och = NULL;
766	int rc;
767	int rc2;
768
769	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
770		return ERR_PTR(-EINVAL);
771
772	if (file != NULL) {
773		struct ll_inode_info *lli = ll_i2info(inode);
774		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
775		struct obd_client_handle **och_p;
776		__u64 *och_usecount;
777
778		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
779			return ERR_PTR(-EPERM);
780
781		/* Get the openhandle of the file */
782		rc = -EBUSY;
783		mutex_lock(&lli->lli_och_mutex);
784		if (fd->fd_lease_och != NULL) {
785			mutex_unlock(&lli->lli_och_mutex);
786			return ERR_PTR(rc);
787		}
788
789		if (fd->fd_och == NULL) {
790			if (file->f_mode & FMODE_WRITE) {
791				LASSERT(lli->lli_mds_write_och != NULL);
792				och_p = &lli->lli_mds_write_och;
793				och_usecount = &lli->lli_open_fd_write_count;
794			} else {
795				LASSERT(lli->lli_mds_read_och != NULL);
796				och_p = &lli->lli_mds_read_och;
797				och_usecount = &lli->lli_open_fd_read_count;
798			}
799			if (*och_usecount == 1) {
800				fd->fd_och = *och_p;
801				*och_p = NULL;
802				*och_usecount = 0;
803				rc = 0;
804			}
805		}
806		mutex_unlock(&lli->lli_och_mutex);
807		if (rc < 0) /* more than 1 opener */
808			return ERR_PTR(rc);
809
810		LASSERT(fd->fd_och != NULL);
811		old_handle = fd->fd_och->och_fh;
812	}
813
814	och = kzalloc(sizeof(*och), GFP_NOFS);
815	if (!och)
816		return ERR_PTR(-ENOMEM);
817
818	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
819					LUSTRE_OPC_ANY, NULL);
820	if (IS_ERR(op_data)) {
821		rc = PTR_ERR(op_data);
822		goto out;
823	}
824
825	/* To tell the MDT this openhandle is from the same owner */
826	op_data->op_handle = old_handle;
827
828	it.it_flags = fmode | open_flags;
829	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
830	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
831				ll_md_blocking_lease_ast,
832	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
833	 * it can be cancelled which may mislead applications that the lease is
834	 * broken;
835	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
836	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
837	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
838				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
839	ll_finish_md_op_data(op_data);
840	ptlrpc_req_finished(req);
841	if (rc < 0)
842		goto out_release_it;
843
844	if (it_disposition(&it, DISP_LOOKUP_NEG)) {
845		rc = -ENOENT;
846		goto out_release_it;
847	}
848
849	rc = it_open_error(DISP_OPEN_OPEN, &it);
850	if (rc)
851		goto out_release_it;
852
853	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
854	ll_och_fill(sbi->ll_md_exp, &it, och);
855
856	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
857		rc = -EOPNOTSUPP;
858		goto out_close;
859	}
860
861	/* already get lease, handle lease lock */
862	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
863	if (it.d.lustre.it_lock_mode == 0 ||
864	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
865		/* open lock must return for lease */
866		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
867			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
868			it.d.lustre.it_lock_bits);
869		rc = -EPROTO;
870		goto out_close;
871	}
872
873	ll_intent_release(&it);
874	return och;
875
876out_close:
877	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
878	if (rc2)
879		CERROR("Close openhandle returned %d\n", rc2);
880
881	/* cancel open lock */
882	if (it.d.lustre.it_lock_mode != 0) {
883		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
884						it.d.lustre.it_lock_mode);
885		it.d.lustre.it_lock_mode = 0;
886	}
887out_release_it:
888	ll_intent_release(&it);
889out:
890	OBD_FREE_PTR(och);
891	return ERR_PTR(rc);
892}
893
894/**
895 * Release lease and close the file.
896 * It will check if the lease has ever broken.
897 */
898static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
899			  bool *lease_broken)
900{
901	struct ldlm_lock *lock;
902	bool cancelled = true;
903	int rc;
904
905	lock = ldlm_handle2lock(&och->och_lease_handle);
906	if (lock != NULL) {
907		lock_res_and_lock(lock);
908		cancelled = ldlm_is_cancel(lock);
909		unlock_res_and_lock(lock);
910		ldlm_lock_put(lock);
911	}
912
913	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
914		PFID(&ll_i2info(inode)->lli_fid), cancelled);
915
916	if (!cancelled)
917		ldlm_cli_cancel(&och->och_lease_handle, 0);
918	if (lease_broken != NULL)
919		*lease_broken = cancelled;
920
921	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
922				       NULL);
923	return rc;
924}
925
926/* Fills the obdo with the attributes for the lsm */
927static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
928			  struct obd_capa *capa, struct obdo *obdo,
929			  __u64 ioepoch, int sync)
930{
931	struct ptlrpc_request_set *set;
932	struct obd_info	    oinfo = { { { 0 } } };
933	int			rc;
934
935	LASSERT(lsm != NULL);
936
937	oinfo.oi_md = lsm;
938	oinfo.oi_oa = obdo;
939	oinfo.oi_oa->o_oi = lsm->lsm_oi;
940	oinfo.oi_oa->o_mode = S_IFREG;
941	oinfo.oi_oa->o_ioepoch = ioepoch;
942	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
943			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
944			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
945			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
946			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
947			       OBD_MD_FLDATAVERSION;
948	oinfo.oi_capa = capa;
949	if (sync) {
950		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
951		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
952	}
953
954	set = ptlrpc_prep_set();
955	if (set == NULL) {
956		CERROR("can't allocate ptlrpc set\n");
957		rc = -ENOMEM;
958	} else {
959		rc = obd_getattr_async(exp, &oinfo, set);
960		if (rc == 0)
961			rc = ptlrpc_set_wait(set);
962		ptlrpc_set_destroy(set);
963	}
964	if (rc == 0)
965		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
966					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
967					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
968					 OBD_MD_FLDATAVERSION);
969	return rc;
970}
971
972/**
973  * Performs the getattr on the inode and updates its fields.
974  * If @sync != 0, perform the getattr under the server-side lock.
975  */
976int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
977		     __u64 ioepoch, int sync)
978{
979	struct obd_capa      *capa = ll_mdscapa_get(inode);
980	struct lov_stripe_md *lsm;
981	int rc;
982
983	lsm = ccc_inode_lsm_get(inode);
984	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
985			    capa, obdo, ioepoch, sync);
986	capa_put(capa);
987	if (rc == 0) {
988		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
989
990		obdo_refresh_inode(inode, obdo, obdo->o_valid);
991		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
992		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
993		       (unsigned long long)inode->i_blocks,
994		       1UL << inode->i_blkbits);
995	}
996	ccc_inode_lsm_put(inode, lsm);
997	return rc;
998}
999
/* Merge the client's cached attributes for @inode: take the MDS-provided
 * timestamps cached in lli_lvb, then fold in whatever the OSTs currently
 * report via cl_object_attr_get(), all under the inode size lock. */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;

	/* Snapshot the inode's current view before consulting the OSTs. */
	lvb.lvb_size = i_size_read(inode);
	lvb.lvb_blocks = inode->i_blocks;
	lvb.lvb_mtime = LTIME_S(inode->i_mtime);
	lvb.lvb_atime = LTIME_S(inode->i_atime);
	lvb.lvb_ctime = LTIME_S(inode->i_ctime);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* Keep the newest timestamp from either source. */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
				PFID(&lli->lli_fid), attr->cat_size);
		/* Size lock is already held, so the nolock variant is safe. */
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
1047
1048int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1049		     lstat_t *st)
1050{
1051	struct obdo obdo = { 0 };
1052	int rc;
1053
1054	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1055	if (rc == 0) {
1056		st->st_size   = obdo.o_size;
1057		st->st_blocks = obdo.o_blocks;
1058		st->st_mtime  = obdo.o_mtime;
1059		st->st_atime  = obdo.o_atime;
1060		st->st_ctime  = obdo.o_ctime;
1061	}
1062	return rc;
1063}
1064
1065static bool file_is_noatime(const struct file *file)
1066{
1067	const struct vfsmount *mnt = file->f_path.mnt;
1068	const struct inode *inode = file->f_path.dentry->d_inode;
1069
1070	/* Adapted from file_accessed() and touch_atime().*/
1071	if (file->f_flags & O_NOATIME)
1072		return true;
1073
1074	if (inode->i_flags & S_NOATIME)
1075		return true;
1076
1077	if (IS_NOATIME(inode))
1078		return true;
1079
1080	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1081		return true;
1082
1083	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1084		return true;
1085
1086	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1087		return true;
1088
1089	return false;
1090}
1091
1092void ll_io_init(struct cl_io *io, const struct file *file, int write)
1093{
1094	struct inode *inode = file->f_dentry->d_inode;
1095
1096	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1097	if (write) {
1098		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1099		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1100				      file->f_flags & O_DIRECT ||
1101				      IS_SYNC(inode);
1102	}
1103	io->ci_obj     = ll_i2info(inode)->lli_clob;
1104	io->ci_lockreq = CILR_MAYBE;
1105	if (ll_file_nolock(file)) {
1106		io->ci_lockreq = CILR_NEVER;
1107		io->ci_no_srvlock = 1;
1108	} else if (file->f_flags & O_APPEND) {
1109		io->ci_lockreq = CILR_MANDATORY;
1110	}
1111
1112	io->ci_noatime = file_is_noatime(file);
1113}
1114
/* Common engine for read/write/splice: build a cl_io for @file, take the
 * required client-side locks, run the cl_io loop and account statistics.
 *
 * @iot    CIT_READ or CIT_WRITE
 * @ppos   in/out file position
 * @count  number of bytes requested
 *
 * Returns bytes transferred, or a negative errno.
 */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
	struct cl_io	 *io;
	ssize_t	       result;

restart:
	io = ccc_env_thread_io(env);
	ll_io_init(io, file, iot == CIT_WRITE);

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		struct vvp_io *vio = vvp_env_io(env);
		struct ccc_io *cio = ccc_env_io(env);
		int write_mutex_locked = 0;

		cio->cui_fd  = LUSTRE_FPRIVATE(file);
		vio->cui_io_subtype = args->via_io_subtype;

		switch (vio->cui_io_subtype) {
		case IO_NORMAL:
			cio->cui_iter = args->u.normal.via_iter;
			cio->cui_iocb = args->u.normal.via_iocb;
			/* Writers serialize on lli_write_mutex unless this
			 * fd holds a group lock; readers only take the
			 * truncate semaphore (shared). */
			if ((iot == CIT_WRITE) &&
			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				if (mutex_lock_interruptible(&lli->
							       lli_write_mutex)) {
					result = -ERESTARTSYS;
					goto out;
				}
				write_mutex_locked = 1;
			} else if (iot == CIT_READ) {
				down_read(&lli->lli_trunc_sem);
			}
			break;
		case IO_SPLICE:
			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
			vio->u.splice.cui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
			LBUG();
		}
		result = cl_io_loop(env, io);
		if (write_mutex_locked)
			mutex_unlock(&lli->lli_write_mutex);
		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
			up_read(&lli->lli_trunc_sem);
	} else {
		/* cl_io_rw_init() handled IO */
		result = io->ci_result;
	}

	/* Bytes actually moved take precedence over any error code. */
	if (io->ci_nob > 0) {
		result = io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos;
	}
	goto out;
out:
	cl_io_fini(env, io);
	/* If any bit been read/written (result != 0), we just return
	 * short read/write instead of restart io. */
	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
		       iot == CIT_READ ? "read" : "write",
		       file->f_dentry->d_name.name, *ppos, count);
		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result >= 0)
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result >= 0) {
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_WRITE_BYTES, result);
			/* Track write health; -ERESTARTSYS is not a real
			 * failure, the syscall will be retried. */
			fd->fd_write_failed = false;
		} else if (result != -ERESTARTSYS) {
			fd->fd_write_failed = true;
		}
	}

	return result;
}
1204
1205static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1206{
1207	struct lu_env      *env;
1208	struct vvp_io_args *args;
1209	ssize_t	     result;
1210	int		 refcheck;
1211
1212	env = cl_env_get(&refcheck);
1213	if (IS_ERR(env))
1214		return PTR_ERR(env);
1215
1216	args = vvp_env_args(env, IO_NORMAL);
1217	args->u.normal.via_iter = to;
1218	args->u.normal.via_iocb = iocb;
1219
1220	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1221				    &iocb->ki_pos, iov_iter_count(to));
1222	cl_env_put(env, &refcheck);
1223	return result;
1224}
1225
1226/*
1227 * Write to a file (through the page cache).
1228 */
1229static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1230{
1231	struct lu_env      *env;
1232	struct vvp_io_args *args;
1233	ssize_t	     result;
1234	int		 refcheck;
1235
1236	env = cl_env_get(&refcheck);
1237	if (IS_ERR(env))
1238		return PTR_ERR(env);
1239
1240	args = vvp_env_args(env, IO_NORMAL);
1241	args->u.normal.via_iter = from;
1242	args->u.normal.via_iocb = iocb;
1243
1244	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1245				  &iocb->ki_pos, iov_iter_count(from));
1246	cl_env_put(env, &refcheck);
1247	return result;
1248}
1249
1250/*
1251 * Send file content (through pagecache) somewhere with helper
1252 */
1253static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1254				   struct pipe_inode_info *pipe, size_t count,
1255				   unsigned int flags)
1256{
1257	struct lu_env      *env;
1258	struct vvp_io_args *args;
1259	ssize_t	     result;
1260	int		 refcheck;
1261
1262	env = cl_env_get(&refcheck);
1263	if (IS_ERR(env))
1264		return PTR_ERR(env);
1265
1266	args = vvp_env_args(env, IO_SPLICE);
1267	args->u.splice.via_pipe = pipe;
1268	args->u.splice.via_flags = flags;
1269
1270	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1271	cl_env_put(env, &refcheck);
1272	return result;
1273}
1274
1275static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1276{
1277	struct obd_export *exp = ll_i2dtexp(inode);
1278	struct obd_trans_info oti = { 0 };
1279	struct obdo *oa = NULL;
1280	int lsm_size;
1281	int rc = 0;
1282	struct lov_stripe_md *lsm = NULL, *lsm2;
1283
1284	OBDO_ALLOC(oa);
1285	if (oa == NULL)
1286		return -ENOMEM;
1287
1288	lsm = ccc_inode_lsm_get(inode);
1289	if (!lsm_has_objects(lsm)) {
1290		rc = -ENOENT;
1291		goto out;
1292	}
1293
1294	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1295		   (lsm->lsm_stripe_count));
1296
1297	OBD_ALLOC_LARGE(lsm2, lsm_size);
1298	if (lsm2 == NULL) {
1299		rc = -ENOMEM;
1300		goto out;
1301	}
1302
1303	oa->o_oi = *oi;
1304	oa->o_nlink = ost_idx;
1305	oa->o_flags |= OBD_FL_RECREATE_OBJS;
1306	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1307	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1308				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1309	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1310	memcpy(lsm2, lsm, lsm_size);
1311	ll_inode_size_lock(inode);
1312	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1313	ll_inode_size_unlock(inode);
1314
1315	OBD_FREE_LARGE(lsm2, lsm_size);
1316	goto out;
1317out:
1318	ccc_inode_lsm_put(inode, lsm);
1319	OBDO_FREE(oa);
1320	return rc;
1321}
1322
1323static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1324{
1325	struct ll_recreate_obj ucreat;
1326	struct ost_id		oi;
1327
1328	if (!capable(CFS_CAP_SYS_ADMIN))
1329		return -EPERM;
1330
1331	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1332			   sizeof(ucreat)))
1333		return -EFAULT;
1334
1335	ostid_set_seq_mdt0(&oi);
1336	ostid_set_id(&oi, ucreat.lrc_id);
1337	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1338}
1339
1340static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1341{
1342	struct lu_fid	fid;
1343	struct ost_id	oi;
1344	u32		ost_idx;
1345
1346	if (!capable(CFS_CAP_SYS_ADMIN))
1347		return -EPERM;
1348
1349	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1350		return -EFAULT;
1351
1352	fid_to_ostid(&fid, &oi);
1353	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1354	return ll_lov_recreate(inode, &oi, ost_idx);
1355}
1356
/* Apply the user-supplied striping (@lum, @lum_size bytes) to @inode by
 * issuing an IT_OPEN intent, then immediately release the open handle.
 * Fails with -EEXIST if the file is already striped.  Always clears the
 * lov delay-create flag on the way out. */
int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
			     int flags, struct lov_user_md *lum, int lum_size)
{
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
	int rc = 0;

	lsm = ccc_inode_lsm_get(inode);
	if (lsm != NULL) {
		/* Striping may only be set once. */
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
		       inode->i_ino);
		rc = -EEXIST;
		goto out;
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(file, lum, lum_size, &oit);
	if (rc)
		goto out_unlock;
	rc = oit.d.lustre.it_status;
	if (rc < 0)
		goto out_req_free;

	/* The open was only a vehicle for setting the EA; close it. */
	ll_release_openhandle(file->f_dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	/* lsm is always NULL on this path (the non-NULL case bailed out
	 * via 'out' above). */
	ccc_inode_lsm_put(inode, lsm);
out:
	cl_lov_delay_create_clear(&file->f_flags);
	return rc;
out_req_free:
	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
	goto out;
}
1394
/* Fetch the LOV EA (striping descriptor) of @filename, a child of
 * directory @inode, from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request and
 * *lmm_size is its length; the caller is responsible for finishing
 * *request when done with the data. */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body  *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		return rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);
		goto out;
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	/* No EA at all, or an empty one, means no striping to return. */
	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
			lmmsize == 0) {
		rc = -ENODATA;
		goto out;
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	/* Only plain V1/V3 layouts (wire order) are expected here. */
	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		rc = -EPROTO;
		goto out;
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian.  We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count;

		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
			stripe_count = 0;

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 stripe_count);
		}
	}

out:
	/* Even on error, hand back what we have; req may carry the error. */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1480
1481static int ll_lov_setea(struct inode *inode, struct file *file,
1482			    unsigned long arg)
1483{
1484	int			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1485	struct lov_user_md	*lump;
1486	int			 lum_size = sizeof(struct lov_user_md) +
1487					    sizeof(struct lov_user_ost_data);
1488	int			 rc;
1489
1490	if (!capable(CFS_CAP_SYS_ADMIN))
1491		return -EPERM;
1492
1493	OBD_ALLOC_LARGE(lump, lum_size);
1494	if (lump == NULL)
1495		return -ENOMEM;
1496
1497	if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1498		OBD_FREE_LARGE(lump, lum_size);
1499		return -EFAULT;
1500	}
1501
1502	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1503
1504	OBD_FREE_LARGE(lump, lum_size);
1505	return rc;
1506}
1507
1508static int ll_lov_setstripe(struct inode *inode, struct file *file,
1509			    unsigned long arg)
1510{
1511	struct lov_user_md_v3	 lumv3;
1512	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
1513	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
1514	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
1515	int			 lum_size, rc;
1516	int			 flags = FMODE_WRITE;
1517
1518	/* first try with v1 which is smaller than v3 */
1519	lum_size = sizeof(struct lov_user_md_v1);
1520	if (copy_from_user(lumv1, lumv1p, lum_size))
1521		return -EFAULT;
1522
1523	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1524		lum_size = sizeof(struct lov_user_md_v3);
1525		if (copy_from_user(&lumv3, lumv3p, lum_size))
1526			return -EFAULT;
1527	}
1528
1529	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1530	if (rc == 0) {
1531		struct lov_stripe_md *lsm;
1532		__u32 gen;
1533
1534		put_user(0, &lumv1p->lmm_stripe_count);
1535
1536		ll_layout_refresh(inode, &gen);
1537		lsm = ccc_inode_lsm_get(inode);
1538		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1539				   0, lsm, (void *)arg);
1540		ccc_inode_lsm_put(inode, lsm);
1541	}
1542	return rc;
1543}
1544
1545static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1546{
1547	struct lov_stripe_md *lsm;
1548	int rc = -ENODATA;
1549
1550	lsm = ccc_inode_lsm_get(inode);
1551	if (lsm != NULL)
1552		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1553				   lsm, (void *)arg);
1554	ccc_inode_lsm_put(inode, lsm);
1555	return rc;
1556}
1557
/* Take a group lock with group id @arg on @inode for this open @file
 * (LL_IOC_GROUP_LOCK).  At most one group lock may be associated with
 * an open file; -EINVAL if one is already held. */
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;
	int		     rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	/* Fast check under lli_lock before doing the enqueue. */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* cl_get_grouplock() is called without lli_lock held, so the
	 * flag must be re-checked afterwards in case another thread
	 * acquired a group lock in the meantime. */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		return rc;

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		return -EINVAL;
	}

	/* Publish the lock on the fd under lli_lock. */
	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	return 0;
}
1599
/* Release the group lock with group id @arg previously taken on this
 * open @file by ll_get_grouplock() (LL_IOC_GROUP_UNLOCK).
 * -EINVAL if no group lock is held or the gid does not match. */
int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		       arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}

	/* Detach the lock from the fd while holding lli_lock, then drop
	 * the DLM reference outside the spinlock. */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	return 0;
}
1630
1631/**
1632 * Close inode open handle
1633 *
1634 * \param dentry [in]     dentry which contains the inode
1635 * \param it     [in,out] intent which contains open info and result
1636 *
1637 * \retval 0     success
1638 * \retval <0    failure
1639 */
1640int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1641{
1642	struct inode *inode = dentry->d_inode;
1643	struct obd_client_handle *och;
1644	int rc;
1645
1646	LASSERT(inode);
1647
1648	/* Root ? Do nothing. */
1649	if (dentry->d_inode->i_sb->s_root == dentry)
1650		return 0;
1651
1652	/* No open handle to close? Move away */
1653	if (!it_disposition(it, DISP_OPEN_OPEN))
1654		return 0;
1655
1656	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1657
1658	och = kzalloc(sizeof(*och), GFP_NOFS);
1659	if (!och) {
1660		rc = -ENOMEM;
1661		goto out;
1662	}
1663
1664	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1665
1666	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1667				       inode, och, NULL);
1668out:
1669	/* this one is in place of ll_file_open */
1670	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1671		ptlrpc_req_finished(it->d.lustre.it_data);
1672		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1673	}
1674	return rc;
1675}
1676
1677/**
1678 * Get size for inode for which FIEMAP mapping is requested.
1679 * Make the FIEMAP get_info call and returns the result.
1680 */
1681static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1682			size_t num_bytes)
1683{
1684	struct obd_export *exp = ll_i2dtexp(inode);
1685	struct lov_stripe_md *lsm = NULL;
1686	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1687	__u32 vallen = num_bytes;
1688	int rc;
1689
1690	/* Checks for fiemap flags */
1691	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1692		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1693		return -EBADR;
1694	}
1695
1696	/* Check for FIEMAP_FLAG_SYNC */
1697	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1698		rc = filemap_fdatawrite(inode->i_mapping);
1699		if (rc)
1700			return rc;
1701	}
1702
1703	lsm = ccc_inode_lsm_get(inode);
1704	if (lsm == NULL)
1705		return -ENOENT;
1706
1707	/* If the stripe_count > 1 and the application does not understand
1708	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1709	 */
1710	if (lsm->lsm_stripe_count > 1 &&
1711	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1712		rc = -EOPNOTSUPP;
1713		goto out;
1714	}
1715
1716	fm_key.oa.o_oi = lsm->lsm_oi;
1717	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1718
1719	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1720	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1721	/* If filesize is 0, then there would be no objects for mapping */
1722	if (fm_key.oa.o_size == 0) {
1723		fiemap->fm_mapped_extents = 0;
1724		rc = 0;
1725		goto out;
1726	}
1727
1728	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1729
1730	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1731			  fiemap, lsm);
1732	if (rc)
1733		CERROR("obd_get_info failed: rc = %d\n", rc);
1734
1735out:
1736	ccc_inode_lsm_put(inode, lsm);
1737	return rc;
1738}
1739
1740int ll_fid2path(struct inode *inode, void __user *arg)
1741{
1742	struct obd_export *exp = ll_i2mdexp(inode);
1743	const struct getinfo_fid2path __user *gfin = arg;
1744	struct getinfo_fid2path *gfout;
1745	u32 pathlen;
1746	size_t outsize;
1747	int rc;
1748
1749	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1750	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1751		return -EPERM;
1752
1753	/* Only need to get the buflen */
1754	if (get_user(pathlen, &gfin->gf_pathlen))
1755		return -EFAULT;
1756
1757	if (pathlen > PATH_MAX)
1758		return -EINVAL;
1759
1760	outsize = sizeof(*gfout) + pathlen;
1761
1762	gfout = kzalloc(outsize, GFP_NOFS);
1763	if (!gfout)
1764		return -ENOMEM;
1765
1766	if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1767		rc = -EFAULT;
1768		goto gf_free;
1769	}
1770
1771	/* Call mdc_iocontrol */
1772	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1773	if (rc != 0)
1774		goto gf_free;
1775
1776	if (copy_to_user(arg, gfout, outsize))
1777		rc = -EFAULT;
1778
1779gf_free:
1780	OBD_FREE(gfout, outsize);
1781	return rc;
1782}
1783
/* FIEMAP ioctl handler: size a kernel fiemap buffer from the user's
 * fm_extent_count, copy the request in (plus the first extent for
 * continuation calls), run ll_do_fiemap() and copy the result back. */
static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
{
	struct ll_user_fiemap *fiemap_s;
	size_t num_bytes, ret_bytes;
	unsigned int extent_count;
	int rc = 0;

	/* Get the extent count so we can calculate the size of
	 * required fiemap buffer */
	if (get_user(extent_count,
	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
		return -EFAULT;

	/* Reject counts that would overflow the size computation below. */
	if (extent_count >=
	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
		return -EINVAL;
	num_bytes = sizeof(*fiemap_s) + (extent_count *
					 sizeof(struct ll_fiemap_extent));

	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
	if (fiemap_s == NULL)
		return -ENOMEM;

	/* get the fiemap value */
	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
			   sizeof(*fiemap_s))) {
		rc = -EFAULT;
		goto error;
	}

	/* If fm_extent_count is non-zero, read the first extent since
	 * it is used to calculate end_offset and device from previous
	 * fiemap call. */
	if (extent_count) {
		if (copy_from_user(&fiemap_s->fm_extents[0],
		    (char __user *)arg + sizeof(*fiemap_s),
		    sizeof(struct ll_fiemap_extent))) {
			rc = -EFAULT;
			goto error;
		}
	}

	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
	if (rc)
		goto error;

	/* Copy back the header plus however many extents were mapped. */
	ret_bytes = sizeof(struct ll_user_fiemap);

	if (extent_count != 0)
		ret_bytes += (fiemap_s->fm_mapped_extents *
				 sizeof(struct ll_fiemap_extent));

	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
		rc = -EFAULT;

error:
	OBD_FREE_LARGE(fiemap_s, num_bytes);
	return rc;
}
1843
1844/*
1845 * Read the data_version for inode.
1846 *
1847 * This value is computed using stripe object version on OST.
1848 * Version is computed using server side locking.
1849 *
1850 * @param extent_lock  Take extent lock. Not needed if a process is already
1851 *		       holding the OST object group locks.
1852 */
1853int ll_data_version(struct inode *inode, __u64 *data_version,
1854		    int extent_lock)
1855{
1856	struct lov_stripe_md	*lsm = NULL;
1857	struct ll_sb_info	*sbi = ll_i2sbi(inode);
1858	struct obdo		*obdo = NULL;
1859	int			 rc;
1860
1861	/* If no stripe, we consider version is 0. */
1862	lsm = ccc_inode_lsm_get(inode);
1863	if (!lsm_has_objects(lsm)) {
1864		*data_version = 0;
1865		CDEBUG(D_INODE, "No object for inode\n");
1866		rc = 0;
1867		goto out;
1868	}
1869
1870	obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1871	if (!obdo) {
1872		rc = -ENOMEM;
1873		goto out;
1874	}
1875
1876	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1877	if (rc == 0) {
1878		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1879			rc = -EOPNOTSUPP;
1880		else
1881			*data_version = obdo->o_data_version;
1882	}
1883
1884	OBD_FREE_PTR(obdo);
1885out:
1886	ccc_inode_lsm_put(inode, lsm);
1887	return rc;
1888}
1889
1890/*
1891 * Trigger a HSM release request for the provided inode.
1892 */
1893int ll_hsm_release(struct inode *inode)
1894{
1895	struct cl_env_nest nest;
1896	struct lu_env *env;
1897	struct obd_client_handle *och = NULL;
1898	__u64 data_version = 0;
1899	int rc;
1900
1901
1902	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1903	       ll_get_fsname(inode->i_sb, NULL, 0),
1904	       PFID(&ll_i2info(inode)->lli_fid));
1905
1906	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1907	if (IS_ERR(och)) {
1908		rc = PTR_ERR(och);
1909		goto out;
1910	}
1911
1912	/* Grab latest data_version and [am]time values */
1913	rc = ll_data_version(inode, &data_version, 1);
1914	if (rc != 0)
1915		goto out;
1916
1917	env = cl_env_nested_get(&nest);
1918	if (IS_ERR(env)) {
1919		rc = PTR_ERR(env);
1920		goto out;
1921	}
1922
1923	ll_merge_lvb(env, inode);
1924	cl_env_nested_put(&nest, env);
1925
1926	/* Release the file.
1927	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1928	 * we still need it to pack l_remote_handle to MDT. */
1929	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1930				       &data_version);
1931	och = NULL;
1932
1933
1934out:
1935	if (och != NULL && !IS_ERR(och)) /* close the file */
1936		ll_lease_close(och, inode, NULL);
1937
1938	return rc;
1939}
1940
/* Scratch state for ll_swap_layouts(): saved attributes, expected data
 * versions and (possibly reordered) inode pointers for the two files. */
struct ll_swap_stack {
	struct iattr		 ia1, ia2;	/* saved [am]time per file */
	__u64			 dv1, dv2;	/* expected data versions */
	struct inode		*inode1, *inode2;
	bool			 check_dv1, check_dv2; /* verify dv pre-swap */
};
1947
1948static int ll_swap_layouts(struct file *file1, struct file *file2,
1949			   struct lustre_swap_layouts *lsl)
1950{
1951	struct mdc_swap_layouts	 msl;
1952	struct md_op_data	*op_data;
1953	__u32			 gid;
1954	__u64			 dv;
1955	struct ll_swap_stack	*llss = NULL;
1956	int			 rc;
1957
1958	llss = kzalloc(sizeof(*llss), GFP_NOFS);
1959	if (!llss)
1960		return -ENOMEM;
1961
1962	llss->inode1 = file1->f_dentry->d_inode;
1963	llss->inode2 = file2->f_dentry->d_inode;
1964
1965	if (!S_ISREG(llss->inode2->i_mode)) {
1966		rc = -EINVAL;
1967		goto free;
1968	}
1969
1970	if (inode_permission(llss->inode1, MAY_WRITE) ||
1971	    inode_permission(llss->inode2, MAY_WRITE)) {
1972		rc = -EPERM;
1973		goto free;
1974	}
1975
1976	if (llss->inode2->i_sb != llss->inode1->i_sb) {
1977		rc = -EXDEV;
1978		goto free;
1979	}
1980
1981	/* we use 2 bool because it is easier to swap than 2 bits */
1982	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1983		llss->check_dv1 = true;
1984
1985	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1986		llss->check_dv2 = true;
1987
1988	/* we cannot use lsl->sl_dvX directly because we may swap them */
1989	llss->dv1 = lsl->sl_dv1;
1990	llss->dv2 = lsl->sl_dv2;
1991
1992	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1993	if (rc == 0) /* same file, done! */ {
1994		rc = 0;
1995		goto free;
1996	}
1997
1998	if (rc < 0) { /* sequentialize it */
1999		swap(llss->inode1, llss->inode2);
2000		swap(file1, file2);
2001		swap(llss->dv1, llss->dv2);
2002		swap(llss->check_dv1, llss->check_dv2);
2003	}
2004
2005	gid = lsl->sl_gid;
2006	if (gid != 0) { /* application asks to flush dirty cache */
2007		rc = ll_get_grouplock(llss->inode1, file1, gid);
2008		if (rc < 0)
2009			goto free;
2010
2011		rc = ll_get_grouplock(llss->inode2, file2, gid);
2012		if (rc < 0) {
2013			ll_put_grouplock(llss->inode1, file1, gid);
2014			goto free;
2015		}
2016	}
2017
2018	/* to be able to restore mtime and atime after swap
2019	 * we need to first save them */
2020	if (lsl->sl_flags &
2021	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2022		llss->ia1.ia_mtime = llss->inode1->i_mtime;
2023		llss->ia1.ia_atime = llss->inode1->i_atime;
2024		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2025		llss->ia2.ia_mtime = llss->inode2->i_mtime;
2026		llss->ia2.ia_atime = llss->inode2->i_atime;
2027		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2028	}
2029
2030	/* ultimate check, before swapping the layouts we check if
2031	 * dataversion has changed (if requested) */
2032	if (llss->check_dv1) {
2033		rc = ll_data_version(llss->inode1, &dv, 0);
2034		if (rc)
2035			goto putgl;
2036		if (dv != llss->dv1) {
2037			rc = -EAGAIN;
2038			goto putgl;
2039		}
2040	}
2041
2042	if (llss->check_dv2) {
2043		rc = ll_data_version(llss->inode2, &dv, 0);
2044		if (rc)
2045			goto putgl;
2046		if (dv != llss->dv2) {
2047			rc = -EAGAIN;
2048			goto putgl;
2049		}
2050	}
2051
2052	/* struct md_op_data is used to send the swap args to the mdt
2053	 * only flags is missing, so we use struct mdc_swap_layouts
2054	 * through the md_op_data->op_data */
2055	/* flags from user space have to be converted before they are send to
2056	 * server, no flag is sent today, they are only used on the client */
2057	msl.msl_flags = 0;
2058	rc = -ENOMEM;
2059	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2060				     0, LUSTRE_OPC_ANY, &msl);
2061	if (IS_ERR(op_data)) {
2062		rc = PTR_ERR(op_data);
2063		goto free;
2064	}
2065
2066	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2067			   sizeof(*op_data), op_data, NULL);
2068	ll_finish_md_op_data(op_data);
2069
2070putgl:
2071	if (gid != 0) {
2072		ll_put_grouplock(llss->inode2, file2, gid);
2073		ll_put_grouplock(llss->inode1, file1, gid);
2074	}
2075
2076	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2077	if (rc != 0)
2078		goto free;
2079
2080	/* clear useless flags */
2081	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2082		llss->ia1.ia_valid &= ~ATTR_MTIME;
2083		llss->ia2.ia_valid &= ~ATTR_MTIME;
2084	}
2085
2086	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2087		llss->ia1.ia_valid &= ~ATTR_ATIME;
2088		llss->ia2.ia_valid &= ~ATTR_ATIME;
2089	}
2090
2091	/* update time if requested */
2092	rc = 0;
2093	if (llss->ia2.ia_valid != 0) {
2094		mutex_lock(&llss->inode1->i_mutex);
2095		rc = ll_setattr(file1->f_dentry, &llss->ia2);
2096		mutex_unlock(&llss->inode1->i_mutex);
2097	}
2098
2099	if (llss->ia1.ia_valid != 0) {
2100		int rc1;
2101
2102		mutex_lock(&llss->inode2->i_mutex);
2103		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2104		mutex_unlock(&llss->inode2->i_mutex);
2105		if (rc == 0)
2106			rc = rc1;
2107	}
2108
2109free:
2110	if (llss != NULL)
2111		OBD_FREE_PTR(llss);
2112
2113	return rc;
2114}
2115
2116static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2117{
2118	struct md_op_data	*op_data;
2119	int			 rc;
2120
2121	/* Non-root users are forbidden to set or clear flags which are
2122	 * NOT defined in HSM_USER_MASK. */
2123	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2124	    !capable(CFS_CAP_SYS_ADMIN))
2125		return -EPERM;
2126
2127	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2128				     LUSTRE_OPC_ANY, hss);
2129	if (IS_ERR(op_data))
2130		return PTR_ERR(op_data);
2131
2132	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2133			   sizeof(*op_data), op_data, NULL);
2134
2135	ll_finish_md_op_data(op_data);
2136
2137	return rc;
2138}
2139
2140static int ll_hsm_import(struct inode *inode, struct file *file,
2141			 struct hsm_user_import *hui)
2142{
2143	struct hsm_state_set	*hss = NULL;
2144	struct iattr		*attr = NULL;
2145	int			 rc;
2146
2147
2148	if (!S_ISREG(inode->i_mode))
2149		return -EINVAL;
2150
2151	/* set HSM flags */
2152	hss = kzalloc(sizeof(*hss), GFP_NOFS);
2153	if (!hss) {
2154		rc = -ENOMEM;
2155		goto out;
2156	}
2157
2158	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2159	hss->hss_archive_id = hui->hui_archive_id;
2160	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2161	rc = ll_hsm_state_set(inode, hss);
2162	if (rc != 0)
2163		goto out;
2164
2165	attr = kzalloc(sizeof(*attr), GFP_NOFS);
2166	if (!attr) {
2167		rc = -ENOMEM;
2168		goto out;
2169	}
2170
2171	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2172	attr->ia_mode |= S_IFREG;
2173	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2174	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2175	attr->ia_size = hui->hui_size;
2176	attr->ia_mtime.tv_sec = hui->hui_mtime;
2177	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2178	attr->ia_atime.tv_sec = hui->hui_atime;
2179	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2180
2181	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2182			 ATTR_UID | ATTR_GID |
2183			 ATTR_MTIME | ATTR_MTIME_SET |
2184			 ATTR_ATIME | ATTR_ATIME_SET;
2185
2186	mutex_lock(&inode->i_mutex);
2187
2188	rc = ll_setattr_raw(file->f_dentry, attr, true);
2189	if (rc == -ENODATA)
2190		rc = 0;
2191
2192	mutex_unlock(&inode->i_mutex);
2193
2194out:
2195	if (hss != NULL)
2196		OBD_FREE_PTR(hss);
2197
2198	if (attr != NULL)
2199		OBD_FREE_PTR(attr);
2200
2201	return rc;
2202}
2203
2204static long
2205ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2206{
2207	struct inode		*inode = file->f_dentry->d_inode;
2208	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
2209	int			 flags, rc;
2210
2211	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2212	       inode->i_generation, inode, cmd);
2213	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2214
2215	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2216	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2217		return -ENOTTY;
2218
2219	switch (cmd) {
2220	case LL_IOC_GETFLAGS:
2221		/* Get the current value of the file flags */
2222		return put_user(fd->fd_flags, (int *)arg);
2223	case LL_IOC_SETFLAGS:
2224	case LL_IOC_CLRFLAGS:
2225		/* Set or clear specific file flags */
2226		/* XXX This probably needs checks to ensure the flags are
2227		 *     not abused, and to handle any flag side effects.
2228		 */
2229		if (get_user(flags, (int *) arg))
2230			return -EFAULT;
2231
2232		if (cmd == LL_IOC_SETFLAGS) {
2233			if ((flags & LL_FILE_IGNORE_LOCK) &&
2234			    !(file->f_flags & O_DIRECT)) {
2235				CERROR("%s: unable to disable locking on "
2236				       "non-O_DIRECT file\n", current->comm);
2237				return -EINVAL;
2238			}
2239
2240			fd->fd_flags |= flags;
2241		} else {
2242			fd->fd_flags &= ~flags;
2243		}
2244		return 0;
2245	case LL_IOC_LOV_SETSTRIPE:
2246		return ll_lov_setstripe(inode, file, arg);
2247	case LL_IOC_LOV_SETEA:
2248		return ll_lov_setea(inode, file, arg);
2249	case LL_IOC_LOV_SWAP_LAYOUTS: {
2250		struct file *file2;
2251		struct lustre_swap_layouts lsl;
2252
2253		if (copy_from_user(&lsl, (char *)arg,
2254				       sizeof(struct lustre_swap_layouts)))
2255			return -EFAULT;
2256
2257		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2258			return -EPERM;
2259
2260		file2 = fget(lsl.sl_fd);
2261		if (file2 == NULL)
2262			return -EBADF;
2263
2264		rc = -EPERM;
2265		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2266			rc = ll_swap_layouts(file, file2, &lsl);
2267		fput(file2);
2268		return rc;
2269	}
2270	case LL_IOC_LOV_GETSTRIPE:
2271		return ll_lov_getstripe(inode, arg);
2272	case LL_IOC_RECREATE_OBJ:
2273		return ll_lov_recreate_obj(inode, arg);
2274	case LL_IOC_RECREATE_FID:
2275		return ll_lov_recreate_fid(inode, arg);
2276	case FSFILT_IOC_FIEMAP:
2277		return ll_ioctl_fiemap(inode, arg);
2278	case FSFILT_IOC_GETFLAGS:
2279	case FSFILT_IOC_SETFLAGS:
2280		return ll_iocontrol(inode, file, cmd, arg);
2281	case FSFILT_IOC_GETVERSION_OLD:
2282	case FSFILT_IOC_GETVERSION:
2283		return put_user(inode->i_generation, (int *)arg);
2284	case LL_IOC_GROUP_LOCK:
2285		return ll_get_grouplock(inode, file, arg);
2286	case LL_IOC_GROUP_UNLOCK:
2287		return ll_put_grouplock(inode, file, arg);
2288	case IOC_OBD_STATFS:
2289		return ll_obd_statfs(inode, (void *)arg);
2290
2291	/* We need to special case any other ioctls we want to handle,
2292	 * to send them to the MDS/OST as appropriate and to properly
2293	 * network encode the arg field.
2294	case FSFILT_IOC_SETVERSION_OLD:
2295	case FSFILT_IOC_SETVERSION:
2296	*/
2297	case LL_IOC_FLUSHCTX:
2298		return ll_flush_ctx(inode);
2299	case LL_IOC_PATH2FID: {
2300		if (copy_to_user((void *)arg, ll_inode2fid(inode),
2301				 sizeof(struct lu_fid)))
2302			return -EFAULT;
2303
2304		return 0;
2305	}
2306	case OBD_IOC_FID2PATH:
2307		return ll_fid2path(inode, (void *)arg);
2308	case LL_IOC_DATA_VERSION: {
2309		struct ioc_data_version	idv;
2310		int			rc;
2311
2312		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2313			return -EFAULT;
2314
2315		rc = ll_data_version(inode, &idv.idv_version,
2316				!(idv.idv_flags & LL_DV_NOFLUSH));
2317
2318		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2319			return -EFAULT;
2320
2321		return rc;
2322	}
2323
2324	case LL_IOC_GET_MDTIDX: {
2325		int mdtidx;
2326
2327		mdtidx = ll_get_mdt_idx(inode);
2328		if (mdtidx < 0)
2329			return mdtidx;
2330
2331		if (put_user((int)mdtidx, (int *)arg))
2332			return -EFAULT;
2333
2334		return 0;
2335	}
2336	case OBD_IOC_GETDTNAME:
2337	case OBD_IOC_GETMDNAME:
2338		return ll_get_obd_name(inode, cmd, arg);
2339	case LL_IOC_HSM_STATE_GET: {
2340		struct md_op_data	*op_data;
2341		struct hsm_user_state	*hus;
2342		int			 rc;
2343
2344		hus = kzalloc(sizeof(*hus), GFP_NOFS);
2345		if (!hus)
2346			return -ENOMEM;
2347
2348		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2349					     LUSTRE_OPC_ANY, hus);
2350		if (IS_ERR(op_data)) {
2351			OBD_FREE_PTR(hus);
2352			return PTR_ERR(op_data);
2353		}
2354
2355		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2356				   op_data, NULL);
2357
2358		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2359			rc = -EFAULT;
2360
2361		ll_finish_md_op_data(op_data);
2362		OBD_FREE_PTR(hus);
2363		return rc;
2364	}
2365	case LL_IOC_HSM_STATE_SET: {
2366		struct hsm_state_set	*hss;
2367		int			 rc;
2368
2369		hss = kzalloc(sizeof(*hss), GFP_NOFS);
2370		if (!hss)
2371			return -ENOMEM;
2372
2373		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2374			OBD_FREE_PTR(hss);
2375			return -EFAULT;
2376		}
2377
2378		rc = ll_hsm_state_set(inode, hss);
2379
2380		OBD_FREE_PTR(hss);
2381		return rc;
2382	}
2383	case LL_IOC_HSM_ACTION: {
2384		struct md_op_data		*op_data;
2385		struct hsm_current_action	*hca;
2386		int				 rc;
2387
2388		hca = kzalloc(sizeof(*hca), GFP_NOFS);
2389		if (!hca)
2390			return -ENOMEM;
2391
2392		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2393					     LUSTRE_OPC_ANY, hca);
2394		if (IS_ERR(op_data)) {
2395			OBD_FREE_PTR(hca);
2396			return PTR_ERR(op_data);
2397		}
2398
2399		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2400				   op_data, NULL);
2401
2402		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2403			rc = -EFAULT;
2404
2405		ll_finish_md_op_data(op_data);
2406		OBD_FREE_PTR(hca);
2407		return rc;
2408	}
2409	case LL_IOC_SET_LEASE: {
2410		struct ll_inode_info *lli = ll_i2info(inode);
2411		struct obd_client_handle *och = NULL;
2412		bool lease_broken;
2413		fmode_t mode = 0;
2414
2415		switch (arg) {
2416		case F_WRLCK:
2417			if (!(file->f_mode & FMODE_WRITE))
2418				return -EPERM;
2419			mode = FMODE_WRITE;
2420			break;
2421		case F_RDLCK:
2422			if (!(file->f_mode & FMODE_READ))
2423				return -EPERM;
2424			mode = FMODE_READ;
2425			break;
2426		case F_UNLCK:
2427			mutex_lock(&lli->lli_och_mutex);
2428			if (fd->fd_lease_och != NULL) {
2429				och = fd->fd_lease_och;
2430				fd->fd_lease_och = NULL;
2431			}
2432			mutex_unlock(&lli->lli_och_mutex);
2433
2434			if (och != NULL) {
2435				mode = och->och_flags &
2436				       (FMODE_READ|FMODE_WRITE);
2437				rc = ll_lease_close(och, inode, &lease_broken);
2438				if (rc == 0 && lease_broken)
2439					mode = 0;
2440			} else {
2441				rc = -ENOLCK;
2442			}
2443
2444			/* return the type of lease or error */
2445			return rc < 0 ? rc : (int)mode;
2446		default:
2447			return -EINVAL;
2448		}
2449
2450		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2451
2452		/* apply for lease */
2453		och = ll_lease_open(inode, file, mode, 0);
2454		if (IS_ERR(och))
2455			return PTR_ERR(och);
2456
2457		rc = 0;
2458		mutex_lock(&lli->lli_och_mutex);
2459		if (fd->fd_lease_och == NULL) {
2460			fd->fd_lease_och = och;
2461			och = NULL;
2462		}
2463		mutex_unlock(&lli->lli_och_mutex);
2464		if (och != NULL) {
2465			/* impossible now that only excl is supported for now */
2466			ll_lease_close(och, inode, &lease_broken);
2467			rc = -EBUSY;
2468		}
2469		return rc;
2470	}
2471	case LL_IOC_GET_LEASE: {
2472		struct ll_inode_info *lli = ll_i2info(inode);
2473		struct ldlm_lock *lock = NULL;
2474
2475		rc = 0;
2476		mutex_lock(&lli->lli_och_mutex);
2477		if (fd->fd_lease_och != NULL) {
2478			struct obd_client_handle *och = fd->fd_lease_och;
2479
2480			lock = ldlm_handle2lock(&och->och_lease_handle);
2481			if (lock != NULL) {
2482				lock_res_and_lock(lock);
2483				if (!ldlm_is_cancel(lock))
2484					rc = och->och_flags &
2485						(FMODE_READ | FMODE_WRITE);
2486				unlock_res_and_lock(lock);
2487				ldlm_lock_put(lock);
2488			}
2489		}
2490		mutex_unlock(&lli->lli_och_mutex);
2491		return rc;
2492	}
2493	case LL_IOC_HSM_IMPORT: {
2494		struct hsm_user_import *hui;
2495
2496		hui = kzalloc(sizeof(*hui), GFP_NOFS);
2497		if (!hui)
2498			return -ENOMEM;
2499
2500		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2501			OBD_FREE_PTR(hui);
2502			return -EFAULT;
2503		}
2504
2505		rc = ll_hsm_import(inode, file, hui);
2506
2507		OBD_FREE_PTR(hui);
2508		return rc;
2509	}
2510	default: {
2511		int err;
2512
2513		if (LLIOC_STOP ==
2514		     ll_iocontrol_call(inode, file, cmd, arg, &err))
2515			return err;
2516
2517		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2518				     (void *)arg);
2519	}
2520	}
2521}
2522
2523
2524static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2525{
2526	struct inode *inode = file->f_dentry->d_inode;
2527	loff_t retval, eof = 0;
2528
2529	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2530			   (origin == SEEK_CUR) ? file->f_pos : 0);
2531	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2532	       inode->i_ino, inode->i_generation, inode, retval, retval,
2533	       origin);
2534	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2535
2536	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2537		retval = ll_glimpse_size(inode);
2538		if (retval != 0)
2539			return retval;
2540		eof = i_size_read(inode);
2541	}
2542
2543	retval = generic_file_llseek_size(file, offset, origin,
2544					  ll_file_maxbytes(inode), eof);
2545	return retval;
2546}
2547
2548static int ll_flush(struct file *file, fl_owner_t id)
2549{
2550	struct inode *inode = file->f_dentry->d_inode;
2551	struct ll_inode_info *lli = ll_i2info(inode);
2552	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2553	int rc, err;
2554
2555	LASSERT(!S_ISDIR(inode->i_mode));
2556
2557	/* catch async errors that were recorded back when async writeback
2558	 * failed for pages in this mapping. */
2559	rc = lli->lli_async_rc;
2560	lli->lli_async_rc = 0;
2561	err = lov_read_and_clear_async_rc(lli->lli_clob);
2562	if (rc == 0)
2563		rc = err;
2564
2565	/* The application has been told write failure already.
2566	 * Do not report failure again. */
2567	if (fd->fd_write_failed)
2568		return 0;
2569	return rc ? -EIO : 0;
2570}
2571
2572/**
2573 * Called to make sure a portion of file has been written out.
2574 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2575 *
2576 * Return how many pages have been written.
2577 */
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct obd_capa *capa = NULL;
	struct cl_fsync_io *fio;
	int result;

	/* reject unknown sync modes up front */
	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
		return -EINVAL;

	/* nested env get: this path may run inside another cl_io */
	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	/* OSS write capability; released via capa_put() below */
	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);

	io = ccc_env_thread_io(env);
	io->ci_obj = cl_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;

	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_capa = capa;
	fio->fi_start = start;
	fio->fi_end = end;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;

	/* run the CIT_FSYNC io; if init failed, report its result */
	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;
	/* on success return the page count written (see function comment) */
	if (result == 0)
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	capa_put(capa);

	return result;
}
2624
2625/*
2626 * When dentry is provided (the 'else' case), *file->f_dentry may be
2627 * null and dentry must be used directly rather than pulled from
2628 * *file->f_dentry as is done otherwise.
2629 */
2630
/*
 * fsync handler: flush dirty pages, propagate recorded async-writeback
 * errors, sync metadata via the MDS, then force data out to the OSTs.
 */
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	struct obd_capa *oc;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	/* kick off/await page writeback before taking i_mutex; the first
	 * error (rc) wins through all the stages below */
	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	/* sync the metadata on the MDS */
	oc = ll_mdscapa_get(inode);
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
		      &req);
	capa_put(oc);
	if (!rc)
		rc = err;
	/* md_sync() only returns a request on success */
	if (!err)
		ptlrpc_req_finished(req);

	if (S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		/* push data to the OSTs and record per-fd write status so
		 * ll_flush() can avoid double-reporting the failure */
		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2683
/*
 * flock/fcntl lock handler: translate the VFS lock request into an LDLM
 * flock enqueue against the MDS, then mirror the result into the local
 * VFS lock tables so lock ownership is visible to the kernel.
 */
static int
ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	__u64 flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	/* only FL_FLOCK and FL_POSIX style locks are supported */
	if (file_lock->fl_flags & FL_FLOCK)
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
	else if (!(file_lock->fl_flags & FL_POSIX))
		return -EINVAL;

	flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
	flock.l_flock.pid = file_lock->fl_pid;
	flock.l_flock.start = file_lock->fl_start;
	flock.l_flock.end = file_lock->fl_end;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	/* map the VFS lock type to an LDLM lock mode */
	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
			file_lock->fl_type);
		return -ENOTSUPP;
	}

	/* map the fcntl command to enqueue flags */
	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
	       inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
	       flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* mirror the server-side result into the local VFS lock tables
	 * (skipped for F_GETLK/test-only requests) */
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2  = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = posix_lock_file_wait(file, file_lock);

	/* local bookkeeping failed after the server granted the lock:
	 * release the server-side lock again (LCK_NL == unlock) */
	if (rc2 && file_lock->fl_type != F_UNLCK) {
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2808
/* flock/lock stub used by "-o noflock" mounts: always reports that file
 * locking is unsupported */
static int
ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	return -ENOSYS;
}
2814
2815/**
2816 * test if some locks matching bits and l_req_mode are acquired
2817 * - bits can be in different locks
2818 * - if found clear the common lock bits in *bits
2819 * - the bits not found, are kept in *bits
2820 * \param inode [IN]
2821 * \param bits [IN] searched lock bits [IN]
2822 * \param l_req_mode [IN] searched lock mode
2823 * \retval boolean, true iff all bits are found
2824 */
2825int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2826{
2827	struct lustre_handle lockh;
2828	ldlm_policy_data_t policy;
2829	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2830				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2831	struct lu_fid *fid;
2832	__u64 flags;
2833	int i;
2834
2835	if (!inode)
2836	       return 0;
2837
2838	fid = &ll_i2info(inode)->lli_fid;
2839	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2840	       ldlm_lockname[mode]);
2841
2842	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2843	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2844		policy.l_inodebits.bits = *bits & (1 << i);
2845		if (policy.l_inodebits.bits == 0)
2846			continue;
2847
2848		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2849				  &policy, mode, &lockh)) {
2850			struct ldlm_lock *lock;
2851
2852			lock = ldlm_handle2lock(&lockh);
2853			if (lock) {
2854				*bits &=
2855				      ~(lock->l_policy_data.l_inodebits.bits);
2856				LDLM_LOCK_PUT(lock);
2857			} else {
2858				*bits &= ~policy.l_inodebits.bits;
2859			}
2860		}
2861	}
2862	return *bits == 0;
2863}
2864
2865ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2866			    struct lustre_handle *lockh, __u64 flags,
2867			    ldlm_mode_t mode)
2868{
2869	ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2870	struct lu_fid *fid;
2871	ldlm_mode_t rc;
2872
2873	fid = &ll_i2info(inode)->lli_fid;
2874	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2875
2876	rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2877			   fid, LDLM_IBITS, &policy, mode, lockh);
2878
2879	return rc;
2880}
2881
2882static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2883{
2884	/* Already unlinked. Just update nlink and return success */
2885	if (rc == -ENOENT) {
2886		clear_nlink(inode);
2887		/* This path cannot be hit for regular files unless in
2888		 * case of obscure races, so no need to validate size.
2889		 */
2890		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2891			return 0;
2892	} else if (rc != 0) {
2893		CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2894			     "%s: revalidate FID "DFID" error: rc = %d\n",
2895			     ll_get_fsname(inode->i_sb, NULL, 0),
2896			     PFID(ll_inode2fid(inode)), rc);
2897	}
2898
2899	return rc;
2900}
2901
/*
 * Revalidate the inode against the MDS, either via an IT_GETATTR/IT_LOOKUP
 * intent lock (when the server supports OBD_CONNECT_ATTRFID) or, failing a
 * cached ibits lock, via a plain getattr RPC.
 */
static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = dentry->d_inode;
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 *      But under CMD case, it caused some lock issues, should be fixed
	 *      with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		/* a pure LOOKUP-bit request only needs a lookup intent */
		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
					     dentry->d_inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			goto out;
		}

		rc = ll_revalidate_it_finish(req, &oit, dentry);
		if (rc != 0) {
			ll_intent_release(&oit);
			goto out;
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!dentry->d_inode->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, dentry);
	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
		/* no cached ibits lock covering the request: do a plain
		 * getattr RPC instead */
		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
		u64 valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		if (S_ISREG(inode->i_mode)) {
			/* also fetch the striping EA for regular files */
			rc = ll_get_default_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2996
2997static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2998{
2999	struct inode *inode = dentry->d_inode;
3000	int rc;
3001
3002	rc = __ll_inode_revalidate(dentry, ibits);
3003	if (rc != 0)
3004		return rc;
3005
3006	/* if object isn't regular file, don't validate size */
3007	if (!S_ISREG(inode->i_mode)) {
3008		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3009		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3010		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3011	} else {
3012		/* In case of restore, the MDT has the right size and has
3013		 * already send it back without granting the layout lock,
3014		 * inode is up-to-date so glimpse is useless.
3015		 * Also to glimpse we need the layout, in case of a running
3016		 * restore the MDT holds the layout lock so the glimpse will
3017		 * block up to the end of restore (getattr will block)
3018		 */
3019		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3020			rc = ll_glimpse_size(inode);
3021	}
3022	return rc;
3023}
3024
3025int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3026{
3027	struct inode *inode = de->d_inode;
3028	struct ll_sb_info *sbi = ll_i2sbi(inode);
3029	struct ll_inode_info *lli = ll_i2info(inode);
3030	int res = 0;
3031
3032	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3033				      MDS_INODELOCK_LOOKUP);
3034	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3035
3036	if (res)
3037		return res;
3038
3039	stat->dev = inode->i_sb->s_dev;
3040	if (ll_need_32bit_api(sbi))
3041		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3042	else
3043		stat->ino = inode->i_ino;
3044	stat->mode = inode->i_mode;
3045	stat->nlink = inode->i_nlink;
3046	stat->uid = inode->i_uid;
3047	stat->gid = inode->i_gid;
3048	stat->rdev = inode->i_rdev;
3049	stat->atime = inode->i_atime;
3050	stat->mtime = inode->i_mtime;
3051	stat->ctime = inode->i_ctime;
3052	stat->blksize = 1 << inode->i_blkbits;
3053
3054	stat->size = i_size_read(inode);
3055	stat->blocks = inode->i_blocks;
3056
3057	return 0;
3058}
3059
/*
 * VFS fiemap handler: marshal fieinfo into a ll_user_fiemap buffer, run the
 * Lustre fiemap, and copy the mapped extents back.
 */
static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		     __u64 start, __u64 len)
{
	int rc;
	size_t num_bytes;
	struct ll_user_fiemap *fiemap;
	unsigned int extent_count = fieinfo->fi_extents_max;

	/* NOTE(review): extent_count * sizeof() could overflow for huge
	 * fi_extents_max — presumably bounded by the VFS fiemap layer;
	 * confirm against fiemap_check_ranges/ioctl_fiemap. */
	num_bytes = sizeof(*fiemap) + (extent_count *
				       sizeof(struct ll_fiemap_extent));
	OBD_ALLOC_LARGE(fiemap, num_bytes);

	if (fiemap == NULL)
		return -ENOMEM;

	fiemap->fm_flags = fieinfo->fi_flags;
	fiemap->fm_extent_count = fieinfo->fi_extents_max;
	fiemap->fm_start = start;
	fiemap->fm_length = len;
	/* seed only the first extent from the caller's buffer.
	 * NOTE(review): fi_extents_start normally points at user memory;
	 * plain memcpy here (instead of copy_from_user) looks suspect —
	 * verify how this path is reached. */
	if (extent_count > 0)
		memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
		       sizeof(struct ll_fiemap_extent));

	rc = ll_do_fiemap(inode, fiemap, num_bytes);

	/* copy results (flags, mapped count, extents) back to the caller */
	fieinfo->fi_flags = fiemap->fm_flags;
	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
	if (extent_count > 0)
		memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
		       fiemap->fm_mapped_extents *
		       sizeof(struct ll_fiemap_extent));

	OBD_FREE_LARGE(fiemap, num_bytes);
	return rc;
}
3095
/* Return a referenced copy of the cached POSIX ACL for this inode (may be
 * NULL if none is cached). */
struct posix_acl *ll_get_acl(struct inode *inode, int type)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct posix_acl *acl = NULL;

	/* lli_lock guards lli_posix_acl against concurrent update */
	spin_lock(&lli->lli_lock);
	/* VFS' acl_permission_check->check_acl will release the refcount */
	acl = posix_acl_dup(lli->lli_posix_acl);
	spin_unlock(&lli->lli_lock);

	return acl;
}
3108
3109
/*
 * permission handler: revalidate the root inode when needed, delegate to
 * remote-permission checking on RMT_CLIENT mounts, otherwise use the
 * generic in-inode mode bits.
 */
int ll_inode_permission(struct inode *inode, int mask)
{
	int rc = 0;

#ifdef MAY_NOT_BLOCK
	/* RCU-walk lookups cannot block; force ref-walk retry */
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
#endif

	/* as root inode are NOT getting validated in lookup operation,
	 * need to do it before permission check. */
	if (inode == inode->i_sb->s_root->d_inode) {
		rc = __ll_inode_revalidate(inode->i_sb->s_root,
					   MDS_INODELOCK_LOOKUP);
		if (rc)
			return rc;
	}

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);

	/* remote-client mounts enforce permissions on the server side */
	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
		return lustre_check_remote_perm(inode, mask);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
	rc = generic_permission(inode, mask);

	return rc;
}
3140
3141/* -o localflock - only provides locally consistent flock locks */
3142struct file_operations ll_file_operations = {
3143	.read	   = new_sync_read,
3144	.read_iter = ll_file_read_iter,
3145	.write	  = new_sync_write,
3146	.write_iter = ll_file_write_iter,
3147	.unlocked_ioctl = ll_file_ioctl,
3148	.open	   = ll_file_open,
3149	.release	= ll_file_release,
3150	.mmap	   = ll_file_mmap,
3151	.llseek	 = ll_file_seek,
3152	.splice_read    = ll_file_splice_read,
3153	.fsync	  = ll_fsync,
3154	.flush	  = ll_flush
3155};
3156
3157struct file_operations ll_file_operations_flock = {
3158	.read	   = new_sync_read,
3159	.read_iter    = ll_file_read_iter,
3160	.write	  = new_sync_write,
3161	.write_iter   = ll_file_write_iter,
3162	.unlocked_ioctl = ll_file_ioctl,
3163	.open	   = ll_file_open,
3164	.release	= ll_file_release,
3165	.mmap	   = ll_file_mmap,
3166	.llseek	 = ll_file_seek,
3167	.splice_read    = ll_file_splice_read,
3168	.fsync	  = ll_fsync,
3169	.flush	  = ll_flush,
3170	.flock	  = ll_file_flock,
3171	.lock	   = ll_file_flock
3172};
3173
3174/* These are for -o noflock - to return ENOSYS on flock calls */
3175struct file_operations ll_file_operations_noflock = {
3176	.read	   = new_sync_read,
3177	.read_iter    = ll_file_read_iter,
3178	.write	  = new_sync_write,
3179	.write_iter   = ll_file_write_iter,
3180	.unlocked_ioctl = ll_file_ioctl,
3181	.open	   = ll_file_open,
3182	.release	= ll_file_release,
3183	.mmap	   = ll_file_mmap,
3184	.llseek	 = ll_file_seek,
3185	.splice_read    = ll_file_splice_read,
3186	.fsync	  = ll_fsync,
3187	.flush	  = ll_flush,
3188	.flock	  = ll_file_noflock,
3189	.lock	   = ll_file_noflock
3190};
3191
3192struct inode_operations ll_file_inode_operations = {
3193	.setattr	= ll_setattr,
3194	.getattr	= ll_getattr,
3195	.permission	= ll_inode_permission,
3196	.setxattr	= ll_setxattr,
3197	.getxattr	= ll_getxattr,
3198	.listxattr	= ll_listxattr,
3199	.removexattr	= ll_removexattr,
3200	.fiemap		= ll_fiemap,
3201	.get_acl	= ll_get_acl,
3202};
3203
3204/* dynamic ioctl number support routines */
3205static struct llioc_ctl_data {
3206	struct rw_semaphore	ioc_sem;
3207	struct list_head	      ioc_head;
3208} llioc = {
3209	__RWSEM_INITIALIZER(llioc.ioc_sem),
3210	LIST_HEAD_INIT(llioc.ioc_head)
3211};
3212
3213
3214struct llioc_data {
3215	struct list_head	      iocd_list;
3216	unsigned int	    iocd_size;
3217	llioc_callback_t	iocd_cb;
3218	unsigned int	    iocd_count;
3219	unsigned int	    iocd_cmd[0];
3220};
3221
3222void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3223{
3224	unsigned int size;
3225	struct llioc_data *in_data = NULL;
3226
3227	if (cb == NULL || cmd == NULL ||
3228	    count > LLIOC_MAX_CMD || count < 0)
3229		return NULL;
3230
3231	size = sizeof(*in_data) + count * sizeof(unsigned int);
3232	in_data = kzalloc(size, GFP_NOFS);
3233	if (!in_data)
3234		return NULL;
3235
3236	memset(in_data, 0, sizeof(*in_data));
3237	in_data->iocd_size = size;
3238	in_data->iocd_cb = cb;
3239	in_data->iocd_count = count;
3240	memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3241
3242	down_write(&llioc.ioc_sem);
3243	list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3244	up_write(&llioc.ioc_sem);
3245
3246	return in_data;
3247}
3248
3249void ll_iocontrol_unregister(void *magic)
3250{
3251	struct llioc_data *tmp;
3252
3253	if (magic == NULL)
3254		return;
3255
3256	down_write(&llioc.ioc_sem);
3257	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3258		if (tmp == magic) {
3259			unsigned int size = tmp->iocd_size;
3260
3261			list_del(&tmp->iocd_list);
3262			up_write(&llioc.ioc_sem);
3263
3264			OBD_FREE(tmp, size);
3265			return;
3266		}
3267	}
3268	up_write(&llioc.ioc_sem);
3269
3270	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3271}
3272
/* Export the dynamic-ioctl registration API to other kernel modules. */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
3275
3276static enum llioc_iter
3277ll_iocontrol_call(struct inode *inode, struct file *file,
3278		  unsigned int cmd, unsigned long arg, int *rcp)
3279{
3280	enum llioc_iter ret = LLIOC_CONT;
3281	struct llioc_data *data;
3282	int rc = -EINVAL, i;
3283
3284	down_read(&llioc.ioc_sem);
3285	list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3286		for (i = 0; i < data->iocd_count; i++) {
3287			if (cmd != data->iocd_cmd[i])
3288				continue;
3289
3290			ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3291			break;
3292		}
3293
3294		if (ret == LLIOC_STOP)
3295			break;
3296	}
3297	up_read(&llioc.ioc_sem);
3298
3299	if (rcp)
3300		*rcp = rc;
3301	return ret;
3302}
3303
3304int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3305{
3306	struct ll_inode_info *lli = ll_i2info(inode);
3307	struct cl_env_nest nest;
3308	struct lu_env *env;
3309	int result;
3310
3311	if (lli->lli_clob == NULL)
3312		return 0;
3313
3314	env = cl_env_nested_get(&nest);
3315	if (IS_ERR(env))
3316		return PTR_ERR(env);
3317
3318	result = cl_conf_set(env, lli->lli_clob, conf);
3319	cl_env_nested_put(&nest, env);
3320
3321	if (conf->coc_opc == OBJECT_CONF_SET) {
3322		struct ldlm_lock *lock = conf->coc_lock;
3323
3324		LASSERT(lock != NULL);
3325		LASSERT(ldlm_has_layout(lock));
3326		if (result == 0) {
3327			/* it can only be allowed to match after layout is
3328			 * applied to inode otherwise false layout would be
3329			 * seen. Applying layout should happen before dropping
3330			 * the intent lock. */
3331			ldlm_lock_allow_match(lock);
3332		}
3333	}
3334	return result;
3335}
3336
/* Fetch layout from MDT with getxattr request, if it's not ready yet.
 *
 * On success the fetched LOV EA is installed as \a lock's LVB data,
 * replacing any stale LVB buffer. Returns 0 on success (including an
 * empty layout) or a negative errno. */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)

{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obd_capa *oc;
	/* NOTE(review): req is only initialized by md_getxattr(); this
	 * relies on ll_get_default_mdsize() never returning > 0 so that
	 * every non-zero rc takes the early "rc < 0" return - confirm. */
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* nothing to do if the LVB is already populated and valid */
	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
		return 0;

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	oc = ll_mdscapa_get(inode);
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
				lmmsize, 0, &req);
	capa_put(oc);
	if (rc < 0)
		return rc;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL) {
		rc = -EPROTO;
		goto out;
	}

	/* eadatasize == 0 means the file has no striping layout yet */
	lmmsize = body->eadatasize;
	if (lmmsize == 0) /* empty layout */ {
		rc = 0;
		goto out;
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL) {
		rc = -EFAULT;
		goto out;
	}

	/* copy out of the rpc reply buffer - the reply is freed below */
	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	memcpy(lvbdata, lmm, lmmsize);
	/* swap the new buffer into the lock's LVB under the resource lock,
	 * freeing any previous (stale) LVB data */
	lock_res_and_lock(lock);
	if (lock->l_lvb_data != NULL)
		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);

	lock->l_lvb_data = lvbdata;
	lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

out:
	ptlrpc_req_finished(req);
	return rc;
}
3409
3410/**
3411 * Apply the layout to the inode. Layout lock is held and will be released
3412 * in this function.
3413 */
3414static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3415				struct inode *inode, __u32 *gen, bool reconf)
3416{
3417	struct ll_inode_info *lli = ll_i2info(inode);
3418	struct ll_sb_info    *sbi = ll_i2sbi(inode);
3419	struct ldlm_lock *lock;
3420	struct lustre_md md = { NULL };
3421	struct cl_object_conf conf;
3422	int rc = 0;
3423	bool lvb_ready;
3424	bool wait_layout = false;
3425
3426	LASSERT(lustre_handle_is_used(lockh));
3427
3428	lock = ldlm_handle2lock(lockh);
3429	LASSERT(lock != NULL);
3430	LASSERT(ldlm_has_layout(lock));
3431
3432	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3433		   inode, PFID(&lli->lli_fid), reconf);
3434
3435	/* in case this is a caching lock and reinstate with new inode */
3436	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3437
3438	lock_res_and_lock(lock);
3439	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3440	unlock_res_and_lock(lock);
3441	/* checking lvb_ready is racy but this is okay. The worst case is
3442	 * that multi processes may configure the file on the same time. */
3443	if (lvb_ready || !reconf) {
3444		rc = -ENODATA;
3445		if (lvb_ready) {
3446			/* layout_gen must be valid if layout lock is not
3447			 * cancelled and stripe has already set */
3448			*gen = ll_layout_version_get(lli);
3449			rc = 0;
3450		}
3451		goto out;
3452	}
3453
3454	rc = ll_layout_fetch(inode, lock);
3455	if (rc < 0)
3456		goto out;
3457
3458	/* for layout lock, lmm is returned in lock's lvb.
3459	 * lvb_data is immutable if the lock is held so it's safe to access it
3460	 * without res lock. See the description in ldlm_lock_decref_internal()
3461	 * for the condition to free lvb_data of layout lock */
3462	if (lock->l_lvb_data != NULL) {
3463		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3464				  lock->l_lvb_data, lock->l_lvb_len);
3465		if (rc >= 0) {
3466			*gen = LL_LAYOUT_GEN_EMPTY;
3467			if (md.lsm != NULL)
3468				*gen = md.lsm->lsm_layout_gen;
3469			rc = 0;
3470		} else {
3471			CERROR("%s: file "DFID" unpackmd error: %d\n",
3472				ll_get_fsname(inode->i_sb, NULL, 0),
3473				PFID(&lli->lli_fid), rc);
3474		}
3475	}
3476	if (rc < 0)
3477		goto out;
3478
3479	/* set layout to file. Unlikely this will fail as old layout was
3480	 * surely eliminated */
3481	memset(&conf, 0, sizeof(conf));
3482	conf.coc_opc = OBJECT_CONF_SET;
3483	conf.coc_inode = inode;
3484	conf.coc_lock = lock;
3485	conf.u.coc_md = &md;
3486	rc = ll_layout_conf(inode, &conf);
3487
3488	if (md.lsm != NULL)
3489		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3490
3491	/* refresh layout failed, need to wait */
3492	wait_layout = rc == -EBUSY;
3493
3494out:
3495	LDLM_LOCK_PUT(lock);
3496	ldlm_lock_decref(lockh, mode);
3497
3498	/* wait for IO to complete if it's still being used. */
3499	if (wait_layout) {
3500		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3501			ll_get_fsname(inode->i_sb, NULL, 0),
3502			inode, PFID(&lli->lli_fid));
3503
3504		memset(&conf, 0, sizeof(conf));
3505		conf.coc_opc = OBJECT_CONF_WAIT;
3506		conf.coc_inode = inode;
3507		rc = ll_layout_conf(inode, &conf);
3508		if (rc == 0)
3509			rc = -EAGAIN;
3510
3511		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3512			PFID(&lli->lli_fid), rc);
3513	}
3514	return rc;
3515}
3516
3517/**
3518 * This function checks if there exists a LAYOUT lock on the client side,
3519 * or enqueues it if it doesn't have one in cache.
3520 *
3521 * This function will not hold layout lock so it may be revoked any time after
3522 * this function returns. Any operations depend on layout should be redone
3523 * in that case.
3524 *
3525 * This function should be called before lov_io_init() to get an uptodate
3526 * layout version, the caller should save the version number and after IO
3527 * is finished, this function should be called again to verify that layout
3528 * is not changed during IO time.
3529 */
3530int ll_layout_refresh(struct inode *inode, __u32 *gen)
3531{
3532	struct ll_inode_info  *lli = ll_i2info(inode);
3533	struct ll_sb_info     *sbi = ll_i2sbi(inode);
3534	struct md_op_data     *op_data;
3535	struct lookup_intent   it;
3536	struct lustre_handle   lockh;
3537	ldlm_mode_t	       mode;
3538	struct ldlm_enqueue_info einfo = {
3539		.ei_type = LDLM_IBITS,
3540		.ei_mode = LCK_CR,
3541		.ei_cb_bl = ll_md_blocking_ast,
3542		.ei_cb_cp = ldlm_completion_ast,
3543	};
3544	int rc;
3545
3546	*gen = ll_layout_version_get(lli);
3547	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3548		return 0;
3549
3550	/* sanity checks */
3551	LASSERT(fid_is_sane(ll_inode2fid(inode)));
3552	LASSERT(S_ISREG(inode->i_mode));
3553
3554	/* take layout lock mutex to enqueue layout lock exclusively. */
3555	mutex_lock(&lli->lli_layout_mutex);
3556
3557again:
3558	/* mostly layout lock is caching on the local side, so try to match
3559	 * it before grabbing layout lock mutex. */
3560	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3561			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3562	if (mode != 0) { /* hit cached lock */
3563		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3564		if (rc == -EAGAIN)
3565			goto again;
3566
3567		mutex_unlock(&lli->lli_layout_mutex);
3568		return rc;
3569	}
3570
3571	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3572			0, 0, LUSTRE_OPC_ANY, NULL);
3573	if (IS_ERR(op_data)) {
3574		mutex_unlock(&lli->lli_layout_mutex);
3575		return PTR_ERR(op_data);
3576	}
3577
3578	/* have to enqueue one */
3579	memset(&it, 0, sizeof(it));
3580	it.it_op = IT_LAYOUT;
3581	lockh.cookie = 0ULL;
3582
3583	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3584			ll_get_fsname(inode->i_sb, NULL, 0), inode,
3585			PFID(&lli->lli_fid));
3586
3587	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3588			NULL, 0, NULL, 0);
3589	if (it.d.lustre.it_data != NULL)
3590		ptlrpc_req_finished(it.d.lustre.it_data);
3591	it.d.lustre.it_data = NULL;
3592
3593	ll_finish_md_op_data(op_data);
3594
3595	mode = it.d.lustre.it_lock_mode;
3596	it.d.lustre.it_lock_mode = 0;
3597	ll_intent_drop_lock(&it);
3598
3599	if (rc == 0) {
3600		/* set lock data in case this is a new lock */
3601		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3602		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3603		if (rc == -EAGAIN)
3604			goto again;
3605	}
3606	mutex_unlock(&lli->lli_layout_mutex);
3607
3608	return rc;
3609}
3610
3611/**
3612 *  This function send a restore request to the MDT
3613 */
3614int ll_layout_restore(struct inode *inode)
3615{
3616	struct hsm_user_request	*hur;
3617	int			 len, rc;
3618
3619	len = sizeof(struct hsm_user_request) +
3620	      sizeof(struct hsm_user_item);
3621	hur = kzalloc(len, GFP_NOFS);
3622	if (!hur)
3623		return -ENOMEM;
3624
3625	hur->hur_request.hr_action = HUA_RESTORE;
3626	hur->hur_request.hr_archive_id = 0;
3627	hur->hur_request.hr_flags = 0;
3628	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3629	       sizeof(hur->hur_user_item[0].hui_fid));
3630	hur->hur_user_item[0].hui_extent.length = -1;
3631	hur->hur_request.hr_itemcount = 1;
3632	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3633			   len, hur, NULL);
3634	OBD_FREE(hur, len);
3635	return rc;
3636}
3637