
file.c revision b0f5aad587ea1fc3563d056609ee54a961ee1256
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include "../include/lustre_dlm.h"
45#include "../include/lustre_lite.h"
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include "../include/lustre/ll_fiemap.h"
50
51#include "../include/cl_object.h"
52
53static int
54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55
56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57			  bool *lease_broken);
58
59static enum llioc_iter
60ll_iocontrol_call(struct inode *inode, struct file *file,
61		  unsigned int cmd, unsigned long arg, int *rcp);
62
63static struct ll_file_data *ll_file_data_get(void)
64{
65	struct ll_file_data *fd;
66
67	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
68	if (fd == NULL)
69		return NULL;
70	fd->fd_write_failed = false;
71	return fd;
72}
73
74static void ll_file_data_put(struct ll_file_data *fd)
75{
76	if (fd != NULL)
77		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
78}
79
80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81			  struct lustre_handle *fh)
82{
83	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84	op_data->op_attr.ia_mode = inode->i_mode;
85	op_data->op_attr.ia_atime = inode->i_atime;
86	op_data->op_attr.ia_mtime = inode->i_mtime;
87	op_data->op_attr.ia_ctime = inode->i_ctime;
88	op_data->op_attr.ia_size = i_size_read(inode);
89	op_data->op_attr_blocks = inode->i_blocks;
90	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91					ll_inode_to_ext_flags(inode->i_flags);
92	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
93	if (fh)
94		op_data->op_handle = *fh;
95	op_data->op_capa1 = ll_mdscapa_get(inode);
96
97	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
98		op_data->op_bias |= MDS_DATA_MODIFIED;
99}
100
101/**
102 * Closes the IO epoch and packs all the attributes into @op_data for
103 * the CLOSE rpc.
104 */
105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106			     struct obd_client_handle *och)
107{
108	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109					ATTR_MTIME | ATTR_MTIME_SET |
110					ATTR_CTIME | ATTR_CTIME_SET;
111
112	if (!(och->och_flags & FMODE_WRITE))
113		goto out;
114
115	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
117	else
118		ll_ioepoch_close(inode, op_data, &och, 0);
119
120out:
121	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122	ll_prep_md_op_data(op_data, inode, NULL, NULL,
123			   0, 0, LUSTRE_OPC_ANY, NULL);
124}
125
126static int ll_close_inode_openhandle(struct obd_export *md_exp,
127				     struct inode *inode,
128				     struct obd_client_handle *och,
129				     const __u64 *data_version)
130{
131	struct obd_export *exp = ll_i2mdexp(inode);
132	struct md_op_data *op_data;
133	struct ptlrpc_request *req = NULL;
134	struct obd_device *obd = class_exp2obd(exp);
135	int epoch_close = 1;
136	int rc;
137
138	if (obd == NULL) {
139		/*
140		 * XXX: in case of LMV, is it correct to access
141		 * ->exp_handle?
142		 */
143		CERROR("Invalid MDC connection handle "LPX64"\n",
144		       ll_i2mdexp(inode)->exp_handle.h_cookie);
145		GOTO(out, rc = 0);
146	}
147
148	OBD_ALLOC_PTR(op_data);
149	if (op_data == NULL)
150		GOTO(out, rc = -ENOMEM); /* XXX We leak openhandle and request here. */
151
152	ll_prepare_close(inode, op_data, och);
153	if (data_version != NULL) {
154		/* Pass in data_version implies release. */
155		op_data->op_bias |= MDS_HSM_RELEASE;
156		op_data->op_data_version = *data_version;
157		op_data->op_lease_handle = och->och_lease_handle;
158		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
159	}
160	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
161	rc = md_close(md_exp, op_data, och->och_mod, &req);
162	if (rc == -EAGAIN) {
163		/* This close must have the epoch closed. */
164		LASSERT(epoch_close);
165		/* MDS has instructed us to obtain the Size-on-MDS attribute
166		 * from the OSTs and send a setattr back to the MDS. */
167		rc = ll_som_update(inode, op_data);
168		if (rc) {
169			CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
170			       inode->i_ino, rc);
171			rc = 0;
172		}
173	} else if (rc) {
174		CERROR("inode %lu mdc close failed: rc = %d\n",
175		       inode->i_ino, rc);
176	}
177
178	/* The DATA_MODIFIED flag was successfully sent on close, so cancel
179	 * the data modification flag. */
180	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
181		struct ll_inode_info *lli = ll_i2info(inode);
182
183		spin_lock(&lli->lli_lock);
184		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
185		spin_unlock(&lli->lli_lock);
186	}
187
188	if (rc == 0) {
189		rc = ll_objects_destroy(req, inode);
190		if (rc)
191			CERROR("inode %lu ll_objects destroy: rc = %d\n",
192			       inode->i_ino, rc);
193	}
194	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
195		struct mdt_body *body;
196		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
197		if (!(body->valid & OBD_MD_FLRELEASED))
198			rc = -EBUSY;
199	}
200
201	ll_finish_md_op_data(op_data);
202
203out:
204	if (exp_connect_som(exp) && !epoch_close &&
205	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
206		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
207	} else {
208		md_clear_open_replay_data(md_exp, och);
209		/* Free @och if it is not waiting for DONE_WRITING. */
210		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
211		OBD_FREE_PTR(och);
212	}
213	if (req) /* This is close request */
214		ptlrpc_req_finished(req);
215	return rc;
216}
217
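/**
 * Close the cached MDS open handle for the given open mode (read, write or
 * exec), unless other local users of that handle still exist, in which case
 * the handle is left in place and 0 is returned.
 */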
218int ll_md_real_close(struct inode *inode, fmode_t fmode)
219{
220	struct ll_inode_info *lli = ll_i2info(inode);
221	struct obd_client_handle **och_p;
222	struct obd_client_handle *och;
223	__u64 *och_usecount;
224	int rc = 0;
225
226	if (fmode & FMODE_WRITE) {
227		och_p = &lli->lli_mds_write_och;
228		och_usecount = &lli->lli_open_fd_write_count;
229	} else if (fmode & FMODE_EXEC) {
230		och_p = &lli->lli_mds_exec_och;
231		och_usecount = &lli->lli_open_fd_exec_count;
232	} else {
233		LASSERT(fmode & FMODE_READ);
234		och_p = &lli->lli_mds_read_och;
235		och_usecount = &lli->lli_open_fd_read_count;
236	}
237
238	mutex_lock(&lli->lli_och_mutex);
239	if (*och_usecount > 0) {
240		/* There are still users of this handle, so skip
241		 * freeing it. */
242		mutex_unlock(&lli->lli_och_mutex);
243		return 0;
244	}
245
246	och = *och_p;
247	*och_p = NULL;
248	mutex_unlock(&lli->lli_och_mutex);
249
250	if (och != NULL) {
251		/* There might be a race and this handle may already
252		   be closed. */
253		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
254					       inode, och, NULL);
255	}
256
257	return rc;
258}
259
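/**
 * Per-descriptor close path: drop a group lock or lease that is still held,
 * close fd->fd_och if this descriptor owns an open handle directly, or drop
 * the per-mode open count and skip ll_md_real_close() when a matching OPEN
 * lock is still cached, so the MDS does not have to be told about this close.
 */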
260static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
261		       struct file *file)
262{
263	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
264	struct ll_inode_info *lli = ll_i2info(inode);
265	int rc = 0;
266
267	/* clear group lock, if present */
268	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
269		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
270
271	if (fd->fd_lease_och != NULL) {
272		bool lease_broken;
273
274		/* Usually the lease is not released when the
275		 * application crashes, so we need to release it here. */
276		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
277		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
278			PFID(&lli->lli_fid), rc, lease_broken);
279
280		fd->fd_lease_och = NULL;
281	}
282
283	if (fd->fd_och != NULL) {
284		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
285		fd->fd_och = NULL;
286		GOTO(out, rc);
287	}
288
289	/* Let's see if we have a good enough OPEN lock on the file and
290	   whether we can skip talking to the MDS */
291	if (file->f_dentry->d_inode) { /* Can this ever be false? */
292		int lockmode;
293		__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
294		struct lustre_handle lockh;
295		struct inode *inode = file->f_dentry->d_inode;
296		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
297
298		mutex_lock(&lli->lli_och_mutex);
299		if (fd->fd_omode & FMODE_WRITE) {
300			lockmode = LCK_CW;
301			LASSERT(lli->lli_open_fd_write_count);
302			lli->lli_open_fd_write_count--;
303		} else if (fd->fd_omode & FMODE_EXEC) {
304			lockmode = LCK_PR;
305			LASSERT(lli->lli_open_fd_exec_count);
306			lli->lli_open_fd_exec_count--;
307		} else {
308			lockmode = LCK_CR;
309			LASSERT(lli->lli_open_fd_read_count);
310			lli->lli_open_fd_read_count--;
311		}
312		mutex_unlock(&lli->lli_och_mutex);
313
314		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
315				   LDLM_IBITS, &policy, lockmode,
316				   &lockh)) {
317			rc = ll_md_real_close(file->f_dentry->d_inode,
318					      fd->fd_omode);
319		}
320	} else {
321		CERROR("Releasing a file %p with negative dentry %p. Name %s\n",
322		       file, file->f_dentry, file->f_dentry->d_name.name);
323	}
324
325out:
326	LUSTRE_FPRIVATE(file) = NULL;
327	ll_file_data_put(fd);
328	ll_capa_close(inode);
329
330	return rc;
331}
332
333/* While this returns an error code, the caller (fput()) ignores it, so we
334 * need to make every effort to clean up all of our state here.  Also,
335 * applications rarely check close errors and, even if an error is returned,
336 * they will not retry the close call.
337 */
338int ll_file_release(struct inode *inode, struct file *file)
339{
340	struct ll_file_data *fd;
341	struct ll_sb_info *sbi = ll_i2sbi(inode);
342	struct ll_inode_info *lli = ll_i2info(inode);
343	int rc;
344
345	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
346	       inode->i_generation, inode);
347
348#ifdef CONFIG_FS_POSIX_ACL
349	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
350	    inode == inode->i_sb->s_root->d_inode) {
351		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
352
353		LASSERT(fd != NULL);
354		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
355			fd->fd_flags &= ~LL_FILE_RMTACL;
356			rct_del(&sbi->ll_rct, current_pid());
357			et_search_free(&sbi->ll_et, current_pid());
358		}
359	}
360#endif
361
362	if (inode->i_sb->s_root != file->f_dentry)
363		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
364	fd = LUSTRE_FPRIVATE(file);
365	LASSERT(fd != NULL);
366
367	/* The last ref on @file, maybe not from the owner pid of statahead.
368	 * Different processes can open the same dir; "ll_opendir_key" means
369	 * it is we who should stop the statahead thread. */
370	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
371	    lli->lli_opendir_pid != 0)
372		ll_stop_statahead(inode, lli->lli_opendir_key);
373
374	if (inode->i_sb->s_root == file->f_dentry) {
375		LUSTRE_FPRIVATE(file) = NULL;
376		ll_file_data_put(fd);
377		return 0;
378	}
379
380	if (!S_ISDIR(inode->i_mode)) {
381		lov_read_and_clear_async_rc(lli->lli_clob);
382		lli->lli_async_rc = 0;
383	}
384
385	rc = ll_md_close(sbi->ll_md_exp, inode, file);
386
387	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
388		libcfs_debug_dumplog();
389
390	return rc;
391}
392
393static int ll_intent_file_open(struct file *file, void *lmm,
394			       int lmmsize, struct lookup_intent *itp)
395{
396	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
397	struct dentry *parent = file->f_dentry->d_parent;
398	const char *name = file->f_dentry->d_name.name;
399	const int len = file->f_dentry->d_name.len;
400	struct md_op_data *op_data;
401	struct ptlrpc_request *req;
402	__u32 opc = LUSTRE_OPC_ANY;
403	int rc;
404
405	if (!parent)
406		return -ENOENT;
407
408	/* Usually we come here only for NFSD, and we want an open lock.
409	   But we can also get here with pre-2.6.15 patchless kernels, and in
410	   that case that lock is also ok */
411	/* We can also get here if there was a cached open handle in revalidate_it
412	 * but it disappeared while we were getting from there to ll_file_open.
413	 * But this means this file was closed and immediately opened, which
414	 * makes it a good candidate for using the OPEN lock */
415	/* If lmmsize & lmm are not 0, we are just setting stripe info
416	 * parameters. No need for the open lock */
417	if (lmm == NULL && lmmsize == 0) {
418		itp->it_flags |= MDS_OPEN_LOCK;
419		if (itp->it_flags & FMODE_WRITE)
420			opc = LUSTRE_OPC_CREATE;
421	}
422
423	op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
424				      file->f_dentry->d_inode, name, len,
425				      O_RDWR, opc, NULL);
426	if (IS_ERR(op_data))
427		return PTR_ERR(op_data);
428
429	itp->it_flags |= MDS_OPEN_BY_FID;
430	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
431			    0 /*unused */, &req, ll_md_blocking_ast, 0);
432	ll_finish_md_op_data(op_data);
433	if (rc == -ESTALE) {
434		/* Reason for keeping our own exit path: don't flood the
435		 * log with -ESTALE error messages.
436		 */
437		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
438		     it_open_error(DISP_OPEN_OPEN, itp))
439			GOTO(out, rc);
440		ll_release_openhandle(file->f_dentry, itp);
441		GOTO(out, rc);
442	}
443
444	if (it_disposition(itp, DISP_LOOKUP_NEG))
445		GOTO(out, rc = -ENOENT);
446
447	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
448		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
449		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
450		GOTO(out, rc);
451	}
452
453	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
454	if (!rc && itp->d.lustre.it_lock_mode)
455		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
456				 itp, NULL);
457
458out:
459	ptlrpc_req_finished(req);
460	ll_intent_drop_lock(itp);
461
462	return rc;
463}
464
465/**
466 * Assign an obtained @ioepoch to the client's inode. No lock is needed: the
467 * MDS does not trust attributes while several ioepoch holders exist, and it
468 * also skips attributes for a previous ioepoch once a new one is opened.
469 */
470void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
471{
472	if (ioepoch && lli->lli_ioepoch != ioepoch) {
473		lli->lli_ioepoch = ioepoch;
474		CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
475		       ioepoch, PFID(&lli->lli_fid));
476	}
477}
478
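/**
 * Fill @och from the MDT reply attached to the intent: file handle, fid,
 * lease lock handle and open flags, then register it for open replay.
 */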
479static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
480		       struct obd_client_handle *och)
481{
482	struct ptlrpc_request *req = it->d.lustre.it_data;
483	struct mdt_body *body;
484
485	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
486	och->och_fh = body->handle;
487	och->och_fid = body->fid1;
488	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
489	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
490	och->och_flags = it->it_flags;
491
492	return md_set_open_replay_data(md_exp, och, it);
493}
494
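/**
 * Finish the client-side part of an open: fill the open handle from the
 * intent reply (if one was obtained), record the ioepoch, attach @fd to the
 * file, initialize readahead and remember the open mode for the later close.
 */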
495static int ll_local_open(struct file *file, struct lookup_intent *it,
496			 struct ll_file_data *fd, struct obd_client_handle *och)
497{
498	struct inode *inode = file->f_dentry->d_inode;
499	struct ll_inode_info *lli = ll_i2info(inode);
500
501	LASSERT(!LUSTRE_FPRIVATE(file));
502
503	LASSERT(fd != NULL);
504
505	if (och) {
506		struct ptlrpc_request *req = it->d.lustre.it_data;
507		struct mdt_body *body;
508		int rc;
509
510		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
511		if (rc != 0)
512			return rc;
513
514		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
515		ll_ioepoch_open(lli, body->ioepoch);
516	}
517
518	LUSTRE_FPRIVATE(file) = fd;
519	ll_readahead_init(inode, &fd->fd_ras);
520	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
521	return 0;
522}
523
524/* Open a file, and (for the very first open) create objects on the OSTs at
525 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
526 * creation or open until ll_lov_setstripe() ioctl is called.
527 *
528 * If we already have the stripe MD locally then we don't request it in
529 * md_open(), by passing a lmm_size = 0.
530 *
531 * It is up to the application to ensure no other processes open this file
532 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
533 * used.  We might be able to avoid races of that sort by getting lli_open_sem
534 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
535 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
536 */
537int ll_file_open(struct inode *inode, struct file *file)
538{
539	struct ll_inode_info *lli = ll_i2info(inode);
540	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
541					  .it_flags = file->f_flags };
542	struct obd_client_handle **och_p = NULL;
543	__u64 *och_usecount = NULL;
544	struct ll_file_data *fd;
545	int rc = 0, opendir_set = 0;
546
547	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
548	       inode->i_generation, inode, file->f_flags);
549
550	it = file->private_data; /* XXX: compat macro */
551	file->private_data = NULL; /* prevent ll_local_open assertion */
552
553	fd = ll_file_data_get();
554	if (fd == NULL)
555		GOTO(out_openerr, rc = -ENOMEM);
556
557	fd->fd_file = file;
558	if (S_ISDIR(inode->i_mode)) {
559		spin_lock(&lli->lli_sa_lock);
560		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
561		    lli->lli_opendir_pid == 0) {
562			lli->lli_opendir_key = fd;
563			lli->lli_opendir_pid = current_pid();
564			opendir_set = 1;
565		}
566		spin_unlock(&lli->lli_sa_lock);
567	}
568
569	if (inode->i_sb->s_root == file->f_dentry) {
570		LUSTRE_FPRIVATE(file) = fd;
571		return 0;
572	}
573
574	if (!it || !it->d.lustre.it_disposition) {
575		/* Convert f_flags into an access mode. We cannot use
576		 * file->f_mode, because everything but the O_ACCMODE mask
577		 * was stripped from it */
578		if ((oit.it_flags + 1) & O_ACCMODE)
579			oit.it_flags++;
580		if (file->f_flags & O_TRUNC)
581			oit.it_flags |= FMODE_WRITE;
582
583		/* The kernel only calls f_op->open in dentry_open.  filp_open
584		 * calls dentry_open after open_namei has checked permissions.
585		 * Only nfsd_open calls dentry_open directly without checking
586		 * permissions, and because of that the code below is safe. */
587		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
588			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
589
590		/* We do not want O_EXCL here, presumably we opened the file
591		 * already? XXX - NFS implications? */
592		oit.it_flags &= ~O_EXCL;
593
594		/* bug20584: if "it_flags" contains O_CREAT, the file will be
595		 * created if necessary, so "IT_CREAT" should be set to stay
596		 * consistent with it */
597		if (oit.it_flags & O_CREAT)
598			oit.it_op |= IT_CREAT;
599
600		it = &oit;
601	}
602
603restart:
604	/* Let's see if we already have the file open on the MDS. */
605	if (it->it_flags & FMODE_WRITE) {
606		och_p = &lli->lli_mds_write_och;
607		och_usecount = &lli->lli_open_fd_write_count;
608	} else if (it->it_flags & FMODE_EXEC) {
609		och_p = &lli->lli_mds_exec_och;
610		och_usecount = &lli->lli_open_fd_exec_count;
611	} else {
612		och_p = &lli->lli_mds_read_och;
613		och_usecount = &lli->lli_open_fd_read_count;
614	}
615
616	mutex_lock(&lli->lli_och_mutex);
617	if (*och_p) { /* Open handle is present */
618		if (it_disposition(it, DISP_OPEN_OPEN)) {
619			/* Well, there's an extra open request that we do not
620			   need; let's close it somehow. This will decref the request. */
621			rc = it_open_error(DISP_OPEN_OPEN, it);
622			if (rc) {
623				mutex_unlock(&lli->lli_och_mutex);
624				GOTO(out_openerr, rc);
625			}
626
627			ll_release_openhandle(file->f_dentry, it);
628		}
629		(*och_usecount)++;
630
631		rc = ll_local_open(file, it, fd, NULL);
632		if (rc) {
633			(*och_usecount)--;
634			mutex_unlock(&lli->lli_och_mutex);
635			GOTO(out_openerr, rc);
636		}
637	} else {
638		LASSERT(*och_usecount == 0);
639		if (!it->d.lustre.it_disposition) {
640			/* We cannot just request a lock handle now; the new ELC
641			   code means that one of the other OPEN locks for this
642			   file could be cancelled, and since the blocking AST
643			   handler would attempt to grab och_mutex as well, that
644			   would result in a deadlock */
645			mutex_unlock(&lli->lli_och_mutex);
646			it->it_create_mode |= M_CHECK_STALE;
647			rc = ll_intent_file_open(file, NULL, 0, it);
648			it->it_create_mode &= ~M_CHECK_STALE;
649			if (rc)
650				GOTO(out_openerr, rc);
651
652			goto restart;
653		}
654		OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
655		if (!*och_p)
656			GOTO(out_och_free, rc = -ENOMEM);
657
658		(*och_usecount)++;
659
660		/* md_intent_lock() didn't get a request ref if there was an
661		 * open error, so don't do cleanup on the request here
662		 * (bug 3430) */
663		/* XXX (green): Shouldn't we bail out on any error here, not
664		 * just an open error? */
665		rc = it_open_error(DISP_OPEN_OPEN, it);
666		if (rc)
667			GOTO(out_och_free, rc);
668
669		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
670
671		rc = ll_local_open(file, it, fd, *och_p);
672		if (rc)
673			GOTO(out_och_free, rc);
674	}
675	mutex_unlock(&lli->lli_och_mutex);
676	fd = NULL;
677
678	/* Must do this outside the lli_och_mutex lock to prevent a deadlock
679	   where a different kind of OPEN lock for this same inode gets
680	   cancelled by ldlm_cancel_lru */
681	if (!S_ISREG(inode->i_mode))
682		GOTO(out_och_free, rc);
683
684	ll_capa_open(inode);
685
686	if (!lli->lli_has_smd &&
687	    (cl_is_lov_delay_create(file->f_flags) ||
688	     (file->f_mode & FMODE_WRITE) == 0)) {
689		CDEBUG(D_INODE, "object creation was delayed\n");
690		GOTO(out_och_free, rc);
691	}
692	cl_lov_delay_create_clear(&file->f_flags);
693	GOTO(out_och_free, rc);
694
695out_och_free:
696	if (rc) {
697		if (och_p && *och_p) {
698			OBD_FREE(*och_p, sizeof(struct obd_client_handle));
699			*och_p = NULL; /* OBD_FREE writes some magic there */
700			(*och_usecount)--;
701		}
702		mutex_unlock(&lli->lli_och_mutex);
703
704out_openerr:
705		if (opendir_set != 0)
706			ll_stop_statahead(inode, lli->lli_opendir_key);
707		if (fd != NULL)
708			ll_file_data_put(fd);
709	} else {
710		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
711	}
712
713	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
714		ptlrpc_req_finished(it->d.lustre.it_data);
715		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
716	}
717
718	return rc;
719}
720
721static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
722			struct ldlm_lock_desc *desc, void *data, int flag)
723{
724	int rc;
725	struct lustre_handle lockh;
726
727	switch (flag) {
728	case LDLM_CB_BLOCKING:
729		ldlm_lock2handle(lock, &lockh);
730		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
731		if (rc < 0) {
732			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
733			return rc;
734		}
735		break;
736	case LDLM_CB_CANCELING:
737		/* do nothing */
738		break;
739	}
740	return 0;
741}
742
743/**
744 * Acquire a lease and open the file.
745 */
746static struct obd_client_handle *
747ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
748	      __u64 open_flags)
749{
750	struct lookup_intent it = { .it_op = IT_OPEN };
751	struct ll_sb_info *sbi = ll_i2sbi(inode);
752	struct md_op_data *op_data;
753	struct ptlrpc_request *req;
754	struct lustre_handle old_handle = { 0 };
755	struct obd_client_handle *och = NULL;
756	int rc;
757	int rc2;
758
759	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
760		return ERR_PTR(-EINVAL);
761
762	if (file != NULL) {
763		struct ll_inode_info *lli = ll_i2info(inode);
764		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
765		struct obd_client_handle **och_p;
766		__u64 *och_usecount;
767
768		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
769			return ERR_PTR(-EPERM);
770
771		/* Get the openhandle of the file */
772		rc = -EBUSY;
773		mutex_lock(&lli->lli_och_mutex);
774		if (fd->fd_lease_och != NULL) {
775			mutex_unlock(&lli->lli_och_mutex);
776			return ERR_PTR(rc);
777		}
778
779		if (fd->fd_och == NULL) {
780			if (file->f_mode & FMODE_WRITE) {
781				LASSERT(lli->lli_mds_write_och != NULL);
782				och_p = &lli->lli_mds_write_och;
783				och_usecount = &lli->lli_open_fd_write_count;
784			} else {
785				LASSERT(lli->lli_mds_read_och != NULL);
786				och_p = &lli->lli_mds_read_och;
787				och_usecount = &lli->lli_open_fd_read_count;
788			}
789			if (*och_usecount == 1) {
790				fd->fd_och = *och_p;
791				*och_p = NULL;
792				*och_usecount = 0;
793				rc = 0;
794			}
795		}
796		mutex_unlock(&lli->lli_och_mutex);
797		if (rc < 0) /* more than 1 opener */
798			return ERR_PTR(rc);
799
800		LASSERT(fd->fd_och != NULL);
801		old_handle = fd->fd_och->och_fh;
802	}
803
804	OBD_ALLOC_PTR(och);
805	if (och == NULL)
806		return ERR_PTR(-ENOMEM);
807
808	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
809					LUSTRE_OPC_ANY, NULL);
810	if (IS_ERR(op_data))
811		GOTO(out, rc = PTR_ERR(op_data));
812
813	/* To tell the MDT this openhandle is from the same owner */
814	op_data->op_handle = old_handle;
815
816	it.it_flags = fmode | open_flags;
817	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
818	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
819				ll_md_blocking_lease_ast,
820	/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
821	 * otherwise it can be cancelled, which may mislead applications into
822	 * thinking the lease is broken;
823	 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
824	 * open in ll_md_blocking_ast(). Otherwise, as ll_md_blocking_lease_ast
825	 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
826				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
827	ll_finish_md_op_data(op_data);
828	ptlrpc_req_finished(req);
829	if (rc < 0)
830		GOTO(out_release_it, rc);
831
832	if (it_disposition(&it, DISP_LOOKUP_NEG))
833		GOTO(out_release_it, rc = -ENOENT);
834
835	rc = it_open_error(DISP_OPEN_OPEN, &it);
836	if (rc)
837		GOTO(out_release_it, rc);
838
839	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
840	ll_och_fill(sbi->ll_md_exp, &it, och);
841
842	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
843		GOTO(out_close, rc = -EOPNOTSUPP);
844
845	/* lease already granted, handle the lease lock */
846	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
847	if (it.d.lustre.it_lock_mode == 0 ||
848	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
849		/* an open lock must be returned for a lease */
850		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
851			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
852			it.d.lustre.it_lock_bits);
853		GOTO(out_close, rc = -EPROTO);
854	}
855
856	ll_intent_release(&it);
857	return och;
858
859out_close:
860	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
861	if (rc2)
862		CERROR("Close openhandle returned %d\n", rc2);
863
864	/* cancel open lock */
865	if (it.d.lustre.it_lock_mode != 0) {
866		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
867						it.d.lustre.it_lock_mode);
868		it.d.lustre.it_lock_mode = 0;
869	}
870out_release_it:
871	ll_intent_release(&it);
872out:
873	OBD_FREE_PTR(och);
874	return ERR_PTR(rc);
875}
876
877/**
878 * Release the lease and close the file.
879 * It also checks whether the lease has been broken.
880 */
881static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
882			  bool *lease_broken)
883{
884	struct ldlm_lock *lock;
885	bool cancelled = true;
886	int rc;
887
888	lock = ldlm_handle2lock(&och->och_lease_handle);
889	if (lock != NULL) {
890		lock_res_and_lock(lock);
891		cancelled = ldlm_is_cancel(lock);
892		unlock_res_and_lock(lock);
893		ldlm_lock_put(lock);
894	}
895
896	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
897		PFID(&ll_i2info(inode)->lli_fid), cancelled);
898
899	if (!cancelled)
900		ldlm_cli_cancel(&och->och_lease_handle, 0);
901	if (lease_broken != NULL)
902		*lease_broken = cancelled;
903
904	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
905				       NULL);
906	return rc;
907}
908
909/* Fills the obdo with the attributes for the lsm */
910static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
911			  struct obd_capa *capa, struct obdo *obdo,
912			  __u64 ioepoch, int sync)
913{
914	struct ptlrpc_request_set *set;
915	struct obd_info	    oinfo = { { { 0 } } };
916	int			rc;
917
918	LASSERT(lsm != NULL);
919
920	oinfo.oi_md = lsm;
921	oinfo.oi_oa = obdo;
922	oinfo.oi_oa->o_oi = lsm->lsm_oi;
923	oinfo.oi_oa->o_mode = S_IFREG;
924	oinfo.oi_oa->o_ioepoch = ioepoch;
925	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
926			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
927			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
928			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
929			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
930			       OBD_MD_FLDATAVERSION;
931	oinfo.oi_capa = capa;
932	if (sync) {
933		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
934		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
935	}
936
937	set = ptlrpc_prep_set();
938	if (set == NULL) {
939		CERROR("can't allocate ptlrpc set\n");
940		rc = -ENOMEM;
941	} else {
942		rc = obd_getattr_async(exp, &oinfo, set);
943		if (rc == 0)
944			rc = ptlrpc_set_wait(set);
945		ptlrpc_set_destroy(set);
946	}
947	if (rc == 0)
948		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
949					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
950					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
951					 OBD_MD_FLDATAVERSION);
952	return rc;
953}
954
955/**
956 * Perform the getattr on the inode and update its fields.
957 * If @sync != 0, perform the getattr under the server-side lock.
958 */
959int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
960		     __u64 ioepoch, int sync)
961{
962	struct obd_capa      *capa = ll_mdscapa_get(inode);
963	struct lov_stripe_md *lsm;
964	int rc;
965
966	lsm = ccc_inode_lsm_get(inode);
967	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
968			    capa, obdo, ioepoch, sync);
969	capa_put(capa);
970	if (rc == 0) {
971		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
972
973		obdo_refresh_inode(inode, obdo, obdo->o_valid);
974		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
975		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
976		       (unsigned long long)inode->i_blocks,
977		       (unsigned long)ll_inode_blksize(inode));
978	}
979	ccc_inode_lsm_put(inode, lsm);
980	return rc;
981}
982
983int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
984{
985	struct ll_inode_info *lli = ll_i2info(inode);
986	struct cl_object *obj = lli->lli_clob;
987	struct cl_attr *attr = ccc_env_thread_attr(env);
988	struct ost_lvb lvb;
989	int rc = 0;
990
991	ll_inode_size_lock(inode);
992	/* merge the timestamps most recently obtained from the MDS with
993	   the timestamps obtained from the OSTs */
994	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
995	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
996	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
997	inode_init_lvb(inode, &lvb);
998
999	cl_object_attr_lock(obj);
1000	rc = cl_object_attr_get(env, obj, attr);
1001	cl_object_attr_unlock(obj);
1002
1003	if (rc == 0) {
1004		if (lvb.lvb_atime < attr->cat_atime)
1005			lvb.lvb_atime = attr->cat_atime;
1006		if (lvb.lvb_ctime < attr->cat_ctime)
1007			lvb.lvb_ctime = attr->cat_ctime;
1008		if (lvb.lvb_mtime < attr->cat_mtime)
1009			lvb.lvb_mtime = attr->cat_mtime;
1010
1011		CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1012				PFID(&lli->lli_fid), attr->cat_size);
1013		cl_isize_write_nolock(inode, attr->cat_size);
1014
1015		inode->i_blocks = attr->cat_blocks;
1016
1017		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1018		LTIME_S(inode->i_atime) = lvb.lvb_atime;
1019		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1020	}
1021	ll_inode_size_unlock(inode);
1022
1023	return rc;
1024}
1025
1026int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1027		     lstat_t *st)
1028{
1029	struct obdo obdo = { 0 };
1030	int rc;
1031
1032	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1033	if (rc == 0) {
1034		st->st_size   = obdo.o_size;
1035		st->st_blocks = obdo.o_blocks;
1036		st->st_mtime  = obdo.o_mtime;
1037		st->st_atime  = obdo.o_atime;
1038		st->st_ctime  = obdo.o_ctime;
1039	}
1040	return rc;
1041}
1042
1043static bool file_is_noatime(const struct file *file)
1044{
1045	const struct vfsmount *mnt = file->f_path.mnt;
1046	const struct inode *inode = file->f_path.dentry->d_inode;
1047
1048	/* Adapted from file_accessed() and touch_atime(). */
1049	if (file->f_flags & O_NOATIME)
1050		return true;
1051
1052	if (inode->i_flags & S_NOATIME)
1053		return true;
1054
1055	if (IS_NOATIME(inode))
1056		return true;
1057
1058	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1059		return true;
1060
1061	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1062		return true;
1063
1064	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1065		return true;
1066
1067	return false;
1068}
1069
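/**
 * Initialize @io from the file flags: non-blocking, append and synchronous
 * write behaviour, the cl_io lock requirement (CILR_NEVER for "nolock"
 * files, CILR_MANDATORY for O_APPEND) and whether atime updates are skipped.
 */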
1070void ll_io_init(struct cl_io *io, const struct file *file, int write)
1071{
1072	struct inode *inode = file->f_dentry->d_inode;
1073
1074	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1075	if (write) {
1076		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1077		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1078				      file->f_flags & O_DIRECT ||
1079				      IS_SYNC(inode);
1080	}
1081	io->ci_obj     = ll_i2info(inode)->lli_clob;
1082	io->ci_lockreq = CILR_MAYBE;
1083	if (ll_file_nolock(file)) {
1084		io->ci_lockreq = CILR_NEVER;
1085		io->ci_no_srvlock = 1;
1086	} else if (file->f_flags & O_APPEND) {
1087		io->ci_lockreq = CILR_MANDATORY;
1088	}
1089
1090	io->ci_noatime = file_is_noatime(file);
1091}
1092
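/**
 * Common engine for read, write and splice IO: set up the cl_io, take
 * lli_write_mutex for normal writes or lli_trunc_sem for reads, run
 * cl_io_loop(), restart the IO when required and nothing was transferred,
 * and account the bytes moved in the per-mount statistics.
 */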
1093static ssize_t
1094ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1095		   struct file *file, enum cl_io_type iot,
1096		   loff_t *ppos, size_t count)
1097{
1098	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1099	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1100	struct cl_io	 *io;
1101	ssize_t	       result;
1102
1103restart:
1104	io = ccc_env_thread_io(env);
1105	ll_io_init(io, file, iot == CIT_WRITE);
1106
1107	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1108		struct vvp_io *vio = vvp_env_io(env);
1109		struct ccc_io *cio = ccc_env_io(env);
1110		int write_mutex_locked = 0;
1111
1112		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1113		vio->cui_io_subtype = args->via_io_subtype;
1114
1115		switch (vio->cui_io_subtype) {
1116		case IO_NORMAL:
1117			cio->cui_iter = args->u.normal.via_iter;
1118			cio->cui_iocb = args->u.normal.via_iocb;
1119			if ((iot == CIT_WRITE) &&
1120			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1121				if (mutex_lock_interruptible(
1122						&lli->lli_write_mutex))
1123					GOTO(out, result = -ERESTARTSYS);
1124				write_mutex_locked = 1;
1125			} else if (iot == CIT_READ) {
1126				down_read(&lli->lli_trunc_sem);
1127			}
1128			break;
1129		case IO_SPLICE:
1130			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1131			vio->u.splice.cui_flags = args->u.splice.via_flags;
1132			break;
1133		default:
1134			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1135			LBUG();
1136		}
1137		result = cl_io_loop(env, io);
1138		if (write_mutex_locked)
1139			mutex_unlock(&lli->lli_write_mutex);
1140		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1141			up_read(&lli->lli_trunc_sem);
1142	} else {
1143		/* cl_io_rw_init() handled IO */
1144		result = io->ci_result;
1145	}
1146
1147	if (io->ci_nob > 0) {
1148		result = io->ci_nob;
1149		*ppos = io->u.ci_wr.wr.crw_pos;
1150	}
1151	GOTO(out, result);
1152out:
1153	cl_io_fini(env, io);
1154	/* If anything has been read/written (result != 0), we just return
1155	 * a short read/write instead of restarting the IO. */
1156	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1157		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1158		       iot == CIT_READ ? "read" : "write",
1159		       file->f_dentry->d_name.name, *ppos, count);
1160		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1161		goto restart;
1162	}
1163
1164	if (iot == CIT_READ) {
1165		if (result >= 0)
1166			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1167					   LPROC_LL_READ_BYTES, result);
1168	} else if (iot == CIT_WRITE) {
1169		if (result >= 0) {
1170			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1171					   LPROC_LL_WRITE_BYTES, result);
1172			fd->fd_write_failed = false;
1173		} else if (result != -ERESTARTSYS) {
1174			fd->fd_write_failed = true;
1175		}
1176	}
1177
1178	return result;
1179}
1180
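/*
 * Read from a file (through the page cache).
 */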
1181static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1182{
1183	struct lu_env      *env;
1184	struct vvp_io_args *args;
1185	ssize_t	     result;
1186	int		 refcheck;
1187
1188	env = cl_env_get(&refcheck);
1189	if (IS_ERR(env))
1190		return PTR_ERR(env);
1191
1192	args = vvp_env_args(env, IO_NORMAL);
1193	args->u.normal.via_iter = to;
1194	args->u.normal.via_iocb = iocb;
1195
1196	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1197				    &iocb->ki_pos, iov_iter_count(to));
1198	cl_env_put(env, &refcheck);
1199	return result;
1200}
1201
1202/*
1203 * Write to a file (through the page cache).
1204 */
1205static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1206{
1207	struct lu_env      *env;
1208	struct vvp_io_args *args;
1209	ssize_t	     result;
1210	int		 refcheck;
1211
1212	env = cl_env_get(&refcheck);
1213	if (IS_ERR(env))
1214		return PTR_ERR(env);
1215
1216	args = vvp_env_args(env, IO_NORMAL);
1217	args->u.normal.via_iter = from;
1218	args->u.normal.via_iocb = iocb;
1219
1220	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1221				  &iocb->ki_pos, iov_iter_count(from));
1222	cl_env_put(env, &refcheck);
1223	return result;
1224}
1225
1226/*
1227 * Send file content (through the page cache) somewhere with a helper.
1228 */
1229static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1230				   struct pipe_inode_info *pipe, size_t count,
1231				   unsigned int flags)
1232{
1233	struct lu_env      *env;
1234	struct vvp_io_args *args;
1235	ssize_t	     result;
1236	int		 refcheck;
1237
1238	env = cl_env_get(&refcheck);
1239	if (IS_ERR(env))
1240		return PTR_ERR(env);
1241
1242	args = vvp_env_args(env, IO_SPLICE);
1243	args->u.splice.via_pipe = pipe;
1244	args->u.splice.via_flags = flags;
1245
1246	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1247	cl_env_put(env, &refcheck);
1248	return result;
1249}
1250
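/**
 * Re-create the OST object named by @oi on OST index @ost_idx: copy the
 * current stripe MD and call obd_create() with OBD_FL_RECREATE_OBJS set.
 */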
1251static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1252			   obd_count ost_idx)
1253{
1254	struct obd_export *exp = ll_i2dtexp(inode);
1255	struct obd_trans_info oti = { 0 };
1256	struct obdo *oa = NULL;
1257	int lsm_size;
1258	int rc = 0;
1259	struct lov_stripe_md *lsm = NULL, *lsm2;
1260
1261	OBDO_ALLOC(oa);
1262	if (oa == NULL)
1263		return -ENOMEM;
1264
1265	lsm = ccc_inode_lsm_get(inode);
1266	if (!lsm_has_objects(lsm))
1267		GOTO(out, rc = -ENOENT);
1268
1269	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1270		   (lsm->lsm_stripe_count));
1271
1272	OBD_ALLOC_LARGE(lsm2, lsm_size);
1273	if (lsm2 == NULL)
1274		GOTO(out, rc = -ENOMEM);
1275
1276	oa->o_oi = *oi;
1277	oa->o_nlink = ost_idx;
1278	oa->o_flags |= OBD_FL_RECREATE_OBJS;
1279	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1280	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1281				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1282	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1283	memcpy(lsm2, lsm, lsm_size);
1284	ll_inode_size_lock(inode);
1285	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1286	ll_inode_size_unlock(inode);
1287
1288	OBD_FREE_LARGE(lsm2, lsm_size);
1289	GOTO(out, rc);
1290out:
1291	ccc_inode_lsm_put(inode, lsm);
1292	OBDO_FREE(oa);
1293	return rc;
1294}
1295
1296static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1297{
1298	struct ll_recreate_obj ucreat;
1299	struct ost_id		oi;
1300
1301	if (!capable(CFS_CAP_SYS_ADMIN))
1302		return -EPERM;
1303
1304	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1305			   sizeof(ucreat)))
1306		return -EFAULT;
1307
1308	ostid_set_seq_mdt0(&oi);
1309	ostid_set_id(&oi, ucreat.lrc_id);
1310	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1311}
1312
1313static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1314{
1315	struct lu_fid	fid;
1316	struct ost_id	oi;
1317	obd_count	ost_idx;
1318
1319	if (!capable(CFS_CAP_SYS_ADMIN))
1320		return -EPERM;
1321
1322	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1323		return -EFAULT;
1324
1325	fid_to_ostid(&fid, &oi);
1326	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1327	return ll_lov_recreate(inode, &oi, ost_idx);
1328}
1329
1330int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1331			     int flags, struct lov_user_md *lum, int lum_size)
1332{
1333	struct lov_stripe_md *lsm = NULL;
1334	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1335	int rc = 0;
1336
1337	lsm = ccc_inode_lsm_get(inode);
1338	if (lsm != NULL) {
1339		ccc_inode_lsm_put(inode, lsm);
1340		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1341		       inode->i_ino);
1342		GOTO(out, rc = -EEXIST);
1343	}
1344
1345	ll_inode_size_lock(inode);
1346	rc = ll_intent_file_open(file, lum, lum_size, &oit);
1347	if (rc)
1348		GOTO(out_unlock, rc);
1349	rc = oit.d.lustre.it_status;
1350	if (rc < 0)
1351		GOTO(out_req_free, rc);
1352
1353	ll_release_openhandle(file->f_dentry, &oit);
1354
1355out_unlock:
1356	ll_inode_size_unlock(inode);
1357	ll_intent_release(&oit);
1358	ccc_inode_lsm_put(inode, lsm);
1359out:
1360	cl_lov_delay_create_clear(&file->f_flags);
1361	return rc;
1362out_req_free:
1363	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1364	goto out;
1365}
1366
1367int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1368			     struct lov_mds_md **lmmp, int *lmm_size,
1369			     struct ptlrpc_request **request)
1370{
1371	struct ll_sb_info *sbi = ll_i2sbi(inode);
1372	struct mdt_body  *body;
1373	struct lov_mds_md *lmm = NULL;
1374	struct ptlrpc_request *req = NULL;
1375	struct md_op_data *op_data;
1376	int rc, lmmsize;
1377
1378	rc = ll_get_default_mdsize(sbi, &lmmsize);
1379	if (rc)
1380		return rc;
1381
1382	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1383				     strlen(filename), lmmsize,
1384				     LUSTRE_OPC_ANY, NULL);
1385	if (IS_ERR(op_data))
1386		return PTR_ERR(op_data);
1387
1388	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1389	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1390	ll_finish_md_op_data(op_data);
1391	if (rc < 0) {
1392		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1393		       filename, rc);
1394		GOTO(out, rc);
1395	}
1396
1397	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1398	LASSERT(body != NULL); /* checked by mdc_getattr_name */
1399
1400	lmmsize = body->eadatasize;
1401
1402	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1403			lmmsize == 0) {
1404		GOTO(out, rc = -ENODATA);
1405	}
1406
1407	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1408	LASSERT(lmm != NULL);
1409
1410	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1411	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1412		GOTO(out, rc = -EPROTO);
1413	}
1414
1415	/*
1416	 * This is coming from the MDS, so is probably in
1417	 * little endian.  We convert it to host endian before
1418	 * passing it to userspace.
1419	 */
1420	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1421		int stripe_count;
1422
1423		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1424		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1425			stripe_count = 0;
1426
1427		/* if the function was called for a directory, we should
1428		 * avoid swabbing non-existent lsm objects */
1429		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1430			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1431			if (S_ISREG(body->mode))
1432				lustre_swab_lov_user_md_objects(
1433				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1434				 stripe_count);
1435		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1436			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1437			if (S_ISREG(body->mode))
1438				lustre_swab_lov_user_md_objects(
1439				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1440				 stripe_count);
1441		}
1442	}
1443
1444out:
1445	*lmmp = lmm;
1446	*lmm_size = lmmsize;
1447	*request = req;
1448	return rc;
1449}
1450
1451static int ll_lov_setea(struct inode *inode, struct file *file,
1452			    unsigned long arg)
1453{
1454	int			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1455	struct lov_user_md	*lump;
1456	int			 lum_size = sizeof(struct lov_user_md) +
1457					    sizeof(struct lov_user_ost_data);
1458	int			 rc;
1459
1460	if (!capable(CFS_CAP_SYS_ADMIN))
1461		return -EPERM;
1462
1463	OBD_ALLOC_LARGE(lump, lum_size);
1464	if (lump == NULL)
1465		return -ENOMEM;
1466
1467	if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1468		OBD_FREE_LARGE(lump, lum_size);
1469		return -EFAULT;
1470	}
1471
1472	rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1473
1474	OBD_FREE_LARGE(lump, lum_size);
1475	return rc;
1476}
1477
1478static int ll_lov_setstripe(struct inode *inode, struct file *file,
1479			    unsigned long arg)
1480{
1481	struct lov_user_md_v3	 lumv3;
1482	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
1483	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
1484	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
1485	int			 lum_size, rc;
1486	int			 flags = FMODE_WRITE;
1487
1488	/* first try with v1 which is smaller than v3 */
1489	lum_size = sizeof(struct lov_user_md_v1);
1490	if (copy_from_user(lumv1, lumv1p, lum_size))
1491		return -EFAULT;
1492
1493	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1494		lum_size = sizeof(struct lov_user_md_v3);
1495		if (copy_from_user(&lumv3, lumv3p, lum_size))
1496			return -EFAULT;
1497	}
1498
1499	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1500	if (rc == 0) {
1501		struct lov_stripe_md *lsm;
1502		__u32 gen;
1503
1504		put_user(0, &lumv1p->lmm_stripe_count);
1505
1506		ll_layout_refresh(inode, &gen);
1507		lsm = ccc_inode_lsm_get(inode);
1508		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1509				   0, lsm, (void *)arg);
1510		ccc_inode_lsm_put(inode, lsm);
1511	}
1512	return rc;
1513}
1514
1515static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1516{
1517	struct lov_stripe_md *lsm;
1518	int rc = -ENODATA;
1519
1520	lsm = ccc_inode_lsm_get(inode);
1521	if (lsm != NULL)
1522		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1523				   lsm, (void *)arg);
1524	ccc_inode_lsm_put(inode, lsm);
1525	return rc;
1526}
1527
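/**
 * Take a group lock with group id @arg on behalf of this file descriptor,
 * failing with -EINVAL if the descriptor already holds one.
 */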
1528static int
1529ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1530{
1531	struct ll_inode_info   *lli = ll_i2info(inode);
1532	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1533	struct ccc_grouplock    grouplock;
1534	int		     rc;
1535
1536	if (ll_file_nolock(file))
1537		return -EOPNOTSUPP;
1538
1539	spin_lock(&lli->lli_lock);
1540	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1541		CWARN("group lock already exists with gid %lu\n",
1542		      fd->fd_grouplock.cg_gid);
1543		spin_unlock(&lli->lli_lock);
1544		return -EINVAL;
1545	}
1546	LASSERT(fd->fd_grouplock.cg_lock == NULL);
1547	spin_unlock(&lli->lli_lock);
1548
1549	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1550			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
1551	if (rc)
1552		return rc;
1553
1554	spin_lock(&lli->lli_lock);
1555	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1556		spin_unlock(&lli->lli_lock);
1557		CERROR("another thread just won the race\n");
1558		cl_put_grouplock(&grouplock);
1559		return -EINVAL;
1560	}
1561
1562	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1563	fd->fd_grouplock = grouplock;
1564	spin_unlock(&lli->lli_lock);
1565
1566	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1567	return 0;
1568}
1569
1570int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1571{
1572	struct ll_inode_info   *lli = ll_i2info(inode);
1573	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1574	struct ccc_grouplock    grouplock;
1575
1576	spin_lock(&lli->lli_lock);
1577	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1578		spin_unlock(&lli->lli_lock);
1579		CWARN("no group lock held\n");
1580		return -EINVAL;
1581	}
1582	LASSERT(fd->fd_grouplock.cg_lock != NULL);
1583
1584	if (fd->fd_grouplock.cg_gid != arg) {
1585		CWARN("group lock %lu doesn't match current id %lu\n",
1586		       arg, fd->fd_grouplock.cg_gid);
1587		spin_unlock(&lli->lli_lock);
1588		return -EINVAL;
1589	}
1590
1591	grouplock = fd->fd_grouplock;
1592	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1593	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1594	spin_unlock(&lli->lli_lock);
1595
1596	cl_put_grouplock(&grouplock);
1597	CDEBUG(D_INFO, "group lock %lu released\n", arg);
1598	return 0;
1599}
1600
1601/**
1602 * Close inode open handle
1603 *
1604 * \param dentry [in]     dentry which contains the inode
1605 * \param it     [in,out] intent which contains open info and result
1606 *
1607 * \retval 0     success
1608 * \retval <0    failure
1609 */
1610int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1611{
1612	struct inode *inode = dentry->d_inode;
1613	struct obd_client_handle *och;
1614	int rc;
1615
1616	LASSERT(inode);
1617
1618	/* Root ? Do nothing. */
1619	if (dentry->d_inode->i_sb->s_root == dentry)
1620		return 0;
1621
1622	/* No open handle to close? Move away */
1623	if (!it_disposition(it, DISP_OPEN_OPEN))
1624		return 0;
1625
1626	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1627
1628	OBD_ALLOC(och, sizeof(*och));
1629	if (!och)
1630		GOTO(out, rc = -ENOMEM);
1631
1632	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1633
1634	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1635				       inode, och, NULL);
1636out:
1637	/* this one is in place of ll_file_open */
1638	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1639		ptlrpc_req_finished(it->d.lustre.it_data);
1640		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1641	}
1642	return rc;
1643}
1644
1645/**
1646 * Get the size of the inode for which the FIEMAP mapping is requested.
1647 * Make the FIEMAP get_info call and return the result.
1648 */
1649static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1650			size_t num_bytes)
1651{
1652	struct obd_export *exp = ll_i2dtexp(inode);
1653	struct lov_stripe_md *lsm = NULL;
1654	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1655	__u32 vallen = num_bytes;
1656	int rc;
1657
1658	/* Checks for fiemap flags */
1659	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1660		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1661		return -EBADR;
1662	}
1663
1664	/* Check for FIEMAP_FLAG_SYNC */
1665	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1666		rc = filemap_fdatawrite(inode->i_mapping);
1667		if (rc)
1668			return rc;
1669	}
1670
1671	lsm = ccc_inode_lsm_get(inode);
1672	if (lsm == NULL)
1673		return -ENOENT;
1674
1675	/* If the stripe_count > 1 and the application does not understand
1676	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1677	 */
1678	if (lsm->lsm_stripe_count > 1 &&
1679	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1680		GOTO(out, rc = -EOPNOTSUPP);
1681
1682	fm_key.oa.o_oi = lsm->lsm_oi;
1683	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1684
1685	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1686	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1687	/* If filesize is 0, then there would be no objects for mapping */
1688	if (fm_key.oa.o_size == 0) {
1689		fiemap->fm_mapped_extents = 0;
1690		GOTO(out, rc = 0);
1691	}
1692
1693	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1694
1695	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1696			  fiemap, lsm);
1697	if (rc)
1698		CERROR("obd_get_info failed: rc = %d\n", rc);
1699
1700out:
1701	ccc_inode_lsm_put(inode, lsm);
1702	return rc;
1703}
1704
1705int ll_fid2path(struct inode *inode, void *arg)
1706{
1707	struct obd_export	*exp = ll_i2mdexp(inode);
1708	struct getinfo_fid2path	*gfout, *gfin;
1709	int			 outsize, rc;
1710
1711	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1712	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1713		return -EPERM;
1714
1715	/* Need to get the buflen */
1716	OBD_ALLOC_PTR(gfin);
1717	if (gfin == NULL)
1718		return -ENOMEM;
1719	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1720		OBD_FREE_PTR(gfin);
1721		return -EFAULT;
1722	}
1723
1724	outsize = sizeof(*gfout) + gfin->gf_pathlen;
1725	OBD_ALLOC(gfout, outsize);
1726	if (gfout == NULL) {
1727		OBD_FREE_PTR(gfin);
1728		return -ENOMEM;
1729	}
1730	memcpy(gfout, gfin, sizeof(*gfout));
1731	OBD_FREE_PTR(gfin);
1732
1733	/* Call mdc_iocontrol */
1734	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1735	if (rc)
1736		GOTO(gf_free, rc);
1737
1738	if (copy_to_user(arg, gfout, outsize))
1739		rc = -EFAULT;
1740
1741gf_free:
1742	OBD_FREE(gfout, outsize);
1743	return rc;
1744}
1745
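/**
 * FIEMAP ioctl handler: size a kernel buffer from fm_extent_count, copy in
 * the user's fiemap header (and first extent, if any), call ll_do_fiemap()
 * and copy the mapped extents back to user space.
 */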
1746static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1747{
1748	struct ll_user_fiemap *fiemap_s;
1749	size_t num_bytes, ret_bytes;
1750	unsigned int extent_count;
1751	int rc = 0;
1752
1753	/* Get the extent count so we can calculate the size of the
1754	 * required fiemap buffer */
1755	if (get_user(extent_count,
1756	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1757		return -EFAULT;
1758
1759	if (extent_count >=
1760	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1761		return -EINVAL;
1762	num_bytes = sizeof(*fiemap_s) + (extent_count *
1763					 sizeof(struct ll_fiemap_extent));
1764
1765	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1766	if (fiemap_s == NULL)
1767		return -ENOMEM;
1768
1769	/* get the fiemap value */
1770	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1771			   sizeof(*fiemap_s)))
1772		GOTO(error, rc = -EFAULT);
1773
1774	/* If fm_extent_count is non-zero, read the first extent, since
1775	 * it is used to calculate end_offset and device from the previous
1776	 * fiemap call. */
1777	if (extent_count) {
1778		if (copy_from_user(&fiemap_s->fm_extents[0],
1779		    (char __user *)arg + sizeof(*fiemap_s),
1780		    sizeof(struct ll_fiemap_extent)))
1781			GOTO(error, rc = -EFAULT);
1782	}
1783
1784	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1785	if (rc)
1786		GOTO(error, rc);
1787
1788	ret_bytes = sizeof(struct ll_user_fiemap);
1789
1790	if (extent_count != 0)
1791		ret_bytes += (fiemap_s->fm_mapped_extents *
1792				 sizeof(struct ll_fiemap_extent));
1793
1794	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1795		rc = -EFAULT;
1796
1797error:
1798	OBD_FREE_LARGE(fiemap_s, num_bytes);
1799	return rc;
1800}
1801
1802/*
1803 * Read the data_version for the inode.
1804 *
1805 * This value is computed using the stripe object versions on the OSTs.
1806 * The version is computed using server-side locking.
1807 *
1808 * @param extent_lock  Take the extent lock. Not needed if a process is
1809 *		       already holding the OST object group locks.
1810 */
1811int ll_data_version(struct inode *inode, __u64 *data_version,
1812		    int extent_lock)
1813{
1814	struct lov_stripe_md	*lsm = NULL;
1815	struct ll_sb_info	*sbi = ll_i2sbi(inode);
1816	struct obdo		*obdo = NULL;
1817	int			 rc;
1818
1819	/* If there is no stripe, we consider the version to be 0. */
1820	lsm = ccc_inode_lsm_get(inode);
1821	if (!lsm_has_objects(lsm)) {
1822		*data_version = 0;
1823		CDEBUG(D_INODE, "No object for inode\n");
1824		GOTO(out, rc = 0);
1825	}
1826
1827	OBD_ALLOC_PTR(obdo);
1828	if (obdo == NULL)
1829		GOTO(out, rc = -ENOMEM);
1830
1831	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1832	if (rc == 0) {
1833		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1834			rc = -EOPNOTSUPP;
1835		else
1836			*data_version = obdo->o_data_version;
1837	}
1838
1839	OBD_FREE_PTR(obdo);
1840out:
1841	ccc_inode_lsm_put(inode, lsm);
1842	return rc;
1843}
1844
1845/*
1846 * Trigger an HSM release request for the provided inode.
1847 */
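/*
 * Editorial summary, not part of the original source: the release sequence
 * below is (1) open an MDS_OPEN_RELEASE write lease on the file,
 * (2) sample the data version, (3) merge the OST attributes into the inode
 * via ll_merge_lvb(), and (4) close the open handle, passing the data
 * version to the MDT, presumably so the server can detect a concurrent
 * modification of the file while it was being released.
 */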
1848int ll_hsm_release(struct inode *inode)
1849{
1850	struct cl_env_nest nest;
1851	struct lu_env *env;
1852	struct obd_client_handle *och = NULL;
1853	__u64 data_version = 0;
1854	int rc;
1855
1856
1857	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1858	       ll_get_fsname(inode->i_sb, NULL, 0),
1859	       PFID(&ll_i2info(inode)->lli_fid));
1860
1861	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1862	if (IS_ERR(och))
1863		GOTO(out, rc = PTR_ERR(och));
1864
1865	/* Grab latest data_version and [am]time values */
1866	rc = ll_data_version(inode, &data_version, 1);
1867	if (rc != 0)
1868		GOTO(out, rc);
1869
1870	env = cl_env_nested_get(&nest);
1871	if (IS_ERR(env))
1872		GOTO(out, rc = PTR_ERR(env));
1873
1874	ll_merge_lvb(env, inode);
1875	cl_env_nested_put(&nest, env);
1876
1877	/* Release the file.
1878	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1879	 * we still need it to pack l_remote_handle to MDT. */
1880	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1881				       &data_version);
1882	och = NULL;
1883
1884
1885out:
1886	if (och != NULL && !IS_ERR(och)) /* close the file */
1887		ll_lease_close(och, inode, NULL);
1888
1889	return rc;
1890}
1891
1892struct ll_swap_stack {
1893	struct iattr		 ia1, ia2;
1894	__u64			 dv1, dv2;
1895	struct inode		*inode1, *inode2;
1896	bool			 check_dv1, check_dv2;
1897};
1898
1899static int ll_swap_layouts(struct file *file1, struct file *file2,
1900			   struct lustre_swap_layouts *lsl)
1901{
1902	struct mdc_swap_layouts	 msl;
1903	struct md_op_data	*op_data;
1904	__u32			 gid;
1905	__u64			 dv;
1906	struct ll_swap_stack	*llss = NULL;
1907	int			 rc;
1908
1909	OBD_ALLOC_PTR(llss);
1910	if (llss == NULL)
1911		return -ENOMEM;
1912
1913	llss->inode1 = file1->f_dentry->d_inode;
1914	llss->inode2 = file2->f_dentry->d_inode;
1915
1916	if (!S_ISREG(llss->inode2->i_mode))
1917		GOTO(free, rc = -EINVAL);
1918
1919	if (inode_permission(llss->inode1, MAY_WRITE) ||
1920	    inode_permission(llss->inode2, MAY_WRITE))
1921		GOTO(free, rc = -EPERM);
1922
1923	if (llss->inode2->i_sb != llss->inode1->i_sb)
1924		GOTO(free, rc = -EXDEV);
1925
1926	/* we use two bools because they are easier to swap than two bits */
1927	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1928		llss->check_dv1 = true;
1929
1930	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1931		llss->check_dv2 = true;
1932
1933	/* we cannot use lsl->sl_dvX directly because we may swap them */
1934	llss->dv1 = lsl->sl_dv1;
1935	llss->dv2 = lsl->sl_dv2;
1936
1937	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1938	if (rc == 0) /* same file, done! */
1939		GOTO(free, rc = 0);
1940
1941	if (rc < 0) { /* order the two files consistently */
1942		swap(llss->inode1, llss->inode2);
1943		swap(file1, file2);
1944		swap(llss->dv1, llss->dv2);
1945		swap(llss->check_dv1, llss->check_dv2);
1946	}
1947
1948	gid = lsl->sl_gid;
1949	if (gid != 0) { /* application asks to flush dirty cache */
1950		rc = ll_get_grouplock(llss->inode1, file1, gid);
1951		if (rc < 0)
1952			GOTO(free, rc);
1953
1954		rc = ll_get_grouplock(llss->inode2, file2, gid);
1955		if (rc < 0) {
1956			ll_put_grouplock(llss->inode1, file1, gid);
1957			GOTO(free, rc);
1958		}
1959	}
1960
1961	/* to be able to restore the mtime and atime after the swap,
1962	 * we must save them first */
1963	if (lsl->sl_flags &
1964	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1965		llss->ia1.ia_mtime = llss->inode1->i_mtime;
1966		llss->ia1.ia_atime = llss->inode1->i_atime;
1967		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1968		llss->ia2.ia_mtime = llss->inode2->i_mtime;
1969		llss->ia2.ia_atime = llss->inode2->i_atime;
1970		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1971	}
1972
1973	/* final check: before swapping the layouts, verify that the
1974	 * data version has not changed (if requested) */
1975	if (llss->check_dv1) {
1976		rc = ll_data_version(llss->inode1, &dv, 0);
1977		if (rc)
1978			GOTO(putgl, rc);
1979		if (dv != llss->dv1)
1980			GOTO(putgl, rc = -EAGAIN);
1981	}
1982
1983	if (llss->check_dv2) {
1984		rc = ll_data_version(llss->inode2, &dv, 0);
1985		if (rc)
1986			GOTO(putgl, rc);
1987		if (dv != llss->dv2)
1988			GOTO(putgl, rc = -EAGAIN);
1989	}
1990
1991	/* struct md_op_data is used to send the swap args to the MDT;
1992	 * only the flags are missing, so we pass struct mdc_swap_layouts
1993	 * through md_op_data->op_data. */
1994	/* Flags from user space would have to be converted before being sent
1995	 * to the server; no flag is sent today, they are only used on the client. */
1996	msl.msl_flags = 0;
1997	rc = -ENOMEM;
1998	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1999				     0, LUSTRE_OPC_ANY, &msl);
2000	if (IS_ERR(op_data))
2001		GOTO(free, rc = PTR_ERR(op_data));
2002
2003	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2004			   sizeof(*op_data), op_data, NULL);
2005	ll_finish_md_op_data(op_data);
2006
2007putgl:
2008	if (gid != 0) {
2009		ll_put_grouplock(llss->inode2, file2, gid);
2010		ll_put_grouplock(llss->inode1, file1, gid);
2011	}
2012
2013	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2014	if (rc != 0)
2015		GOTO(free, rc);
2016
2017	/* clear useless flags */
2018	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2019		llss->ia1.ia_valid &= ~ATTR_MTIME;
2020		llss->ia2.ia_valid &= ~ATTR_MTIME;
2021	}
2022
2023	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2024		llss->ia1.ia_valid &= ~ATTR_ATIME;
2025		llss->ia2.ia_valid &= ~ATTR_ATIME;
2026	}
2027
2028	/* update time if requested */
2029	rc = 0;
2030	if (llss->ia2.ia_valid != 0) {
2031		mutex_lock(&llss->inode1->i_mutex);
2032		rc = ll_setattr(file1->f_dentry, &llss->ia2);
2033		mutex_unlock(&llss->inode1->i_mutex);
2034	}
2035
2036	if (llss->ia1.ia_valid != 0) {
2037		int rc1;
2038
2039		mutex_lock(&llss->inode2->i_mutex);
2040		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2041		mutex_unlock(&llss->inode2->i_mutex);
2042		if (rc == 0)
2043			rc = rc1;
2044	}
2045
2046free:
2047	if (llss != NULL)
2048		OBD_FREE_PTR(llss);
2049
2050	return rc;
2051}
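/*
 * Editorial note, not part of the original source: ll_swap_layouts() is
 * reached from the LL_IOC_LOV_SWAP_LAYOUTS case of ll_file_ioctl() below.
 * Illustrative user-space invocation ("fd1", "fd2" and "dv1" are assumed
 * names):
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd    = fd2,			// second file
 *		.sl_gid   = 1234,			// non-zero: flush dirty cache
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1,	// fail if file1 changed
 *		.sl_dv1   = dv1,
 *	};
 *
 *	rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */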
2052
2053static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2054{
2055	struct md_op_data	*op_data;
2056	int			 rc;
2057
2058	/* Non-root users are not allowed to set or clear flags which are
2059	 * NOT defined in HSM_USER_MASK. */
2060	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2061	    !capable(CFS_CAP_SYS_ADMIN))
2062		return -EPERM;
2063
2064	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2065				     LUSTRE_OPC_ANY, hss);
2066	if (IS_ERR(op_data))
2067		return PTR_ERR(op_data);
2068
2069	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2070			   sizeof(*op_data), op_data, NULL);
2071
2072	ll_finish_md_op_data(op_data);
2073
2074	return rc;
2075}
2076
2077static int ll_hsm_import(struct inode *inode, struct file *file,
2078			 struct hsm_user_import *hui)
2079{
2080	struct hsm_state_set	*hss = NULL;
2081	struct iattr		*attr = NULL;
2082	int			 rc;
2083
2084
2085	if (!S_ISREG(inode->i_mode))
2086		return -EINVAL;
2087
2088	/* set HSM flags */
2089	OBD_ALLOC_PTR(hss);
2090	if (hss == NULL)
2091		GOTO(out, rc = -ENOMEM);
2092
2093	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2094	hss->hss_archive_id = hui->hui_archive_id;
2095	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2096	rc = ll_hsm_state_set(inode, hss);
2097	if (rc != 0)
2098		GOTO(out, rc);
2099
2100	OBD_ALLOC_PTR(attr);
2101	if (attr == NULL)
2102		GOTO(out, rc = -ENOMEM);
2103
2104	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2105	attr->ia_mode |= S_IFREG;
2106	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2107	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2108	attr->ia_size = hui->hui_size;
2109	attr->ia_mtime.tv_sec = hui->hui_mtime;
2110	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2111	attr->ia_atime.tv_sec = hui->hui_atime;
2112	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2113
2114	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2115			 ATTR_UID | ATTR_GID |
2116			 ATTR_MTIME | ATTR_MTIME_SET |
2117			 ATTR_ATIME | ATTR_ATIME_SET;
2118
2119	rc = ll_setattr_raw(file->f_dentry, attr, true);
2120	if (rc == -ENODATA)
2121		rc = 0;
2122
2123out:
2124	if (hss != NULL)
2125		OBD_FREE_PTR(hss);
2126
2127	if (attr != NULL)
2128		OBD_FREE_PTR(attr);
2129
2130	return rc;
2131}
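/*
 * Editorial note, not part of the original source: ll_hsm_import() is
 * reached through the LL_IOC_HSM_IMPORT case of ll_file_ioctl() below,
 * typically driven by an HSM copytool importing a file that exists only in
 * the archive.  It first marks the file HS_ARCHIVED | HS_EXISTS |
 * HS_RELEASED and then restores the saved size, ownership, mode and
 * timestamps with ll_setattr_raw().
 */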
2132
2133static long
2134ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2135{
2136	struct inode		*inode = file->f_dentry->d_inode;
2137	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
2138	int			 flags, rc;
2139
2140	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2141	       inode->i_generation, inode, cmd);
2142	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2143
2144	/* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
2145	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2146		return -ENOTTY;
2147
2148	switch (cmd) {
2149	case LL_IOC_GETFLAGS:
2150		/* Get the current value of the file flags */
2151		return put_user(fd->fd_flags, (int *)arg);
2152	case LL_IOC_SETFLAGS:
2153	case LL_IOC_CLRFLAGS:
2154		/* Set or clear specific file flags */
2155		/* XXX This probably needs checks to ensure the flags are
2156		 *     not abused, and to handle any flag side effects.
2157		 */
2158		if (get_user(flags, (int *) arg))
2159			return -EFAULT;
2160
2161		if (cmd == LL_IOC_SETFLAGS) {
2162			if ((flags & LL_FILE_IGNORE_LOCK) &&
2163			    !(file->f_flags & O_DIRECT)) {
2164				CERROR("%s: unable to disable locking on "
2165				       "non-O_DIRECT file\n", current->comm);
2166				return -EINVAL;
2167			}
2168
2169			fd->fd_flags |= flags;
2170		} else {
2171			fd->fd_flags &= ~flags;
2172		}
2173		return 0;
2174	case LL_IOC_LOV_SETSTRIPE:
2175		return ll_lov_setstripe(inode, file, arg);
2176	case LL_IOC_LOV_SETEA:
2177		return ll_lov_setea(inode, file, arg);
2178	case LL_IOC_LOV_SWAP_LAYOUTS: {
2179		struct file *file2;
2180		struct lustre_swap_layouts lsl;
2181
2182		if (copy_from_user(&lsl, (char *)arg,
2183				       sizeof(struct lustre_swap_layouts)))
2184			return -EFAULT;
2185
2186		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2187			return -EPERM;
2188
2189		file2 = fget(lsl.sl_fd);
2190		if (file2 == NULL)
2191			return -EBADF;
2192
2193		rc = -EPERM;
2194		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2195			rc = ll_swap_layouts(file, file2, &lsl);
2196		fput(file2);
2197		return rc;
2198	}
2199	case LL_IOC_LOV_GETSTRIPE:
2200		return ll_lov_getstripe(inode, arg);
2201	case LL_IOC_RECREATE_OBJ:
2202		return ll_lov_recreate_obj(inode, arg);
2203	case LL_IOC_RECREATE_FID:
2204		return ll_lov_recreate_fid(inode, arg);
2205	case FSFILT_IOC_FIEMAP:
2206		return ll_ioctl_fiemap(inode, arg);
2207	case FSFILT_IOC_GETFLAGS:
2208	case FSFILT_IOC_SETFLAGS:
2209		return ll_iocontrol(inode, file, cmd, arg);
2210	case FSFILT_IOC_GETVERSION_OLD:
2211	case FSFILT_IOC_GETVERSION:
2212		return put_user(inode->i_generation, (int *)arg);
2213	case LL_IOC_GROUP_LOCK:
2214		return ll_get_grouplock(inode, file, arg);
2215	case LL_IOC_GROUP_UNLOCK:
2216		return ll_put_grouplock(inode, file, arg);
2217	case IOC_OBD_STATFS:
2218		return ll_obd_statfs(inode, (void *)arg);
2219
2220	/* We need to special case any other ioctls we want to handle,
2221	 * to send them to the MDS/OST as appropriate and to properly
2222	 * network encode the arg field.
2223	case FSFILT_IOC_SETVERSION_OLD:
2224	case FSFILT_IOC_SETVERSION:
2225	*/
2226	case LL_IOC_FLUSHCTX:
2227		return ll_flush_ctx(inode);
2228	case LL_IOC_PATH2FID: {
2229		if (copy_to_user((void *)arg, ll_inode2fid(inode),
2230				 sizeof(struct lu_fid)))
2231			return -EFAULT;
2232
2233		return 0;
2234	}
2235	case OBD_IOC_FID2PATH:
2236		return ll_fid2path(inode, (void *)arg);
2237	case LL_IOC_DATA_VERSION: {
2238		struct ioc_data_version	idv;
2239		int			rc;
2240
2241		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2242			return -EFAULT;
2243
2244		rc = ll_data_version(inode, &idv.idv_version,
2245				!(idv.idv_flags & LL_DV_NOFLUSH));
2246
2247		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2248			return -EFAULT;
2249
2250		return rc;
2251	}
2252
2253	case LL_IOC_GET_MDTIDX: {
2254		int mdtidx;
2255
2256		mdtidx = ll_get_mdt_idx(inode);
2257		if (mdtidx < 0)
2258			return mdtidx;
2259
2260		if (put_user((int)mdtidx, (int *)arg))
2261			return -EFAULT;
2262
2263		return 0;
2264	}
2265	case OBD_IOC_GETDTNAME:
2266	case OBD_IOC_GETMDNAME:
2267		return ll_get_obd_name(inode, cmd, arg);
2268	case LL_IOC_HSM_STATE_GET: {
2269		struct md_op_data	*op_data;
2270		struct hsm_user_state	*hus;
2271		int			 rc;
2272
2273		OBD_ALLOC_PTR(hus);
2274		if (hus == NULL)
2275			return -ENOMEM;
2276
2277		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2278					     LUSTRE_OPC_ANY, hus);
2279		if (IS_ERR(op_data)) {
2280			OBD_FREE_PTR(hus);
2281			return PTR_ERR(op_data);
2282		}
2283
2284		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2285				   op_data, NULL);
2286
2287		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2288			rc = -EFAULT;
2289
2290		ll_finish_md_op_data(op_data);
2291		OBD_FREE_PTR(hus);
2292		return rc;
2293	}
2294	case LL_IOC_HSM_STATE_SET: {
2295		struct hsm_state_set	*hss;
2296		int			 rc;
2297
2298		OBD_ALLOC_PTR(hss);
2299		if (hss == NULL)
2300			return -ENOMEM;
2301
2302		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2303			OBD_FREE_PTR(hss);
2304			return -EFAULT;
2305		}
2306
2307		rc = ll_hsm_state_set(inode, hss);
2308
2309		OBD_FREE_PTR(hss);
2310		return rc;
2311	}
2312	case LL_IOC_HSM_ACTION: {
2313		struct md_op_data		*op_data;
2314		struct hsm_current_action	*hca;
2315		int				 rc;
2316
2317		OBD_ALLOC_PTR(hca);
2318		if (hca == NULL)
2319			return -ENOMEM;
2320
2321		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2322					     LUSTRE_OPC_ANY, hca);
2323		if (IS_ERR(op_data)) {
2324			OBD_FREE_PTR(hca);
2325			return PTR_ERR(op_data);
2326		}
2327
2328		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2329				   op_data, NULL);
2330
2331		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2332			rc = -EFAULT;
2333
2334		ll_finish_md_op_data(op_data);
2335		OBD_FREE_PTR(hca);
2336		return rc;
2337	}
2338	case LL_IOC_SET_LEASE: {
2339		struct ll_inode_info *lli = ll_i2info(inode);
2340		struct obd_client_handle *och = NULL;
2341		bool lease_broken;
2342		fmode_t mode = 0;
2343
2344		switch (arg) {
2345		case F_WRLCK:
2346			if (!(file->f_mode & FMODE_WRITE))
2347				return -EPERM;
2348			mode = FMODE_WRITE;
2349			break;
2350		case F_RDLCK:
2351			if (!(file->f_mode & FMODE_READ))
2352				return -EPERM;
2353			mode = FMODE_READ;
2354			break;
2355		case F_UNLCK:
2356			mutex_lock(&lli->lli_och_mutex);
2357			if (fd->fd_lease_och != NULL) {
2358				och = fd->fd_lease_och;
2359				fd->fd_lease_och = NULL;
2360			}
2361			mutex_unlock(&lli->lli_och_mutex);
2362
2363			if (och != NULL) {
2364				mode = och->och_flags &
2365				       (FMODE_READ|FMODE_WRITE);
2366				rc = ll_lease_close(och, inode, &lease_broken);
2367				if (rc == 0 && lease_broken)
2368					mode = 0;
2369			} else {
2370				rc = -ENOLCK;
2371			}
2372
2373			/* return the type of lease or error */
2374			return rc < 0 ? rc : (int)mode;
2375		default:
2376			return -EINVAL;
2377		}
2378
2379		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2380
2381		/* apply for lease */
2382		och = ll_lease_open(inode, file, mode, 0);
2383		if (IS_ERR(och))
2384			return PTR_ERR(och);
2385
2386		rc = 0;
2387		mutex_lock(&lli->lli_och_mutex);
2388		if (fd->fd_lease_och == NULL) {
2389			fd->fd_lease_och = och;
2390			och = NULL;
2391		}
2392		mutex_unlock(&lli->lli_och_mutex);
2393		if (och != NULL) {
2394			/* should not happen, since only exclusive leases are supported for now */
2395			ll_lease_close(och, inode, &lease_broken);
2396			rc = -EBUSY;
2397		}
2398		return rc;
2399	}
2400	case LL_IOC_GET_LEASE: {
2401		struct ll_inode_info *lli = ll_i2info(inode);
2402		struct ldlm_lock *lock = NULL;
2403
2404		rc = 0;
2405		mutex_lock(&lli->lli_och_mutex);
2406		if (fd->fd_lease_och != NULL) {
2407			struct obd_client_handle *och = fd->fd_lease_och;
2408
2409			lock = ldlm_handle2lock(&och->och_lease_handle);
2410			if (lock != NULL) {
2411				lock_res_and_lock(lock);
2412				if (!ldlm_is_cancel(lock))
2413					rc = och->och_flags &
2414						(FMODE_READ | FMODE_WRITE);
2415				unlock_res_and_lock(lock);
2416				LDLM_LOCK_PUT(lock);
2417			}
2418		}
2419		mutex_unlock(&lli->lli_och_mutex);
2420		return rc;
2421	}
2422	case LL_IOC_HSM_IMPORT: {
2423		struct hsm_user_import *hui;
2424
2425		OBD_ALLOC_PTR(hui);
2426		if (hui == NULL)
2427			return -ENOMEM;
2428
2429		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2430			OBD_FREE_PTR(hui);
2431			return -EFAULT;
2432		}
2433
2434		rc = ll_hsm_import(inode, file, hui);
2435
2436		OBD_FREE_PTR(hui);
2437		return rc;
2438	}
2439	default: {
2440		int err;
2441
2442		if (LLIOC_STOP ==
2443		     ll_iocontrol_call(inode, file, cmd, arg, &err))
2444			return err;
2445
2446		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2447				     (void *)arg);
2448	}
2449	}
2450}
2451
2452
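/*
 * Editorial note, not part of the original source: for SEEK_END, SEEK_HOLE
 * and SEEK_DATA the locally cached i_size may be stale because the
 * authoritative size lives on the OSTs, so ll_file_seek() glimpses the size
 * first and then lets generic_file_llseek_size() do the actual offset
 * arithmetic against the refreshed i_size.
 */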
2453static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2454{
2455	struct inode *inode = file->f_dentry->d_inode;
2456	loff_t retval, eof = 0;
2457
2458	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2459			   (origin == SEEK_CUR) ? file->f_pos : 0);
2460	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2461	       inode->i_ino, inode->i_generation, inode, retval, retval,
2462	       origin);
2463	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2464
2465	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2466		retval = ll_glimpse_size(inode);
2467		if (retval != 0)
2468			return retval;
2469		eof = i_size_read(inode);
2470	}
2471
2472	retval = generic_file_llseek_size(file, offset, origin,
2473					  ll_file_maxbytes(inode), eof);
2474	return retval;
2475}
2476
2477static int ll_flush(struct file *file, fl_owner_t id)
2478{
2479	struct inode *inode = file->f_dentry->d_inode;
2480	struct ll_inode_info *lli = ll_i2info(inode);
2481	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2482	int rc, err;
2483
2484	LASSERT(!S_ISDIR(inode->i_mode));
2485
2486	/* catch async errors that were recorded back when async writeback
2487	 * failed for pages in this mapping. */
2488	rc = lli->lli_async_rc;
2489	lli->lli_async_rc = 0;
2490	err = lov_read_and_clear_async_rc(lli->lli_clob);
2491	if (rc == 0)
2492		rc = err;
2493
2494	/* The application has been told write failure already.
2495	 * Do not report failure again. */
2496	if (fd->fd_write_failed)
2497		return 0;
2498	return rc ? -EIO : 0;
2499}
2500
2501/**
2502 * Called to make sure a portion of the file has been written out.
2503 * If @mode is not CL_FSYNC_LOCAL, OST_SYNC RPCs will be sent to the OSTs.
2504 *
2505 * Returns the number of pages written.
2506 */
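/*
 * Editorial note, not part of the original source: illustrative caller
 * sketch, matching the call made from ll_fsync() below:
 *
 *	err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
 *	// err < 0 on error, otherwise the number of pages written
 */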
2507int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2508		       enum cl_fsync_mode mode, int ignore_layout)
2509{
2510	struct cl_env_nest nest;
2511	struct lu_env *env;
2512	struct cl_io *io;
2513	struct obd_capa *capa = NULL;
2514	struct cl_fsync_io *fio;
2515	int result;
2516
2517	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2518	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2519		return -EINVAL;
2520
2521	env = cl_env_nested_get(&nest);
2522	if (IS_ERR(env))
2523		return PTR_ERR(env);
2524
2525	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2526
2527	io = ccc_env_thread_io(env);
2528	io->ci_obj = cl_i2info(inode)->lli_clob;
2529	io->ci_ignore_layout = ignore_layout;
2530
2531	/* initialize parameters for sync */
2532	fio = &io->u.ci_fsync;
2533	fio->fi_capa = capa;
2534	fio->fi_start = start;
2535	fio->fi_end = end;
2536	fio->fi_fid = ll_inode2fid(inode);
2537	fio->fi_mode = mode;
2538	fio->fi_nr_written = 0;
2539
2540	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2541		result = cl_io_loop(env, io);
2542	else
2543		result = io->ci_result;
2544	if (result == 0)
2545		result = fio->fi_nr_written;
2546	cl_io_fini(env, io);
2547	cl_env_nested_put(&nest, env);
2548
2549	capa_put(capa);
2550
2551	return result;
2552}
2553
2554/*
2555 * When dentry is provided (the 'else' case), *file->f_dentry may be
2556 * null and dentry must be used directly rather than pulled from
2557 * *file->f_dentry as is done otherwise.
2558 */
2559
2560int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2561{
2562	struct dentry *dentry = file->f_dentry;
2563	struct inode *inode = dentry->d_inode;
2564	struct ll_inode_info *lli = ll_i2info(inode);
2565	struct ptlrpc_request *req;
2566	struct obd_capa *oc;
2567	int rc, err;
2568
2569	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2570	       inode->i_generation, inode);
2571	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2572
2573	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2574	mutex_lock(&inode->i_mutex);
2575
2576	/* catch async errors that were recorded back when async writeback
2577	 * failed for pages in this mapping. */
2578	if (!S_ISDIR(inode->i_mode)) {
2579		err = lli->lli_async_rc;
2580		lli->lli_async_rc = 0;
2581		if (rc == 0)
2582			rc = err;
2583		err = lov_read_and_clear_async_rc(lli->lli_clob);
2584		if (rc == 0)
2585			rc = err;
2586	}
2587
2588	oc = ll_mdscapa_get(inode);
2589	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2590		      &req);
2591	capa_put(oc);
2592	if (!rc)
2593		rc = err;
2594	if (!err)
2595		ptlrpc_req_finished(req);
2596
2597	if (S_ISREG(inode->i_mode)) {
2598		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2599
2600		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2601		if (rc == 0 && err < 0)
2602			rc = err;
2603		if (rc < 0)
2604			fd->fd_write_failed = true;
2605		else
2606			fd->fd_write_failed = false;
2607	}
2608
2609	mutex_unlock(&inode->i_mutex);
2610	return rc;
2611}
2612
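/*
 * Editorial note, not part of the original source: ll_file_flock() maps
 * both flock() and POSIX fcntl() locks onto LDLM_FLOCK enqueues on the MDT:
 * F_RDLCK becomes LCK_PR, F_WRLCK becomes LCK_PW and F_UNLCK is sent as
 * LCK_NL, while F_SETLK, F_SETLKW and F_GETLK select the
 * LDLM_FL_BLOCK_NOWAIT, blocking and LDLM_FL_TEST_LOCK enqueue flavours
 * respectively.  On success (except for test locks) the result is mirrored
 * into the local lock tables with flock_lock_file_wait() or
 * posix_lock_file_wait().
 */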
2613static int
2614ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2615{
2616	struct inode *inode = file->f_dentry->d_inode;
2617	struct ll_sb_info *sbi = ll_i2sbi(inode);
2618	struct ldlm_enqueue_info einfo = {
2619		.ei_type	= LDLM_FLOCK,
2620		.ei_cb_cp	= ldlm_flock_completion_ast,
2621		.ei_cbdata	= file_lock,
2622	};
2623	struct md_op_data *op_data;
2624	struct lustre_handle lockh = {0};
2625	ldlm_policy_data_t flock = {{0}};
2626	__u64 flags = 0;
2627	int rc;
2628	int rc2 = 0;
2629
2630	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2631	       inode->i_ino, file_lock);
2632
2633	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2634
2635	if (file_lock->fl_flags & FL_FLOCK)
2636		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2637	else if (!(file_lock->fl_flags & FL_POSIX))
2638		return -EINVAL;
2639
2640	flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2641	flock.l_flock.pid = file_lock->fl_pid;
2642	flock.l_flock.start = file_lock->fl_start;
2643	flock.l_flock.end = file_lock->fl_end;
2644
2645	/* Somewhat ugly workaround for svc lockd.
2646	 * lockd installs a custom fl_lmops->lm_compare_owner that checks
2647	 * that the fl_owner is the same (which it always is on the local node,
2648	 * presumably between lockd processes) and then compares the pid.
2649	 * As such we assign the pid to the owner field to make it all work;
2650	 * a conflict with normal locks is unlikely since the pid space and
2651	 * the pointer space for current->files do not intersect. */
2652	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2653		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2654
2655	switch (file_lock->fl_type) {
2656	case F_RDLCK:
2657		einfo.ei_mode = LCK_PR;
2658		break;
2659	case F_UNLCK:
2660		/* An unlock request may or may not have any relation to
2661		 * existing locks so we may not be able to pass a lock handle
2662		 * via a normal ldlm_lock_cancel() request. The request may even
2663		 * unlock a byte range in the middle of an existing lock. In
2664		 * order to process an unlock request we need all of the same
2665		 * information that is given with a normal read or write record
2666		 * lock request. To avoid creating another ldlm unlock (cancel)
2667		 * message we'll treat a LCK_NL flock request as an unlock. */
2668		einfo.ei_mode = LCK_NL;
2669		break;
2670	case F_WRLCK:
2671		einfo.ei_mode = LCK_PW;
2672		break;
2673	default:
2674		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2675			file_lock->fl_type);
2676		return -ENOTSUPP;
2677	}
2678
2679	switch (cmd) {
2680	case F_SETLKW:
2681#ifdef F_SETLKW64
2682	case F_SETLKW64:
2683#endif
2684		flags = 0;
2685		break;
2686	case F_SETLK:
2687#ifdef F_SETLK64
2688	case F_SETLK64:
2689#endif
2690		flags = LDLM_FL_BLOCK_NOWAIT;
2691		break;
2692	case F_GETLK:
2693#ifdef F_GETLK64
2694	case F_GETLK64:
2695#endif
2696		flags = LDLM_FL_TEST_LOCK;
2697		/* Save the old mode so that if the mode in the lock changes we
2698		 * can decrement the appropriate reader or writer refcount. */
2699		file_lock->fl_type = einfo.ei_mode;
2700		break;
2701	default:
2702		CERROR("unknown fcntl lock command: %d\n", cmd);
2703		return -EINVAL;
2704	}
2705
2706	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2707				     LUSTRE_OPC_ANY, NULL);
2708	if (IS_ERR(op_data))
2709		return PTR_ERR(op_data);
2710
2711	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2712	       inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2713	       flock.l_flock.start, flock.l_flock.end);
2714
2715	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2716			op_data, &lockh, &flock, 0, NULL /* req */, flags);
2717
2718	if ((file_lock->fl_flags & FL_FLOCK) &&
2719	    (rc == 0 || file_lock->fl_type == F_UNLCK))
2720		rc2  = flock_lock_file_wait(file, file_lock);
2721	if ((file_lock->fl_flags & FL_POSIX) &&
2722	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2723	    !(flags & LDLM_FL_TEST_LOCK))
2724		rc2  = posix_lock_file_wait(file, file_lock);
2725
2726	if (rc2 && file_lock->fl_type != F_UNLCK) {
2727		einfo.ei_mode = LCK_NL;
2728		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2729			op_data, &lockh, &flock, 0, NULL /* req */, flags);
2730		rc = rc2;
2731	}
2732
2733	ll_finish_md_op_data(op_data);
2734
2735	return rc;
2736}
2737
2738static int
2739ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2740{
2741	return -ENOSYS;
2742}
2743
2744/**
2745 * Test whether locks matching the given bits and l_req_mode are held:
2746 * - the bits may be spread across different locks
2747 * - bits that are found are cleared from *bits
2748 * - bits that are not found are kept in *bits
2749 * \param inode [IN]
2750 * \param bits [IN] searched lock bits
2751 * \param l_req_mode [IN] searched lock mode
2752 * \retval boolean, true iff all bits are found
2753 */
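/*
 * Editorial note, not part of the original source: illustrative usage,
 * mirroring the call in __ll_inode_revalidate() below:
 *
 *	__u64 ibits = MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP;
 *
 *	if (ll_have_md_lock(inode, &ibits, LCK_MINMODE))
 *		return 0;	// every requested bit is already covered
 *	// otherwise *ibits now holds only the bits still missing a lock
 */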
2754int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2755{
2756	struct lustre_handle lockh;
2757	ldlm_policy_data_t policy;
2758	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2759				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2760	struct lu_fid *fid;
2761	__u64 flags;
2762	int i;
2763
2764	if (!inode)
2765		return 0;
2766
2767	fid = &ll_i2info(inode)->lli_fid;
2768	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2769	       ldlm_lockname[mode]);
2770
2771	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2772	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2773		policy.l_inodebits.bits = *bits & (1 << i);
2774		if (policy.l_inodebits.bits == 0)
2775			continue;
2776
2777		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2778				  &policy, mode, &lockh)) {
2779			struct ldlm_lock *lock;
2780
2781			lock = ldlm_handle2lock(&lockh);
2782			if (lock) {
2783				*bits &=
2784				      ~(lock->l_policy_data.l_inodebits.bits);
2785				LDLM_LOCK_PUT(lock);
2786			} else {
2787				*bits &= ~policy.l_inodebits.bits;
2788			}
2789		}
2790	}
2791	return *bits == 0;
2792}
2793
2794ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2795			    struct lustre_handle *lockh, __u64 flags,
2796			    ldlm_mode_t mode)
2797{
2798	ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2799	struct lu_fid *fid;
2800	ldlm_mode_t rc;
2801
2802	fid = &ll_i2info(inode)->lli_fid;
2803	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2804
2805	rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2806			   fid, LDLM_IBITS, &policy, mode, lockh);
2807
2808	return rc;
2809}
2810
2811static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2812{
2813	/* Already unlinked. Just update nlink and return success */
2814	if (rc == -ENOENT) {
2815		clear_nlink(inode);
2816		/* This path cannot be hit for regular files unless in
2817		 * case of obscure races, so no need to validate size.
2818		 */
2819		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2820			return 0;
2821	} else if (rc != 0) {
2822		CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2823			     "%s: revalidate FID "DFID" error: rc = %d\n",
2824			     ll_get_fsname(inode->i_sb, NULL, 0),
2825			     PFID(ll_inode2fid(inode)), rc);
2826	}
2827
2828	return rc;
2829}
2830
2831static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2832{
2833	struct inode *inode = dentry->d_inode;
2834	struct ptlrpc_request *req = NULL;
2835	struct obd_export *exp;
2836	int rc = 0;
2837
2838	LASSERT(inode != NULL);
2839
2840	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2841	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2842
2843	exp = ll_i2mdexp(inode);
2844
2845	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPCs.
2846	 *      In the CMD case, however, it caused some lock issues; this should
2847	 *      be fixed with the new CMD ibits lock. See bug 12718. */
2848	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2849		struct lookup_intent oit = { .it_op = IT_GETATTR };
2850		struct md_op_data *op_data;
2851
2852		if (ibits == MDS_INODELOCK_LOOKUP)
2853			oit.it_op = IT_LOOKUP;
2854
2855		/* Call getattr by fid, so do not provide name at all. */
2856		op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
2857					     dentry->d_inode, NULL, 0, 0,
2858					     LUSTRE_OPC_ANY, NULL);
2859		if (IS_ERR(op_data))
2860			return PTR_ERR(op_data);
2861
2862		oit.it_create_mode |= M_CHECK_STALE;
2863		rc = md_intent_lock(exp, op_data, NULL, 0,
2864				    /* we are not interested in name
2865				       based lookup */
2866				    &oit, 0, &req,
2867				    ll_md_blocking_ast, 0);
2868		ll_finish_md_op_data(op_data);
2869		oit.it_create_mode &= ~M_CHECK_STALE;
2870		if (rc < 0) {
2871			rc = ll_inode_revalidate_fini(inode, rc);
2872			GOTO(out, rc);
2873		}
2874
2875		rc = ll_revalidate_it_finish(req, &oit, dentry);
2876		if (rc != 0) {
2877			ll_intent_release(&oit);
2878			GOTO(out, rc);
2879		}
2880
2881		/* Unlinked? Unhash the dentry, so it is not picked up later by
2882		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2883		   here, in order to preserve get_cwd functionality on 2.6.
2884		   Bug 10503 */
2885		if (!dentry->d_inode->i_nlink)
2886			d_lustre_invalidate(dentry, 0);
2887
2888		ll_lookup_finish_locks(&oit, dentry);
2889	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2890		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2891		obd_valid valid = OBD_MD_FLGETATTR;
2892		struct md_op_data *op_data;
2893		int ealen = 0;
2894
2895		if (S_ISREG(inode->i_mode)) {
2896			rc = ll_get_default_mdsize(sbi, &ealen);
2897			if (rc)
2898				return rc;
2899			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2900		}
2901
2902		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2903					     0, ealen, LUSTRE_OPC_ANY,
2904					     NULL);
2905		if (IS_ERR(op_data))
2906			return PTR_ERR(op_data);
2907
2908		op_data->op_valid = valid;
2909		/* When OBD_CONNECT_ATTRFID is not supported, we cannot find a
2910		 * capa for this inode, because we only keep the capas of
2911		 * directories fresh. */
2912		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2913		ll_finish_md_op_data(op_data);
2914		if (rc) {
2915			rc = ll_inode_revalidate_fini(inode, rc);
2916			return rc;
2917		}
2918
2919		rc = ll_prep_inode(&inode, req, NULL, NULL);
2920	}
2921out:
2922	ptlrpc_req_finished(req);
2923	return rc;
2924}
2925
2926static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2927{
2928	struct inode *inode = dentry->d_inode;
2929	int rc;
2930
2931	rc = __ll_inode_revalidate(dentry, ibits);
2932	if (rc != 0)
2933		return rc;
2934
2935	/* if object isn't regular file, don't validate size */
2936	if (!S_ISREG(inode->i_mode)) {
2937		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2938		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2939		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2940	} else {
2941		/* In case of restore, the MDT has the right size and has
2942		 * already sent it back without granting the layout lock;
2943		 * the inode is up-to-date, so a glimpse is useless.
2944		 * Also, to glimpse we need the layout: during a running
2945		 * restore the MDT holds the layout lock, so the glimpse would
2946		 * block until the end of the restore (getattr will block).
2947		 */
2948		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2949			rc = ll_glimpse_size(inode);
2950	}
2951	return rc;
2952}
2953
2954int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2955{
2956	struct inode *inode = de->d_inode;
2957	struct ll_sb_info *sbi = ll_i2sbi(inode);
2958	struct ll_inode_info *lli = ll_i2info(inode);
2959	int res = 0;
2960
2961	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
2962				      MDS_INODELOCK_LOOKUP);
2963	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2964
2965	if (res)
2966		return res;
2967
2968	stat->dev = inode->i_sb->s_dev;
2969	if (ll_need_32bit_api(sbi))
2970		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2971	else
2972		stat->ino = inode->i_ino;
2973	stat->mode = inode->i_mode;
2974	stat->nlink = inode->i_nlink;
2975	stat->uid = inode->i_uid;
2976	stat->gid = inode->i_gid;
2977	stat->rdev = inode->i_rdev;
2978	stat->atime = inode->i_atime;
2979	stat->mtime = inode->i_mtime;
2980	stat->ctime = inode->i_ctime;
2981	stat->blksize = 1 << inode->i_blkbits;
2982
2983	stat->size = i_size_read(inode);
2984	stat->blocks = inode->i_blocks;
2985
2986	return 0;
2987}
2988
2989static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2990		     __u64 start, __u64 len)
2991{
2992	int rc;
2993	size_t num_bytes;
2994	struct ll_user_fiemap *fiemap;
2995	unsigned int extent_count = fieinfo->fi_extents_max;
2996
2997	num_bytes = sizeof(*fiemap) + (extent_count *
2998				       sizeof(struct ll_fiemap_extent));
2999	OBD_ALLOC_LARGE(fiemap, num_bytes);
3000
3001	if (fiemap == NULL)
3002		return -ENOMEM;
3003
3004	fiemap->fm_flags = fieinfo->fi_flags;
3005	fiemap->fm_extent_count = fieinfo->fi_extents_max;
3006	fiemap->fm_start = start;
3007	fiemap->fm_length = len;
3008	if (extent_count > 0)
3009		memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3010		       sizeof(struct ll_fiemap_extent));
3011
3012	rc = ll_do_fiemap(inode, fiemap, num_bytes);
3013
3014	fieinfo->fi_flags = fiemap->fm_flags;
3015	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3016	if (extent_count > 0)
3017		memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3018		       fiemap->fm_mapped_extents *
3019		       sizeof(struct ll_fiemap_extent));
3020
3021	OBD_FREE_LARGE(fiemap, num_bytes);
3022	return rc;
3023}
3024
3025struct posix_acl *ll_get_acl(struct inode *inode, int type)
3026{
3027	struct ll_inode_info *lli = ll_i2info(inode);
3028	struct posix_acl *acl = NULL;
3029
3030	spin_lock(&lli->lli_lock);
3031	/* VFS' acl_permission_check->check_acl will release the refcount */
3032	acl = posix_acl_dup(lli->lli_posix_acl);
3033	spin_unlock(&lli->lli_lock);
3034
3035	return acl;
3036}
3037
3038
3039int ll_inode_permission(struct inode *inode, int mask)
3040{
3041	int rc = 0;
3042
3043#ifdef MAY_NOT_BLOCK
3044	if (mask & MAY_NOT_BLOCK)
3045		return -ECHILD;
3046#endif
3047
3048	/* As the root inode is NOT validated in the lookup operation,
3049	 * we need to do it here, before the permission check. */
3050
3051	if (inode == inode->i_sb->s_root->d_inode) {
3052		rc = __ll_inode_revalidate(inode->i_sb->s_root,
3053					   MDS_INODELOCK_LOOKUP);
3054		if (rc)
3055			return rc;
3056	}
3057
3058	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3059	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3060
3061	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3062		return lustre_check_remote_perm(inode, mask);
3063
3064	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3065	rc = generic_permission(inode, mask);
3066
3067	return rc;
3068}
3069
3070/* -o localflock - only provides locally consistent flock locks */
3071struct file_operations ll_file_operations = {
3072	.read	   = new_sync_read,
3073	.read_iter = ll_file_read_iter,
3074	.write	  = new_sync_write,
3075	.write_iter = ll_file_write_iter,
3076	.unlocked_ioctl = ll_file_ioctl,
3077	.open	   = ll_file_open,
3078	.release	= ll_file_release,
3079	.mmap	   = ll_file_mmap,
3080	.llseek	 = ll_file_seek,
3081	.splice_read    = ll_file_splice_read,
3082	.fsync	  = ll_fsync,
3083	.flush	  = ll_flush
3084};
3085
3086struct file_operations ll_file_operations_flock = {
3087	.read	   = new_sync_read,
3088	.read_iter    = ll_file_read_iter,
3089	.write	  = new_sync_write,
3090	.write_iter   = ll_file_write_iter,
3091	.unlocked_ioctl = ll_file_ioctl,
3092	.open	   = ll_file_open,
3093	.release	= ll_file_release,
3094	.mmap	   = ll_file_mmap,
3095	.llseek	 = ll_file_seek,
3096	.splice_read    = ll_file_splice_read,
3097	.fsync	  = ll_fsync,
3098	.flush	  = ll_flush,
3099	.flock	  = ll_file_flock,
3100	.lock	   = ll_file_flock
3101};
3102
3103/* These are for -o noflock - to return ENOSYS on flock calls */
3104struct file_operations ll_file_operations_noflock = {
3105	.read	   = new_sync_read,
3106	.read_iter    = ll_file_read_iter,
3107	.write	  = new_sync_write,
3108	.write_iter   = ll_file_write_iter,
3109	.unlocked_ioctl = ll_file_ioctl,
3110	.open	   = ll_file_open,
3111	.release	= ll_file_release,
3112	.mmap	   = ll_file_mmap,
3113	.llseek	 = ll_file_seek,
3114	.splice_read    = ll_file_splice_read,
3115	.fsync	  = ll_fsync,
3116	.flush	  = ll_flush,
3117	.flock	  = ll_file_noflock,
3118	.lock	   = ll_file_noflock
3119};
3120
3121struct inode_operations ll_file_inode_operations = {
3122	.setattr	= ll_setattr,
3123	.getattr	= ll_getattr,
3124	.permission	= ll_inode_permission,
3125	.setxattr	= ll_setxattr,
3126	.getxattr	= ll_getxattr,
3127	.listxattr	= ll_listxattr,
3128	.removexattr	= ll_removexattr,
3129	.fiemap		= ll_fiemap,
3130	.get_acl	= ll_get_acl,
3131};
3132
3133/* dynamic ioctl number support routines */
3134static struct llioc_ctl_data {
3135	struct rw_semaphore	ioc_sem;
3136	struct list_head	      ioc_head;
3137} llioc = {
3138	__RWSEM_INITIALIZER(llioc.ioc_sem),
3139	LIST_HEAD_INIT(llioc.ioc_head)
3140};
3141
3142
3143struct llioc_data {
3144	struct list_head	      iocd_list;
3145	unsigned int	    iocd_size;
3146	llioc_callback_t	iocd_cb;
3147	unsigned int	    iocd_count;
3148	unsigned int	    iocd_cmd[0];
3149};
3150
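/*
 * Editorial note, not part of the original source: illustrative use of the
 * dynamic-ioctl registration API below.  A module supplies the commands it
 * handles plus a callback of type llioc_callback_t, and keeps the returned
 * cookie for unregistration ("MY_IOC_*", "my_ioctl_cb" and "cookie" are
 * assumed names):
 *
 *	static unsigned int my_cmds[] = { MY_IOC_FIRST, MY_IOC_SECOND };
 *	void *cookie;
 *
 *	cookie = ll_iocontrol_register(my_ioctl_cb,
 *				       ARRAY_SIZE(my_cmds), my_cmds);
 *	...
 *	ll_iocontrol_unregister(cookie);
 */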
3151void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3152{
3153	unsigned int size;
3154	struct llioc_data *in_data = NULL;
3155
3156	if (cb == NULL || cmd == NULL ||
3157	    count > LLIOC_MAX_CMD || count < 0)
3158		return NULL;
3159
3160	size = sizeof(*in_data) + count * sizeof(unsigned int);
3161	OBD_ALLOC(in_data, size);
3162	if (in_data == NULL)
3163		return NULL;
3164
3165	memset(in_data, 0, sizeof(*in_data));
3166	in_data->iocd_size = size;
3167	in_data->iocd_cb = cb;
3168	in_data->iocd_count = count;
3169	memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3170
3171	down_write(&llioc.ioc_sem);
3172	list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3173	up_write(&llioc.ioc_sem);
3174
3175	return in_data;
3176}
3177
3178void ll_iocontrol_unregister(void *magic)
3179{
3180	struct llioc_data *tmp;
3181
3182	if (magic == NULL)
3183		return;
3184
3185	down_write(&llioc.ioc_sem);
3186	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3187		if (tmp == magic) {
3188			unsigned int size = tmp->iocd_size;
3189
3190			list_del(&tmp->iocd_list);
3191			up_write(&llioc.ioc_sem);
3192
3193			OBD_FREE(tmp, size);
3194			return;
3195		}
3196	}
3197	up_write(&llioc.ioc_sem);
3198
3199	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3200}
3201
3202EXPORT_SYMBOL(ll_iocontrol_register);
3203EXPORT_SYMBOL(ll_iocontrol_unregister);
3204
3205static enum llioc_iter
3206ll_iocontrol_call(struct inode *inode, struct file *file,
3207		  unsigned int cmd, unsigned long arg, int *rcp)
3208{
3209	enum llioc_iter ret = LLIOC_CONT;
3210	struct llioc_data *data;
3211	int rc = -EINVAL, i;
3212
3213	down_read(&llioc.ioc_sem);
3214	list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3215		for (i = 0; i < data->iocd_count; i++) {
3216			if (cmd != data->iocd_cmd[i])
3217				continue;
3218
3219			ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3220			break;
3221		}
3222
3223		if (ret == LLIOC_STOP)
3224			break;
3225	}
3226	up_read(&llioc.ioc_sem);
3227
3228	if (rcp)
3229		*rcp = rc;
3230	return ret;
3231}
3232
3233int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3234{
3235	struct ll_inode_info *lli = ll_i2info(inode);
3236	struct cl_env_nest nest;
3237	struct lu_env *env;
3238	int result;
3239
3240	if (lli->lli_clob == NULL)
3241		return 0;
3242
3243	env = cl_env_nested_get(&nest);
3244	if (IS_ERR(env))
3245		return PTR_ERR(env);
3246
3247	result = cl_conf_set(env, lli->lli_clob, conf);
3248	cl_env_nested_put(&nest, env);
3249
3250	if (conf->coc_opc == OBJECT_CONF_SET) {
3251		struct ldlm_lock *lock = conf->coc_lock;
3252
3253		LASSERT(lock != NULL);
3254		LASSERT(ldlm_has_layout(lock));
3255		if (result == 0) {
3256			/* The lock can only be allowed to match after the layout
3257			 * has been applied to the inode; otherwise a false layout
3258			 * would be seen. Applying the layout should happen before
3259			 * dropping the intent lock. */
3260			ldlm_lock_allow_match(lock);
3261		}
3262	}
3263	return result;
3264}
3265
3266/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3267static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3269{
3270	struct ll_sb_info *sbi = ll_i2sbi(inode);
3271	struct obd_capa *oc;
3272	struct ptlrpc_request *req;
3273	struct mdt_body *body;
3274	void *lvbdata;
3275	void *lmm;
3276	int lmmsize;
3277	int rc;
3278
3279	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3280	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3281	       lock->l_lvb_data, lock->l_lvb_len);
3282
3283	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3284		return 0;
3285
3286	/* If the layout lock was granted right away, the layout is returned
3287	 * within the DLM_LVB of the dlm reply; otherwise, if the lock was ever
3288	 * blocked and then granted via the completion ast, we have to fetch the
3289	 * layout here. Note that we can't use the LVB buffer in the
3290	 * completion AST because it is not large enough. */
3291	oc = ll_mdscapa_get(inode);
3292	rc = ll_get_default_mdsize(sbi, &lmmsize);
3293	if (rc == 0)
3294		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3295				OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3296				lmmsize, 0, &req);
3297	capa_put(oc);
3298	if (rc < 0)
3299		return rc;
3300
3301	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3302	if (body == NULL)
3303		GOTO(out, rc = -EPROTO);
3304
3305	lmmsize = body->eadatasize;
3306	if (lmmsize == 0) /* empty layout */
3307		GOTO(out, rc = 0);
3308
3309	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3310	if (lmm == NULL)
3311		GOTO(out, rc = -EFAULT);
3312
3313	OBD_ALLOC_LARGE(lvbdata, lmmsize);
3314	if (lvbdata == NULL)
3315		GOTO(out, rc = -ENOMEM);
3316
3317	memcpy(lvbdata, lmm, lmmsize);
3318	lock_res_and_lock(lock);
3319	if (lock->l_lvb_data != NULL)
3320		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3321
3322	lock->l_lvb_data = lvbdata;
3323	lock->l_lvb_len = lmmsize;
3324	unlock_res_and_lock(lock);
3325
3326out:
3327	ptlrpc_req_finished(req);
3328	return rc;
3329}
3330
3331/**
3332 * Apply the layout to the inode. Layout lock is held and will be released
3333 * in this function.
3334 */
3335static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3336				struct inode *inode, __u32 *gen, bool reconf)
3337{
3338	struct ll_inode_info *lli = ll_i2info(inode);
3339	struct ll_sb_info    *sbi = ll_i2sbi(inode);
3340	struct ldlm_lock *lock;
3341	struct lustre_md md = { NULL };
3342	struct cl_object_conf conf;
3343	int rc = 0;
3344	bool lvb_ready;
3345	bool wait_layout = false;
3346
3347	LASSERT(lustre_handle_is_used(lockh));
3348
3349	lock = ldlm_handle2lock(lockh);
3350	LASSERT(lock != NULL);
3351	LASSERT(ldlm_has_layout(lock));
3352
3353	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3354		   inode, PFID(&lli->lli_fid), reconf);
3355
3356	/* in case this is a cached lock, reinstate it with the new inode */
3357	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3358
3359	lock_res_and_lock(lock);
3360	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3361	unlock_res_and_lock(lock);
3362	/* checking lvb_ready is racy, but this is okay. The worst case is
3363	 * that multiple processes may configure the file at the same time. */
3364	if (lvb_ready || !reconf) {
3365		rc = -ENODATA;
3366		if (lvb_ready) {
3367			/* layout_gen must be valid if the layout lock is not
3368			 * cancelled and the stripe has already been set */
3369			*gen = ll_layout_version_get(lli);
3370			rc = 0;
3371		}
3372		GOTO(out, rc);
3373	}
3374
3375	rc = ll_layout_fetch(inode, lock);
3376	if (rc < 0)
3377		GOTO(out, rc);
3378
3379	/* for layout lock, lmm is returned in lock's lvb.
3380	 * lvb_data is immutable if the lock is held so it's safe to access it
3381	 * without res lock. See the description in ldlm_lock_decref_internal()
3382	 * for the condition to free lvb_data of layout lock */
3383	if (lock->l_lvb_data != NULL) {
3384		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3385				  lock->l_lvb_data, lock->l_lvb_len);
3386		if (rc >= 0) {
3387			*gen = LL_LAYOUT_GEN_EMPTY;
3388			if (md.lsm != NULL)
3389				*gen = md.lsm->lsm_layout_gen;
3390			rc = 0;
3391		} else {
3392			CERROR("%s: file "DFID" unpackmd error: %d\n",
3393				ll_get_fsname(inode->i_sb, NULL, 0),
3394				PFID(&lli->lli_fid), rc);
3395		}
3396	}
3397	if (rc < 0)
3398		GOTO(out, rc);
3399
3400	/* Apply the layout to the file. This is unlikely to fail, as the
3401	 * old layout has surely been eliminated. */
3402	memset(&conf, 0, sizeof(conf));
3403	conf.coc_opc = OBJECT_CONF_SET;
3404	conf.coc_inode = inode;
3405	conf.coc_lock = lock;
3406	conf.u.coc_md = &md;
3407	rc = ll_layout_conf(inode, &conf);
3408
3409	if (md.lsm != NULL)
3410		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3411
3412	/* refresh layout failed, need to wait */
3413	wait_layout = rc == -EBUSY;
3414
3415out:
3416	LDLM_LOCK_PUT(lock);
3417	ldlm_lock_decref(lockh, mode);
3418
3419	/* wait for IO to complete if it's still being used. */
3420	if (wait_layout) {
3421		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3422			ll_get_fsname(inode->i_sb, NULL, 0),
3423			inode, PFID(&lli->lli_fid));
3424
3425		memset(&conf, 0, sizeof(conf));
3426		conf.coc_opc = OBJECT_CONF_WAIT;
3427		conf.coc_inode = inode;
3428		rc = ll_layout_conf(inode, &conf);
3429		if (rc == 0)
3430			rc = -EAGAIN;
3431
3432		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3433			PFID(&lli->lli_fid), rc);
3434	}
3435	return rc;
3436}
3437
3438/**
3439 * This function checks whether a LAYOUT lock exists on the client side,
3440 * and enqueues one if there is none in the cache.
3441 *
3442 * This function does not hold the layout lock, so the lock may be revoked
3443 * any time after this function returns. Any operation that depends on the
3444 * layout should be redone in that case.
3445 *
3446 * This function should be called before lov_io_init() to get an up-to-date
3447 * layout version; the caller should save the version number, and after the
3448 * IO is finished call this function again to verify that the layout was
3449 * not changed during the IO.
3450 */
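/*
 * Editorial note, not part of the original source: the usage pattern
 * described above, in sketch form (error handling omitted):
 *
 *	__u32 gen, gen2;
 *
 *	rc = ll_layout_refresh(inode, &gen);
 *	... perform the IO ...
 *	rc = ll_layout_refresh(inode, &gen2);
 *	if (gen2 != gen)
 *		... the layout changed under us, redo the IO ...
 */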
3451int ll_layout_refresh(struct inode *inode, __u32 *gen)
3452{
3453	struct ll_inode_info  *lli = ll_i2info(inode);
3454	struct ll_sb_info     *sbi = ll_i2sbi(inode);
3455	struct md_op_data     *op_data;
3456	struct lookup_intent   it;
3457	struct lustre_handle   lockh;
3458	ldlm_mode_t	       mode;
3459	struct ldlm_enqueue_info einfo = {
3460		.ei_type = LDLM_IBITS,
3461		.ei_mode = LCK_CR,
3462		.ei_cb_bl = ll_md_blocking_ast,
3463		.ei_cb_cp = ldlm_completion_ast,
3464	};
3465	int rc;
3466
3467	*gen = ll_layout_version_get(lli);
3468	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3469		return 0;
3470
3471	/* sanity checks */
3472	LASSERT(fid_is_sane(ll_inode2fid(inode)));
3473	LASSERT(S_ISREG(inode->i_mode));
3474
3475	/* take layout lock mutex to enqueue layout lock exclusively. */
3476	mutex_lock(&lli->lli_layout_mutex);
3477
3478again:
3479	/* The layout lock is mostly cached on the local side, so try to match
3480	 * it first before enqueuing a new lock. */
3481	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3482			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3483	if (mode != 0) { /* hit cached lock */
3484		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3485		if (rc == -EAGAIN)
3486			goto again;
3487
3488		mutex_unlock(&lli->lli_layout_mutex);
3489		return rc;
3490	}
3491
3492	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3493			0, 0, LUSTRE_OPC_ANY, NULL);
3494	if (IS_ERR(op_data)) {
3495		mutex_unlock(&lli->lli_layout_mutex);
3496		return PTR_ERR(op_data);
3497	}
3498
3499	/* have to enqueue one */
3500	memset(&it, 0, sizeof(it));
3501	it.it_op = IT_LAYOUT;
3502	lockh.cookie = 0ULL;
3503
3504	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3505			ll_get_fsname(inode->i_sb, NULL, 0), inode,
3506			PFID(&lli->lli_fid));
3507
3508	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3509			NULL, 0, NULL, 0);
3510	if (it.d.lustre.it_data != NULL)
3511		ptlrpc_req_finished(it.d.lustre.it_data);
3512	it.d.lustre.it_data = NULL;
3513
3514	ll_finish_md_op_data(op_data);
3515
3516	mode = it.d.lustre.it_lock_mode;
3517	it.d.lustre.it_lock_mode = 0;
3518	ll_intent_drop_lock(&it);
3519
3520	if (rc == 0) {
3521		/* set lock data in case this is a new lock */
3522		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3523		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3524		if (rc == -EAGAIN)
3525			goto again;
3526	}
3527	mutex_unlock(&lli->lli_layout_mutex);
3528
3529	return rc;
3530}
3531
3532/**
3533 * This function sends a restore request to the MDT.
3534 */
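/*
 * Editorial note, not part of the original source: the request built below
 * is a single-item HSM request, HUA_RESTORE on this file's FID covering the
 * whole file (extent length -1), handed to the MDT through the
 * LL_IOC_HSM_REQUEST path.
 */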
3535int ll_layout_restore(struct inode *inode)
3536{
3537	struct hsm_user_request	*hur;
3538	int			 len, rc;
3539
3540	len = sizeof(struct hsm_user_request) +
3541	      sizeof(struct hsm_user_item);
3542	OBD_ALLOC(hur, len);
3543	if (hur == NULL)
3544		return -ENOMEM;
3545
3546	hur->hur_request.hr_action = HUA_RESTORE;
3547	hur->hur_request.hr_archive_id = 0;
3548	hur->hur_request.hr_flags = 0;
3549	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3550	       sizeof(hur->hur_user_item[0].hui_fid));
3551	hur->hur_user_item[0].hui_extent.length = -1;
3552	hur->hur_request.hr_itemcount = 1;
3553	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3554			   len, hur, NULL);
3555	OBD_FREE(hur, len);
3556	return rc;
3557}
3558