1/*
2 * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
3 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <linux/io.h>
35
36#include "qib.h"
37
38/* cut down ridiculously long IB macro names */
39#define OP(x) IB_OPCODE_RC_##x
40
41static void rc_timeout(unsigned long arg);
42
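/*
 * Reset the SGE state to resume a partially sent request at the given
 * PSN and return the number of bytes remaining to be sent.
 */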
43static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe,
44		       u32 psn, u32 pmtu)
45{
46	u32 len;
47
48	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
49	ss->sge = wqe->sg_list[0];
50	ss->sg_list = wqe->sg_list + 1;
51	ss->num_sge = wqe->wr.num_sge;
52	ss->total_len = wqe->length;
53	qib_skip_sge(ss, len, 0);
54	return wqe->length - len;
55}
56
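/*
 * Arm the retransmit timer; rc_timeout() runs if no response arrives
 * before it expires.
 */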
57static void start_timer(struct qib_qp *qp)
58{
59	qp->s_flags |= QIB_S_TIMER;
60	qp->s_timer.function = rc_timeout;
61	/* 4.096 usec. * (1 << qp->timeout) */
62	qp->s_timer.expires = jiffies + qp->timeout_jiffies;
63	add_timer(&qp->s_timer);
64}
65
66/**
67 * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
68 * @dev: the device for this QP
69 * @qp: a pointer to the QP
70 * @ohdr: a pointer to the IB header being constructed
71 * @pmtu: the path MTU
72 *
73 * Return 1 if constructed; otherwise, return 0.
74 * Note that we are on the responder side of the QP context.
75 * Note the QP s_lock must be held.
76 */
77static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp,
78			   struct qib_other_headers *ohdr, u32 pmtu)
79{
80	struct qib_ack_entry *e;
81	u32 hwords;
82	u32 len;
83	u32 bth0;
84	u32 bth2;
85
86	/* Don't send an ACK if we aren't supposed to. */
87	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
88		goto bail;
89
90	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
91	hwords = 5;
92
93	switch (qp->s_ack_state) {
94	case OP(RDMA_READ_RESPONSE_LAST):
95	case OP(RDMA_READ_RESPONSE_ONLY):
96		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
97		if (e->rdma_sge.mr) {
98			qib_put_mr(e->rdma_sge.mr);
99			e->rdma_sge.mr = NULL;
100		}
101		/* FALLTHROUGH */
102	case OP(ATOMIC_ACKNOWLEDGE):
103		/*
104		 * We can increment the tail pointer now that the last
105		 * response has been sent instead of only being
106		 * constructed.
107		 */
108		if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
109			qp->s_tail_ack_queue = 0;
110		/* FALLTHROUGH */
111	case OP(SEND_ONLY):
112	case OP(ACKNOWLEDGE):
113		/* Check for no next entry in the queue. */
114		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
115			if (qp->s_flags & QIB_S_ACK_PENDING)
116				goto normal;
117			goto bail;
118		}
119
120		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
121		if (e->opcode == OP(RDMA_READ_REQUEST)) {
122			/*
123			 * If a RDMA read response is being resent and
124			 * we haven't seen the duplicate request yet,
125			 * then stop sending the remaining responses the
126			 * responder has seen until the requester resends it.
127			 */
128			len = e->rdma_sge.sge_length;
129			if (len && !e->rdma_sge.mr) {
130				qp->s_tail_ack_queue = qp->r_head_ack_queue;
131				goto bail;
132			}
133			/* Copy SGE state in case we need to resend */
134			qp->s_rdma_mr = e->rdma_sge.mr;
135			if (qp->s_rdma_mr)
136				qib_get_mr(qp->s_rdma_mr);
137			qp->s_ack_rdma_sge.sge = e->rdma_sge;
138			qp->s_ack_rdma_sge.num_sge = 1;
139			qp->s_cur_sge = &qp->s_ack_rdma_sge;
140			if (len > pmtu) {
141				len = pmtu;
142				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
143			} else {
144				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
145				e->sent = 1;
146			}
147			ohdr->u.aeth = qib_compute_aeth(qp);
148			hwords++;
149			qp->s_ack_rdma_psn = e->psn;
150			bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
151		} else {
152			/* COMPARE_SWAP or FETCH_ADD */
153			qp->s_cur_sge = NULL;
154			len = 0;
155			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
156			ohdr->u.at.aeth = qib_compute_aeth(qp);
157			ohdr->u.at.atomic_ack_eth[0] =
158				cpu_to_be32(e->atomic_data >> 32);
159			ohdr->u.at.atomic_ack_eth[1] =
160				cpu_to_be32(e->atomic_data);
161			hwords += sizeof(ohdr->u.at) / sizeof(u32);
162			bth2 = e->psn & QIB_PSN_MASK;
163			e->sent = 1;
164		}
165		bth0 = qp->s_ack_state << 24;
166		break;
167
168	case OP(RDMA_READ_RESPONSE_FIRST):
169		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
170		/* FALLTHROUGH */
171	case OP(RDMA_READ_RESPONSE_MIDDLE):
172		qp->s_cur_sge = &qp->s_ack_rdma_sge;
173		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
174		if (qp->s_rdma_mr)
175			qib_get_mr(qp->s_rdma_mr);
176		len = qp->s_ack_rdma_sge.sge.sge_length;
177		if (len > pmtu)
178			len = pmtu;
179		else {
180			ohdr->u.aeth = qib_compute_aeth(qp);
181			hwords++;
182			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
183			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
184			e->sent = 1;
185		}
186		bth0 = qp->s_ack_state << 24;
187		bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
188		break;
189
190	default:
191normal:
192		/*
193		 * Send a regular ACK.
194		 * Set the s_ack_state so we wait until after sending
195		 * the ACK before setting s_ack_state to ACKNOWLEDGE
196		 * (see above).
197		 */
198		qp->s_ack_state = OP(SEND_ONLY);
199		qp->s_flags &= ~QIB_S_ACK_PENDING;
200		qp->s_cur_sge = NULL;
201		if (qp->s_nak_state)
202			ohdr->u.aeth =
203				cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
204					    (qp->s_nak_state <<
205					     QIB_AETH_CREDIT_SHIFT));
206		else
207			ohdr->u.aeth = qib_compute_aeth(qp);
208		hwords++;
209		len = 0;
210		bth0 = OP(ACKNOWLEDGE) << 24;
211		bth2 = qp->s_ack_psn & QIB_PSN_MASK;
212	}
213	qp->s_rdma_ack_cnt++;
214	qp->s_hdrwords = hwords;
215	qp->s_cur_size = len;
216	qib_make_ruc_header(qp, ohdr, bth0, bth2);
217	return 1;
218
219bail:
220	qp->s_ack_state = OP(ACKNOWLEDGE);
221	qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING);
222	return 0;
223}
224
225/**
226 * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
227 * @qp: a pointer to the QP
228 *
229 * Return 1 if constructed; otherwise, return 0.
230 */
231int qib_make_rc_req(struct qib_qp *qp)
232{
233	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
234	struct qib_other_headers *ohdr;
235	struct qib_sge_state *ss;
236	struct qib_swqe *wqe;
237	u32 hwords;
238	u32 len;
239	u32 bth0;
240	u32 bth2;
241	u32 pmtu = qp->pmtu;
242	char newreq;
243	unsigned long flags;
244	int ret = 0;
245	int delta;
246
247	ohdr = &qp->s_hdr->u.oth;
248	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
249		ohdr = &qp->s_hdr->u.l.oth;
250
251	/*
252	 * The lock is needed to synchronize between the sending tasklet,
253	 * the receive interrupt handler, and timeout resends.
254	 */
255	spin_lock_irqsave(&qp->s_lock, flags);
256
257	/* Sending responses takes priority over sending requests. */
258	if ((qp->s_flags & QIB_S_RESP_PENDING) &&
259	    qib_make_rc_ack(dev, qp, ohdr, pmtu))
260		goto done;
261
262	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
263		if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
264			goto bail;
265		/* We are in the error state, flush the work request. */
266		if (qp->s_last == qp->s_head)
267			goto bail;
268		/* If DMAs are in progress, we can't flush immediately. */
269		if (atomic_read(&qp->s_dma_busy)) {
270			qp->s_flags |= QIB_S_WAIT_DMA;
271			goto bail;
272		}
273		wqe = get_swqe_ptr(qp, qp->s_last);
274		qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
275			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
276		/* will get called again */
277		goto done;
278	}
279
280	if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK))
281		goto bail;
282
283	if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
284		if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
285			qp->s_flags |= QIB_S_WAIT_PSN;
286			goto bail;
287		}
288		qp->s_sending_psn = qp->s_psn;
289		qp->s_sending_hpsn = qp->s_psn - 1;
290	}
291
292	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
293	hwords = 5;
294	bth0 = 0;
295
296	/* Send a request. */
297	wqe = get_swqe_ptr(qp, qp->s_cur);
298	switch (qp->s_state) {
299	default:
300		if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK))
301			goto bail;
302		/*
303		 * Resend an old request or start a new one.
304		 *
305		 * We keep track of the current SWQE so that
306		 * we don't reset the "furthest progress" state
307		 * if we need to back up.
308		 */
309		newreq = 0;
310		if (qp->s_cur == qp->s_tail) {
311			/* Check if send work queue is empty. */
312			if (qp->s_tail == qp->s_head)
313				goto bail;
314			/*
315			 * If a fence is requested, wait for previous
316			 * RDMA read and atomic operations to finish.
317			 */
318			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
319			    qp->s_num_rd_atomic) {
320				qp->s_flags |= QIB_S_WAIT_FENCE;
321				goto bail;
322			}
323			wqe->psn = qp->s_next_psn;
324			newreq = 1;
325		}
326		/*
327		 * Note that we have to be careful not to modify the
328		 * original work request since we may need to resend
329		 * it.
330		 */
331		len = wqe->length;
332		ss = &qp->s_sge;
333		bth2 = qp->s_psn & QIB_PSN_MASK;
334		switch (wqe->wr.opcode) {
335		case IB_WR_SEND:
336		case IB_WR_SEND_WITH_IMM:
337			/* If no credit, return. */
338			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
339			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
340				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
341				goto bail;
342			}
343			wqe->lpsn = wqe->psn;
344			if (len > pmtu) {
345				wqe->lpsn += (len - 1) / pmtu;
346				qp->s_state = OP(SEND_FIRST);
347				len = pmtu;
348				break;
349			}
350			if (wqe->wr.opcode == IB_WR_SEND)
351				qp->s_state = OP(SEND_ONLY);
352			else {
353				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
354				/* Immediate data comes after the BTH */
355				ohdr->u.imm_data = wqe->wr.ex.imm_data;
356				hwords += 1;
357			}
358			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
359				bth0 |= IB_BTH_SOLICITED;
360			bth2 |= IB_BTH_REQ_ACK;
361			if (++qp->s_cur == qp->s_size)
362				qp->s_cur = 0;
363			break;
364
365		case IB_WR_RDMA_WRITE:
366			if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
367				qp->s_lsn++;
368			/* FALLTHROUGH */
369		case IB_WR_RDMA_WRITE_WITH_IMM:
370			/* If no credit, return. */
371			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
372			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
373				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
374				goto bail;
375			}
376			ohdr->u.rc.reth.vaddr =
377				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
378			ohdr->u.rc.reth.rkey =
379				cpu_to_be32(wqe->wr.wr.rdma.rkey);
380			ohdr->u.rc.reth.length = cpu_to_be32(len);
381			hwords += sizeof(struct ib_reth) / sizeof(u32);
382			wqe->lpsn = wqe->psn;
383			if (len > pmtu) {
384				wqe->lpsn += (len - 1) / pmtu;
385				qp->s_state = OP(RDMA_WRITE_FIRST);
386				len = pmtu;
387				break;
388			}
389			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
390				qp->s_state = OP(RDMA_WRITE_ONLY);
391			else {
392				qp->s_state =
393					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
394				/* Immediate data comes after RETH */
395				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
396				hwords += 1;
397				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
398					bth0 |= IB_BTH_SOLICITED;
399			}
400			bth2 |= IB_BTH_REQ_ACK;
401			if (++qp->s_cur == qp->s_size)
402				qp->s_cur = 0;
403			break;
404
405		case IB_WR_RDMA_READ:
406			/*
407			 * Don't allow more operations to be started
408			 * than the QP limits allow.
409			 */
410			if (newreq) {
411				if (qp->s_num_rd_atomic >=
412				    qp->s_max_rd_atomic) {
413					qp->s_flags |= QIB_S_WAIT_RDMAR;
414					goto bail;
415				}
416				qp->s_num_rd_atomic++;
417				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
418					qp->s_lsn++;
419				/*
420				 * Adjust s_next_psn to count the
421				 * expected number of responses.
422				 */
423				if (len > pmtu)
424					qp->s_next_psn += (len - 1) / pmtu;
425				wqe->lpsn = qp->s_next_psn++;
426			}
427			ohdr->u.rc.reth.vaddr =
428				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
429			ohdr->u.rc.reth.rkey =
430				cpu_to_be32(wqe->wr.wr.rdma.rkey);
431			ohdr->u.rc.reth.length = cpu_to_be32(len);
432			qp->s_state = OP(RDMA_READ_REQUEST);
433			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
434			ss = NULL;
435			len = 0;
436			bth2 |= IB_BTH_REQ_ACK;
437			if (++qp->s_cur == qp->s_size)
438				qp->s_cur = 0;
439			break;
440
441		case IB_WR_ATOMIC_CMP_AND_SWP:
442		case IB_WR_ATOMIC_FETCH_AND_ADD:
443			/*
444			 * Don't allow more operations to be started
445			 * than the QP limits allow.
446			 */
447			if (newreq) {
448				if (qp->s_num_rd_atomic >=
449				    qp->s_max_rd_atomic) {
450					qp->s_flags |= QIB_S_WAIT_RDMAR;
451					goto bail;
452				}
453				qp->s_num_rd_atomic++;
454				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
455					qp->s_lsn++;
456				wqe->lpsn = wqe->psn;
457			}
458			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
459				qp->s_state = OP(COMPARE_SWAP);
460				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
461					wqe->wr.wr.atomic.swap);
462				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
463					wqe->wr.wr.atomic.compare_add);
464			} else {
465				qp->s_state = OP(FETCH_ADD);
466				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
467					wqe->wr.wr.atomic.compare_add);
468				ohdr->u.atomic_eth.compare_data = 0;
469			}
470			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
471				wqe->wr.wr.atomic.remote_addr >> 32);
472			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
473				wqe->wr.wr.atomic.remote_addr);
474			ohdr->u.atomic_eth.rkey = cpu_to_be32(
475				wqe->wr.wr.atomic.rkey);
476			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
477			ss = NULL;
478			len = 0;
479			bth2 |= IB_BTH_REQ_ACK;
480			if (++qp->s_cur == qp->s_size)
481				qp->s_cur = 0;
482			break;
483
484		default:
485			goto bail;
486		}
487		qp->s_sge.sge = wqe->sg_list[0];
488		qp->s_sge.sg_list = wqe->sg_list + 1;
489		qp->s_sge.num_sge = wqe->wr.num_sge;
490		qp->s_sge.total_len = wqe->length;
491		qp->s_len = wqe->length;
492		if (newreq) {
493			qp->s_tail++;
494			if (qp->s_tail >= qp->s_size)
495				qp->s_tail = 0;
496		}
497		if (wqe->wr.opcode == IB_WR_RDMA_READ)
498			qp->s_psn = wqe->lpsn + 1;
499		else {
500			qp->s_psn++;
501			if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
502				qp->s_next_psn = qp->s_psn;
503		}
504		break;
505
506	case OP(RDMA_READ_RESPONSE_FIRST):
507		/*
508		 * qp->s_state is normally set to the opcode of the
509		 * last packet constructed for new requests and therefore
510		 * is never set to RDMA read response.
511		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
512		 * thread to indicate a SEND needs to be restarted from an
513		 * earlier PSN without interfering with the sending thread.
514		 * See qib_restart_rc().
515		 */
516		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
517		/* FALLTHROUGH */
518	case OP(SEND_FIRST):
519		qp->s_state = OP(SEND_MIDDLE);
520		/* FALLTHROUGH */
521	case OP(SEND_MIDDLE):
522		bth2 = qp->s_psn++ & QIB_PSN_MASK;
523		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
524			qp->s_next_psn = qp->s_psn;
525		ss = &qp->s_sge;
526		len = qp->s_len;
527		if (len > pmtu) {
528			len = pmtu;
529			break;
530		}
531		if (wqe->wr.opcode == IB_WR_SEND)
532			qp->s_state = OP(SEND_LAST);
533		else {
534			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
535			/* Immediate data comes after the BTH */
536			ohdr->u.imm_data = wqe->wr.ex.imm_data;
537			hwords += 1;
538		}
539		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
540			bth0 |= IB_BTH_SOLICITED;
541		bth2 |= IB_BTH_REQ_ACK;
542		qp->s_cur++;
543		if (qp->s_cur >= qp->s_size)
544			qp->s_cur = 0;
545		break;
546
547	case OP(RDMA_READ_RESPONSE_LAST):
548		/*
549		 * qp->s_state is normally set to the opcode of the
550		 * last packet constructed for new requests and therefore
551		 * is never set to RDMA read response.
552		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
553		 * thread to indicate a RDMA write needs to be restarted from
554		 * an earlier PSN without interfering with the sending thread.
555		 * See qib_restart_rc().
556		 */
557		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
558		/* FALLTHROUGH */
559	case OP(RDMA_WRITE_FIRST):
560		qp->s_state = OP(RDMA_WRITE_MIDDLE);
561		/* FALLTHROUGH */
562	case OP(RDMA_WRITE_MIDDLE):
563		bth2 = qp->s_psn++ & QIB_PSN_MASK;
564		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
565			qp->s_next_psn = qp->s_psn;
566		ss = &qp->s_sge;
567		len = qp->s_len;
568		if (len > pmtu) {
569			len = pmtu;
570			break;
571		}
572		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
573			qp->s_state = OP(RDMA_WRITE_LAST);
574		else {
575			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
576			/* Immediate data comes after the BTH */
577			ohdr->u.imm_data = wqe->wr.ex.imm_data;
578			hwords += 1;
579			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
580				bth0 |= IB_BTH_SOLICITED;
581		}
582		bth2 |= IB_BTH_REQ_ACK;
583		qp->s_cur++;
584		if (qp->s_cur >= qp->s_size)
585			qp->s_cur = 0;
586		break;
587
588	case OP(RDMA_READ_RESPONSE_MIDDLE):
589		/*
590		 * qp->s_state is normally set to the opcode of the
591		 * last packet constructed for new requests and therefore
592		 * is never set to RDMA read response.
593		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
594		 * thread to indicate a RDMA read needs to be restarted from
595		 * an earlier PSN without interfering with the sending thread.
596		 * See qib_restart_rc().
597		 */
598		len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
599		ohdr->u.rc.reth.vaddr =
600			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
601		ohdr->u.rc.reth.rkey =
602			cpu_to_be32(wqe->wr.wr.rdma.rkey);
603		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
604		qp->s_state = OP(RDMA_READ_REQUEST);
605		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
606		bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
607		qp->s_psn = wqe->lpsn + 1;
608		ss = NULL;
609		len = 0;
610		qp->s_cur++;
611		if (qp->s_cur == qp->s_size)
612			qp->s_cur = 0;
613		break;
614	}
615	qp->s_sending_hpsn = bth2;
616	delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
617	if (delta && delta % QIB_PSN_CREDIT == 0)
618		bth2 |= IB_BTH_REQ_ACK;
619	if (qp->s_flags & QIB_S_SEND_ONE) {
620		qp->s_flags &= ~QIB_S_SEND_ONE;
621		qp->s_flags |= QIB_S_WAIT_ACK;
622		bth2 |= IB_BTH_REQ_ACK;
623	}
624	qp->s_len -= len;
625	qp->s_hdrwords = hwords;
626	qp->s_cur_sge = ss;
627	qp->s_cur_size = len;
628	qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
629done:
630	ret = 1;
631	goto unlock;
632
633bail:
634	qp->s_flags &= ~QIB_S_BUSY;
635unlock:
636	spin_unlock_irqrestore(&qp->s_lock, flags);
637	return ret;
638}
639
640/**
641 * qib_send_rc_ack - Construct an ACK packet and send it
642 * @qp: a pointer to the QP
643 *
644 * This is called from qib_rc_rcv() and qib_kreceive().
645 * Note that RDMA reads and atomics are handled in the
646 * send side QP state and tasklet.
647 */
648void qib_send_rc_ack(struct qib_qp *qp)
649{
650	struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
651	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
652	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
653	u64 pbc;
654	u16 lrh0;
655	u32 bth0;
656	u32 hwords;
657	u32 pbufn;
658	u32 __iomem *piobuf;
659	struct qib_ib_header hdr;
660	struct qib_other_headers *ohdr;
661	u32 control;
662	unsigned long flags;
663
664	spin_lock_irqsave(&qp->s_lock, flags);
665
666	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
667		goto unlock;
668
669	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
670	if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
671		goto queue_ack;
672
673	/* Construct the header with s_lock held so APM doesn't change it. */
674	ohdr = &hdr.u.oth;
675	lrh0 = QIB_LRH_BTH;
676	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
677	hwords = 6;
678	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
679		hwords += qib_make_grh(ibp, &hdr.u.l.grh,
680				       &qp->remote_ah_attr.grh, hwords, 0);
681		ohdr = &hdr.u.l.oth;
682		lrh0 = QIB_LRH_GRH;
683	}
684	/* read pkey_index w/o lock (it's atomic) */
685	bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
686	if (qp->s_mig_state == IB_MIG_MIGRATED)
687		bth0 |= IB_BTH_MIG_REQ;
688	if (qp->r_nak_state)
689		ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
690					    (qp->r_nak_state <<
691					     QIB_AETH_CREDIT_SHIFT));
692	else
693		ohdr->u.aeth = qib_compute_aeth(qp);
694	lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
695		qp->remote_ah_attr.sl << 4;
696	hdr.lrh[0] = cpu_to_be16(lrh0);
697	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
698	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
699	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
700	ohdr->bth[0] = cpu_to_be32(bth0);
701	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
702	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
703
704	spin_unlock_irqrestore(&qp->s_lock, flags);
705
706	/* Don't try to send ACKs if the link isn't ACTIVE */
707	if (!(ppd->lflags & QIBL_LINKACTIVE))
708		goto done;
709
710	control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
711				       qp->s_srate, lrh0 >> 12);
712	/* length is + 1 for the control dword */
713	pbc = ((u64) control << 32) | (hwords + 1);
714
715	piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
716	if (!piobuf) {
717		/*
718		 * We are out of PIO buffers at the moment.
719		 * Pass responsibility for sending the ACK to the
720		 * send tasklet so that when a PIO buffer becomes
721		 * available, the ACK is sent ahead of other outgoing
722		 * packets.
723		 */
724		spin_lock_irqsave(&qp->s_lock, flags);
725		goto queue_ack;
726	}
727
728	/*
729	 * Write the pbc.
730	 * We have to flush after the PBC for correctness
731	 * on some CPUs or the WC buffer can be written out of order.
732	 */
733	writeq(pbc, piobuf);
734
735	if (dd->flags & QIB_PIO_FLUSH_WC) {
736		u32 *hdrp = (u32 *) &hdr;
737
738		qib_flush_wc();
739		qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
740		qib_flush_wc();
741		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
742	} else
743		qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
744
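	/*
	 * Some chips require a special trigger word to be written to the
	 * buffer at a fixed offset to start the send.
	 */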
745	if (dd->flags & QIB_USE_SPCL_TRIG) {
746		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
747
748		qib_flush_wc();
749		__raw_writel(0xaebecede, piobuf + spcl_off);
750	}
751
752	qib_flush_wc();
753	qib_sendbuf_done(dd, pbufn);
754
755	this_cpu_inc(ibp->pmastats->n_unicast_xmit);
756	goto done;
757
758queue_ack:
759	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
760		ibp->n_rc_qacks++;
761		qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING;
762		qp->s_nak_state = qp->r_nak_state;
763		qp->s_ack_psn = qp->r_ack_psn;
764
765		/* Schedule the send tasklet. */
766		qib_schedule_send(qp);
767	}
768unlock:
769	spin_unlock_irqrestore(&qp->s_lock, flags);
770done:
771	return;
772}
773
774/**
775 * reset_psn - reset the QP state to send starting from PSN
776 * @qp: the QP
777 * @psn: the packet sequence number to restart at
778 *
779 * This is called from qib_rc_rcv() to process an incoming RC ACK
780 * for the given QP.
781 * Called at interrupt level with the QP s_lock held.
782 */
783static void reset_psn(struct qib_qp *qp, u32 psn)
784{
785	u32 n = qp->s_acked;
786	struct qib_swqe *wqe = get_swqe_ptr(qp, n);
787	u32 opcode;
788
789	qp->s_cur = n;
790
791	/*
792	 * If we are starting the request from the beginning,
793	 * let the normal send code handle initialization.
794	 */
795	if (qib_cmp24(psn, wqe->psn) <= 0) {
796		qp->s_state = OP(SEND_LAST);
797		goto done;
798	}
799
800	/* Find the work request opcode corresponding to the given PSN. */
801	opcode = wqe->wr.opcode;
802	for (;;) {
803		int diff;
804
805		if (++n == qp->s_size)
806			n = 0;
807		if (n == qp->s_tail)
808			break;
809		wqe = get_swqe_ptr(qp, n);
810		diff = qib_cmp24(psn, wqe->psn);
811		if (diff < 0)
812			break;
813		qp->s_cur = n;
814		/*
815		 * If we are starting the request from the beginning,
816		 * let the normal send code handle initialization.
817		 */
818		if (diff == 0) {
819			qp->s_state = OP(SEND_LAST);
820			goto done;
821		}
822		opcode = wqe->wr.opcode;
823	}
824
825	/*
826	 * Set the state to restart in the middle of a request.
827	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
828	 * See qib_make_rc_req().
829	 */
830	switch (opcode) {
831	case IB_WR_SEND:
832	case IB_WR_SEND_WITH_IMM:
833		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
834		break;
835
836	case IB_WR_RDMA_WRITE:
837	case IB_WR_RDMA_WRITE_WITH_IMM:
838		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
839		break;
840
841	case IB_WR_RDMA_READ:
842		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
843		break;
844
845	default:
846		/*
847		 * This case shouldn't happen since there is only
848		 * one PSN per request.
849		 */
850		qp->s_state = OP(SEND_LAST);
851	}
852done:
853	qp->s_psn = psn;
854	/*
855	 * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer
856	 * asynchronously before the send tasklet can get scheduled.
857	 * Doing it in qib_make_rc_req() is too late.
858	 */
859	if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
860	    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
861		qp->s_flags |= QIB_S_WAIT_PSN;
862}
863
864/*
865 * Back up requester to resend the last un-ACKed request.
866 * The QP r_lock and s_lock should be held and interrupts disabled.
867 */
868static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait)
869{
870	struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
871	struct qib_ibport *ibp;
872
873	if (qp->s_retry == 0) {
874		if (qp->s_mig_state == IB_MIG_ARMED) {
875			qib_migrate_qp(qp);
876			qp->s_retry = qp->s_retry_cnt;
877		} else if (qp->s_last == qp->s_acked) {
878			qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
879			qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
880			return;
881		} else /* XXX need to handle delayed completion */
882			return;
883	} else
884		qp->s_retry--;
885
886	ibp = to_iport(qp->ibqp.device, qp->port_num);
887	if (wqe->wr.opcode == IB_WR_RDMA_READ)
888		ibp->n_rc_resends++;
889	else
890		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
891
892	qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR |
893			 QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN |
894			 QIB_S_WAIT_ACK);
895	if (wait)
896		qp->s_flags |= QIB_S_SEND_ONE;
897	reset_psn(qp, psn);
898}
899
900/*
901 * This is called from s_timer for missing responses.
902 */
903static void rc_timeout(unsigned long arg)
904{
905	struct qib_qp *qp = (struct qib_qp *)arg;
906	struct qib_ibport *ibp;
907	unsigned long flags;
908
909	spin_lock_irqsave(&qp->r_lock, flags);
910	spin_lock(&qp->s_lock);
911	if (qp->s_flags & QIB_S_TIMER) {
912		ibp = to_iport(qp->ibqp.device, qp->port_num);
913		ibp->n_rc_timeouts++;
914		qp->s_flags &= ~QIB_S_TIMER;
915		del_timer(&qp->s_timer);
916		qib_restart_rc(qp, qp->s_last_psn + 1, 1);
917		qib_schedule_send(qp);
918	}
919	spin_unlock(&qp->s_lock);
920	spin_unlock_irqrestore(&qp->r_lock, flags);
921}
922
923/*
924 * This is called from s_timer for RNR timeouts.
925 */
926void qib_rc_rnr_retry(unsigned long arg)
927{
928	struct qib_qp *qp = (struct qib_qp *)arg;
929	unsigned long flags;
930
931	spin_lock_irqsave(&qp->s_lock, flags);
932	if (qp->s_flags & QIB_S_WAIT_RNR) {
933		qp->s_flags &= ~QIB_S_WAIT_RNR;
934		del_timer(&qp->s_timer);
935		qib_schedule_send(qp);
936	}
937	spin_unlock_irqrestore(&qp->s_lock, flags);
938}
939
940/*
941 * Set qp->s_sending_psn to the next PSN after the given one.
942 * This would be psn+1 except when RDMA reads are present.
943 */
944static void reset_sending_psn(struct qib_qp *qp, u32 psn)
945{
946	struct qib_swqe *wqe;
947	u32 n = qp->s_last;
948
949	/* Find the work request corresponding to the given PSN. */
950	for (;;) {
951		wqe = get_swqe_ptr(qp, n);
952		if (qib_cmp24(psn, wqe->lpsn) <= 0) {
953			if (wqe->wr.opcode == IB_WR_RDMA_READ)
954				qp->s_sending_psn = wqe->lpsn + 1;
955			else
956				qp->s_sending_psn = psn + 1;
957			break;
958		}
959		if (++n == qp->s_size)
960			n = 0;
961		if (n == qp->s_tail)
962			break;
963	}
964}
965
966/*
967 * This should be called with the QP s_lock held and interrupts disabled.
968 */
969void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr)
970{
971	struct qib_other_headers *ohdr;
972	struct qib_swqe *wqe;
973	struct ib_wc wc;
974	unsigned i;
975	u32 opcode;
976	u32 psn;
977
978	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
979		return;
980
981	/* Find out where the BTH is */
982	if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
983		ohdr = &hdr->u.oth;
984	else
985		ohdr = &hdr->u.l.oth;
986
987	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
988	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
989	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
990		WARN_ON(!qp->s_rdma_ack_cnt);
991		qp->s_rdma_ack_cnt--;
992		return;
993	}
994
995	psn = be32_to_cpu(ohdr->bth[2]);
996	reset_sending_psn(qp, psn);
997
998	/*
999	 * Start timer after a packet requesting an ACK has been sent and
1000	 * there are still requests that haven't been acked.
1001	 */
1002	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1003	    !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) &&
1004	    (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
1005		start_timer(qp);
1006
1007	while (qp->s_last != qp->s_acked) {
1008		wqe = get_swqe_ptr(qp, qp->s_last);
1009		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1010		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1011			break;
1012		for (i = 0; i < wqe->wr.num_sge; i++) {
1013			struct qib_sge *sge = &wqe->sg_list[i];
1014
1015			qib_put_mr(sge->mr);
1016		}
1017		/* Post a send completion queue entry if requested. */
1018		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
1019		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1020			memset(&wc, 0, sizeof wc);
1021			wc.wr_id = wqe->wr.wr_id;
1022			wc.status = IB_WC_SUCCESS;
1023			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
1024			wc.byte_len = wqe->length;
1025			wc.qp = &qp->ibqp;
1026			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1027		}
1028		if (++qp->s_last >= qp->s_size)
1029			qp->s_last = 0;
1030	}
1031	/*
1032	 * If we were waiting for sends to complete before resending,
1033	 * and they are now complete, restart sending.
1034	 */
1035	if (qp->s_flags & QIB_S_WAIT_PSN &&
1036	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1037		qp->s_flags &= ~QIB_S_WAIT_PSN;
1038		qp->s_sending_psn = qp->s_psn;
1039		qp->s_sending_hpsn = qp->s_psn - 1;
1040		qib_schedule_send(qp);
1041	}
1042}
1043
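/* Record the last PSN for which a valid response has been processed. */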
1044static inline void update_last_psn(struct qib_qp *qp, u32 psn)
1045{
1046	qp->s_last_psn = psn;
1047}
1048
1049/*
1050 * Generate a SWQE completion.
1051 * This is similar to qib_send_complete but has to check to be sure
1052 * that the SGEs are not being referenced if the SWQE is being resent.
1053 */
1054static struct qib_swqe *do_rc_completion(struct qib_qp *qp,
1055					 struct qib_swqe *wqe,
1056					 struct qib_ibport *ibp)
1057{
1058	struct ib_wc wc;
1059	unsigned i;
1060
1061	/*
1062	 * Don't decrement refcount and don't generate a
1063	 * completion if the SWQE is being resent until the send
1064	 * is finished.
1065	 */
1066	if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
1067	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1068		for (i = 0; i < wqe->wr.num_sge; i++) {
1069			struct qib_sge *sge = &wqe->sg_list[i];
1070
1071			qib_put_mr(sge->mr);
1072		}
1073		/* Post a send completion queue entry if requested. */
1074		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
1075		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1076			memset(&wc, 0, sizeof wc);
1077			wc.wr_id = wqe->wr.wr_id;
1078			wc.status = IB_WC_SUCCESS;
1079			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
1080			wc.byte_len = wqe->length;
1081			wc.qp = &qp->ibqp;
1082			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
1083		}
1084		if (++qp->s_last >= qp->s_size)
1085			qp->s_last = 0;
1086	} else
1087		ibp->n_rc_delayed_comp++;
1088
1089	qp->s_retry = qp->s_retry_cnt;
1090	update_last_psn(qp, wqe->lpsn);
1091
1092	/*
1093	 * If we are completing a request which is in the process of
1094	 * being resent, we can stop resending it since we know the
1095	 * responder has already seen it.
1096	 */
1097	if (qp->s_acked == qp->s_cur) {
1098		if (++qp->s_cur >= qp->s_size)
1099			qp->s_cur = 0;
1100		qp->s_acked = qp->s_cur;
1101		wqe = get_swqe_ptr(qp, qp->s_cur);
1102		if (qp->s_acked != qp->s_tail) {
1103			qp->s_state = OP(SEND_LAST);
1104			qp->s_psn = wqe->psn;
1105		}
1106	} else {
1107		if (++qp->s_acked >= qp->s_size)
1108			qp->s_acked = 0;
1109		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1110			qp->s_draining = 0;
1111		wqe = get_swqe_ptr(qp, qp->s_acked);
1112	}
1113	return wqe;
1114}
1115
1116/**
1117 * do_rc_ack - process an incoming RC ACK
1118 * @qp: the QP the ACK came in on
1119 * @psn: the packet sequence number of the ACK
1120 * @opcode: the opcode of the request that resulted in the ACK
1121 *
1122 * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
1123 * for the given QP.
1124 * Called at interrupt level with the QP s_lock held.
1125 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1126 */
1127static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode,
1128		     u64 val, struct qib_ctxtdata *rcd)
1129{
1130	struct qib_ibport *ibp;
1131	enum ib_wc_status status;
1132	struct qib_swqe *wqe;
1133	int ret = 0;
1134	u32 ack_psn;
1135	int diff;
1136
1137	/* Remove QP from retry timer */
1138	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
1139		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
1140		del_timer(&qp->s_timer);
1141	}
1142
1143	/*
1144	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1145	 * requests and implicitly NAK RDMA read and atomic requests issued
1146	 * before the NAK'ed request.  The MSN won't include the NAK'ed
1147	 * request but will include an ACK'ed request(s).
1148	 */
1149	ack_psn = psn;
1150	if (aeth >> 29)
1151		ack_psn--;
1152	wqe = get_swqe_ptr(qp, qp->s_acked);
1153	ibp = to_iport(qp->ibqp.device, qp->port_num);
1154
1155	/*
1156	 * The MSN might be for a later WQE than the PSN indicates so
1157	 * only complete WQEs that the PSN finishes.
1158	 */
1159	while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
1160		/*
1161		 * RDMA_READ_RESPONSE_ONLY is a special case since
1162		 * we want to generate completion events for everything
1163		 * before the RDMA read, copy the data, then generate
1164		 * the completion for the read.
1165		 */
1166		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1167		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1168		    diff == 0) {
1169			ret = 1;
1170			goto bail;
1171		}
1172		/*
1173		 * If this request is a RDMA read or atomic, and the ACK is
1174		 * for a later operation, this ACK NAKs the RDMA read or
1175		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1176		 * can ACK a RDMA read and likewise for atomic ops.  Note
1177		 * that the NAK case can only happen if relaxed ordering is
1178		 * used and requests are sent after an RDMA read or atomic
1179		 * is sent but before the response is received.
1180		 */
1181		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1182		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1183		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1184		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1185		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1186			/* Retry this request. */
1187			if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) {
1188				qp->r_flags |= QIB_R_RDMAR_SEQ;
1189				qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1190				if (list_empty(&qp->rspwait)) {
1191					qp->r_flags |= QIB_R_RSP_SEND;
1192					atomic_inc(&qp->refcount);
1193					list_add_tail(&qp->rspwait,
1194						      &rcd->qp_wait_list);
1195				}
1196			}
1197			/*
1198			 * No need to process the ACK/NAK since we are
1199			 * restarting an earlier request.
1200			 */
1201			goto bail;
1202		}
1203		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1204		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1205			u64 *vaddr = wqe->sg_list[0].vaddr;
1206			*vaddr = val;
1207		}
1208		if (qp->s_num_rd_atomic &&
1209		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
1210		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1211		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1212			qp->s_num_rd_atomic--;
1213			/* Restart sending task if fence is complete */
1214			if ((qp->s_flags & QIB_S_WAIT_FENCE) &&
1215			    !qp->s_num_rd_atomic) {
1216				qp->s_flags &= ~(QIB_S_WAIT_FENCE |
1217						 QIB_S_WAIT_ACK);
1218				qib_schedule_send(qp);
1219			} else if (qp->s_flags & QIB_S_WAIT_RDMAR) {
1220				qp->s_flags &= ~(QIB_S_WAIT_RDMAR |
1221						 QIB_S_WAIT_ACK);
1222				qib_schedule_send(qp);
1223			}
1224		}
1225		wqe = do_rc_completion(qp, wqe, ibp);
1226		if (qp->s_acked == qp->s_tail)
1227			break;
1228	}
1229
1230	switch (aeth >> 29) {
1231	case 0:         /* ACK */
1232		ibp->n_rc_acks++;
1233		if (qp->s_acked != qp->s_tail) {
1234			/*
1235			 * We are expecting more ACKs so
1236			 * reset the retransmit timer.
1237			 */
1238			start_timer(qp);
1239			/*
1240			 * We can stop resending the earlier packets and
1241			 * continue with the next packet the receiver wants.
1242			 */
1243			if (qib_cmp24(qp->s_psn, psn) <= 0)
1244				reset_psn(qp, psn + 1);
1245		} else if (qib_cmp24(qp->s_psn, psn) <= 0) {
1246			qp->s_state = OP(SEND_LAST);
1247			qp->s_psn = psn + 1;
1248		}
1249		if (qp->s_flags & QIB_S_WAIT_ACK) {
1250			qp->s_flags &= ~QIB_S_WAIT_ACK;
1251			qib_schedule_send(qp);
1252		}
1253		qib_get_credit(qp, aeth);
1254		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1255		qp->s_retry = qp->s_retry_cnt;
1256		update_last_psn(qp, psn);
1257		ret = 1;
1258		goto bail;
1259
1260	case 1:         /* RNR NAK */
1261		ibp->n_rnr_naks++;
1262		if (qp->s_acked == qp->s_tail)
1263			goto bail;
1264		if (qp->s_flags & QIB_S_WAIT_RNR)
1265			goto bail;
1266		if (qp->s_rnr_retry == 0) {
1267			status = IB_WC_RNR_RETRY_EXC_ERR;
1268			goto class_b;
1269		}
1270		if (qp->s_rnr_retry_cnt < 7)
1271			qp->s_rnr_retry--;
1272
1273		/* The last valid PSN is the previous PSN. */
1274		update_last_psn(qp, psn - 1);
1275
1276		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
1277
1278		reset_psn(qp, psn);
1279
1280		qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK);
1281		qp->s_flags |= QIB_S_WAIT_RNR;
1282		qp->s_timer.function = qib_rc_rnr_retry;
1283		qp->s_timer.expires = jiffies + usecs_to_jiffies(
1284			ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) &
1285					   QIB_AETH_CREDIT_MASK]);
1286		add_timer(&qp->s_timer);
1287		goto bail;
1288
1289	case 3:         /* NAK */
1290		if (qp->s_acked == qp->s_tail)
1291			goto bail;
1292		/* The last valid PSN is the previous PSN. */
1293		update_last_psn(qp, psn - 1);
1294		switch ((aeth >> QIB_AETH_CREDIT_SHIFT) &
1295			QIB_AETH_CREDIT_MASK) {
1296		case 0: /* PSN sequence error */
1297			ibp->n_seq_naks++;
1298			/*
1299			 * Back up to the responder's expected PSN.
1300			 * Note that we might get a NAK in the middle of an
1301			 * RDMA READ response which terminates the RDMA
1302			 * READ.
1303			 */
1304			qib_restart_rc(qp, psn, 0);
1305			qib_schedule_send(qp);
1306			break;
1307
1308		case 1: /* Invalid Request */
1309			status = IB_WC_REM_INV_REQ_ERR;
1310			ibp->n_other_naks++;
1311			goto class_b;
1312
1313		case 2: /* Remote Access Error */
1314			status = IB_WC_REM_ACCESS_ERR;
1315			ibp->n_other_naks++;
1316			goto class_b;
1317
1318		case 3: /* Remote Operation Error */
1319			status = IB_WC_REM_OP_ERR;
1320			ibp->n_other_naks++;
1321class_b:
1322			if (qp->s_last == qp->s_acked) {
1323				qib_send_complete(qp, wqe, status);
1324				qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1325			}
1326			break;
1327
1328		default:
1329			/* Ignore other reserved NAK error codes */
1330			goto reserved;
1331		}
1332		qp->s_retry = qp->s_retry_cnt;
1333		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1334		goto bail;
1335
1336	default:                /* 2: reserved */
1337reserved:
1338		/* Ignore reserved NAK codes. */
1339		goto bail;
1340	}
1341
1342bail:
1343	return ret;
1344}
1345
1346/*
1347 * We have seen an out of sequence RDMA read middle or last packet.
1348 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1349 */
1350static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn,
1351			 struct qib_ctxtdata *rcd)
1352{
1353	struct qib_swqe *wqe;
1354
1355	/* Remove QP from retry timer */
1356	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
1357		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
1358		del_timer(&qp->s_timer);
1359	}
1360
1361	wqe = get_swqe_ptr(qp, qp->s_acked);
1362
1363	while (qib_cmp24(psn, wqe->lpsn) > 0) {
1364		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1365		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1366		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1367			break;
1368		wqe = do_rc_completion(qp, wqe, ibp);
1369	}
1370
1371	ibp->n_rdma_seq++;
1372	qp->r_flags |= QIB_R_RDMAR_SEQ;
1373	qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1374	if (list_empty(&qp->rspwait)) {
1375		qp->r_flags |= QIB_R_RSP_SEND;
1376		atomic_inc(&qp->refcount);
1377		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1378	}
1379}
1380
1381/**
1382 * qib_rc_rcv_resp - process an incoming RC response packet
1383 * @ibp: the port this packet came in on
1384 * @ohdr: the other headers for this packet
1385 * @data: the packet data
1386 * @tlen: the packet length
1387 * @qp: the QP for this packet
1388 * @opcode: the opcode for this packet
1389 * @psn: the packet sequence number for this packet
1390 * @hdrsize: the header length
1391 * @pmtu: the path MTU
1392 *
1393 * This is called from qib_rc_rcv() to process an incoming RC response
1394 * packet for the given QP.
1395 * Called at interrupt level.
1396 */
1397static void qib_rc_rcv_resp(struct qib_ibport *ibp,
1398			    struct qib_other_headers *ohdr,
1399			    void *data, u32 tlen,
1400			    struct qib_qp *qp,
1401			    u32 opcode,
1402			    u32 psn, u32 hdrsize, u32 pmtu,
1403			    struct qib_ctxtdata *rcd)
1404{
1405	struct qib_swqe *wqe;
1406	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
1407	enum ib_wc_status status;
1408	unsigned long flags;
1409	int diff;
1410	u32 pad;
1411	u32 aeth;
1412	u64 val;
1413
1414	if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
1415		/*
1416		 * If the ACK'd PSN is on the SDMA busy list, try to make
1417		 * progress to reclaim SDMA credits.
1418		 */
1419		if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
1420		    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
1421
1422			/*
1423			 * If the send tasklet is not running, attempt to make
1424			 * progress on the SDMA queue.
1425			 */
1426			if (!(qp->s_flags & QIB_S_BUSY)) {
1427				/* Acquire SDMA Lock */
1428				spin_lock_irqsave(&ppd->sdma_lock, flags);
1429				/* Invoke sdma make progress */
1430				qib_sdma_make_progress(ppd);
1431				/* Release SDMA Lock */
1432				spin_unlock_irqrestore(&ppd->sdma_lock, flags);
1433			}
1434		}
1435	}
1436
1437	spin_lock_irqsave(&qp->s_lock, flags);
1438	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
1439		goto ack_done;
1440
1441	/* Ignore invalid responses. */
1442	if (qib_cmp24(psn, qp->s_next_psn) >= 0)
1443		goto ack_done;
1444
1445	/* Ignore duplicate responses. */
1446	diff = qib_cmp24(psn, qp->s_last_psn);
1447	if (unlikely(diff <= 0)) {
1448		/* Update credits for "ghost" ACKs */
1449		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1450			aeth = be32_to_cpu(ohdr->u.aeth);
1451			if ((aeth >> 29) == 0)
1452				qib_get_credit(qp, aeth);
1453		}
1454		goto ack_done;
1455	}
1456
1457	/*
1458	 * Skip everything other than the PSN we expect, if we are waiting
1459	 * for a reply to a restarted RDMA read or atomic op.
1460	 */
1461	if (qp->r_flags & QIB_R_RDMAR_SEQ) {
1462		if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
1463			goto ack_done;
1464		qp->r_flags &= ~QIB_R_RDMAR_SEQ;
1465	}
1466
1467	if (unlikely(qp->s_acked == qp->s_tail))
1468		goto ack_done;
1469	wqe = get_swqe_ptr(qp, qp->s_acked);
1470	status = IB_WC_SUCCESS;
1471
1472	switch (opcode) {
1473	case OP(ACKNOWLEDGE):
1474	case OP(ATOMIC_ACKNOWLEDGE):
1475	case OP(RDMA_READ_RESPONSE_FIRST):
1476		aeth = be32_to_cpu(ohdr->u.aeth);
1477		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
1478			__be32 *p = ohdr->u.at.atomic_ack_eth;
1479
1480			val = ((u64) be32_to_cpu(p[0]) << 32) |
1481				be32_to_cpu(p[1]);
1482		} else
1483			val = 0;
1484		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1485		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
1486			goto ack_done;
1487		hdrsize += 4;
1488		wqe = get_swqe_ptr(qp, qp->s_acked);
1489		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1490			goto ack_op_err;
1491		/*
1492		 * If this is a response to a resent RDMA read, we
1493		 * have to be careful to copy the data to the right
1494		 * location.
1495		 */
1496		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1497						  wqe, psn, pmtu);
1498		goto read_middle;
1499
1500	case OP(RDMA_READ_RESPONSE_MIDDLE):
1501		/* no AETH, no ACK */
1502		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1503			goto ack_seq_err;
1504		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1505			goto ack_op_err;
1506read_middle:
1507		if (unlikely(tlen != (hdrsize + pmtu + 4)))
1508			goto ack_len_err;
1509		if (unlikely(pmtu >= qp->s_rdma_read_len))
1510			goto ack_len_err;
1511
1512		/*
1513		 * We got a response so update the timeout.
1514		 * 4.096 usec. * (1 << qp->timeout)
1515		 */
1516		qp->s_flags |= QIB_S_TIMER;
1517		mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
1518		if (qp->s_flags & QIB_S_WAIT_ACK) {
1519			qp->s_flags &= ~QIB_S_WAIT_ACK;
1520			qib_schedule_send(qp);
1521		}
1522
1523		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1524			qp->s_retry = qp->s_retry_cnt;
1525
1526		/*
1527		 * Update the RDMA receive state but do the copy w/o
1528		 * holding the locks and blocking interrupts.
1529		 */
1530		qp->s_rdma_read_len -= pmtu;
1531		update_last_psn(qp, psn);
1532		spin_unlock_irqrestore(&qp->s_lock, flags);
1533		qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
1534		goto bail;
1535
1536	case OP(RDMA_READ_RESPONSE_ONLY):
1537		aeth = be32_to_cpu(ohdr->u.aeth);
1538		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1539			goto ack_done;
1540		/* Get the number of bytes the message was padded by. */
1541		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1542		/*
1543		 * Check that the data size is >= 0 && <= pmtu.
1544		 * Remember to account for the AETH header (4) and
1545		 * ICRC (4).
1546		 */
1547		if (unlikely(tlen < (hdrsize + pad + 8)))
1548			goto ack_len_err;
1549		/*
1550		 * If this is a response to a resent RDMA read, we
1551		 * have to be careful to copy the data to the right
1552		 * location.
1553		 */
1554		wqe = get_swqe_ptr(qp, qp->s_acked);
1555		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1556						  wqe, psn, pmtu);
1557		goto read_last;
1558
1559	case OP(RDMA_READ_RESPONSE_LAST):
1560		/* ACKs READ req. */
1561		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1562			goto ack_seq_err;
1563		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1564			goto ack_op_err;
1565		/* Get the number of bytes the message was padded by. */
1566		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1567		/*
1568		 * Check that the data size is >= 1 && <= pmtu.
1569		 * Remember to account for the AETH header (4) and
1570		 * ICRC (4).
1571		 */
1572		if (unlikely(tlen <= (hdrsize + pad + 8)))
1573			goto ack_len_err;
1574read_last:
1575		tlen -= hdrsize + pad + 8;
1576		if (unlikely(tlen != qp->s_rdma_read_len))
1577			goto ack_len_err;
1578		aeth = be32_to_cpu(ohdr->u.aeth);
1579		qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
1580		WARN_ON(qp->s_rdma_read_sge.num_sge);
1581		(void) do_rc_ack(qp, aeth, psn,
1582				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1583		goto ack_done;
1584	}
1585
1586ack_op_err:
1587	status = IB_WC_LOC_QP_OP_ERR;
1588	goto ack_err;
1589
1590ack_seq_err:
1591	rdma_seq_err(qp, ibp, psn, rcd);
1592	goto ack_done;
1593
1594ack_len_err:
1595	status = IB_WC_LOC_LEN_ERR;
1596ack_err:
1597	if (qp->s_last == qp->s_acked) {
1598		qib_send_complete(qp, wqe, status);
1599		qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1600	}
1601ack_done:
1602	spin_unlock_irqrestore(&qp->s_lock, flags);
1603bail:
1604	return;
1605}
1606
1607/**
1608 * qib_rc_rcv_error - process an incoming duplicate or error RC packet
1609 * @ohdr: the other headers for this packet
1610 * @data: the packet data
1611 * @qp: the QP for this packet
1612 * @opcode: the opcode for this packet
1613 * @psn: the packet sequence number for this packet
1614 * @diff: the difference between the PSN and the expected PSN
1615 *
1616 * This is called from qib_rc_rcv() to process an unexpected
1617 * incoming RC packet for the given QP.
1618 * Called at interrupt level.
1619 * Return 1 if no more processing is needed; otherwise return 0 to
1620 * schedule a response to be sent.
1621 */
1622static int qib_rc_rcv_error(struct qib_other_headers *ohdr,
1623			    void *data,
1624			    struct qib_qp *qp,
1625			    u32 opcode,
1626			    u32 psn,
1627			    int diff,
1628			    struct qib_ctxtdata *rcd)
1629{
1630	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1631	struct qib_ack_entry *e;
1632	unsigned long flags;
1633	u8 i, prev;
1634	int old_req;
1635
1636	if (diff > 0) {
1637		/*
1638		 * Packet sequence error.
1639		 * A NAK will ACK earlier sends and RDMA writes.
1640		 * Don't queue the NAK if we already sent one.
1641		 */
1642		if (!qp->r_nak_state) {
1643			ibp->n_rc_seqnak++;
1644			qp->r_nak_state = IB_NAK_PSN_ERROR;
1645			/* Use the expected PSN. */
1646			qp->r_ack_psn = qp->r_psn;
1647			/*
1648			 * Wait to send the sequence NAK until all packets
1649			 * in the receive queue have been processed.
1650			 * Otherwise, we end up propagating congestion.
1651			 */
1652			if (list_empty(&qp->rspwait)) {
1653				qp->r_flags |= QIB_R_RSP_NAK;
1654				atomic_inc(&qp->refcount);
1655				list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1656			}
1657		}
1658		goto done;
1659	}
1660
1661	/*
1662	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
1663	 * write or atomic op.  Don't NAK errors, just silently drop
1664	 * the duplicate request.  Note that r_sge, r_len, and
1665	 * r_rcv_len may be in use so don't modify them.
1666	 *
1667	 * We are supposed to ACK the earliest duplicate PSN but we
1668	 * can coalesce an outstanding duplicate ACK.  We have to
1669	 * send the earliest so that RDMA reads can be restarted at
1670	 * the requester's expected PSN.
1671	 *
1672	 * First, find where this duplicate PSN falls within the
1673	 * ACKs previously sent.
1674	 * old_req is true if there is an older response that is scheduled
1675	 * to be sent before sending this one.
1676	 */
1677	e = NULL;
1678	old_req = 1;
1679	ibp->n_rc_dupreq++;
1680
1681	spin_lock_irqsave(&qp->s_lock, flags);
1682
1683	for (i = qp->r_head_ack_queue; ; i = prev) {
1684		if (i == qp->s_tail_ack_queue)
1685			old_req = 0;
1686		if (i)
1687			prev = i - 1;
1688		else
1689			prev = QIB_MAX_RDMA_ATOMIC;
1690		if (prev == qp->r_head_ack_queue) {
1691			e = NULL;
1692			break;
1693		}
1694		e = &qp->s_ack_queue[prev];
1695		if (!e->opcode) {
1696			e = NULL;
1697			break;
1698		}
1699		if (qib_cmp24(psn, e->psn) >= 0) {
1700			if (prev == qp->s_tail_ack_queue &&
1701			    qib_cmp24(psn, e->lpsn) <= 0)
1702				old_req = 0;
1703			break;
1704		}
1705	}
1706	switch (opcode) {
1707	case OP(RDMA_READ_REQUEST): {
1708		struct ib_reth *reth;
1709		u32 offset;
1710		u32 len;
1711
1712		/*
1713		 * If we didn't find the RDMA read request in the ack queue,
1714		 * we can ignore this request.
1715		 */
1716		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1717			goto unlock_done;
1718		/* RETH comes after BTH */
1719		reth = &ohdr->u.rc.reth;
1720		/*
1721		 * Address range must be a subset of the original
1722		 * request and start on pmtu boundaries.
1723		 * We reuse the old ack_queue slot since the requester
1724		 * should not back up and request an earlier PSN for the
1725		 * same request.
1726		 */
1727		offset = ((psn - e->psn) & QIB_PSN_MASK) *
1728			qp->pmtu;
1729		len = be32_to_cpu(reth->length);
1730		if (unlikely(offset + len != e->rdma_sge.sge_length))
1731			goto unlock_done;
1732		if (e->rdma_sge.mr) {
1733			qib_put_mr(e->rdma_sge.mr);
1734			e->rdma_sge.mr = NULL;
1735		}
1736		if (len != 0) {
1737			u32 rkey = be32_to_cpu(reth->rkey);
1738			u64 vaddr = be64_to_cpu(reth->vaddr);
1739			int ok;
1740
1741			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1742					 IB_ACCESS_REMOTE_READ);
1743			if (unlikely(!ok))
1744				goto unlock_done;
1745		} else {
1746			e->rdma_sge.vaddr = NULL;
1747			e->rdma_sge.length = 0;
1748			e->rdma_sge.sge_length = 0;
1749		}
1750		e->psn = psn;
1751		if (old_req)
1752			goto unlock_done;
1753		qp->s_tail_ack_queue = prev;
1754		break;
1755	}
1756
1757	case OP(COMPARE_SWAP):
1758	case OP(FETCH_ADD): {
1759		/*
1760		 * If we didn't find the atomic request in the ack queue
1761		 * or the send tasklet is already backed up to send an
1762		 * earlier entry, we can ignore this request.
1763		 */
1764		if (!e || e->opcode != (u8) opcode || old_req)
1765			goto unlock_done;
1766		qp->s_tail_ack_queue = prev;
1767		break;
1768	}
1769
1770	default:
1771		/*
1772		 * Ignore this operation if it doesn't request an ACK
1773		 * or an earlier RDMA read or atomic is going to be resent.
1774		 */
1775		if (!(psn & IB_BTH_REQ_ACK) || old_req)
1776			goto unlock_done;
1777		/*
1778		 * Resend the most recent ACK if this request is
1779		 * after all the previous RDMA reads and atomics.
1780		 */
1781		if (i == qp->r_head_ack_queue) {
1782			spin_unlock_irqrestore(&qp->s_lock, flags);
1783			qp->r_nak_state = 0;
1784			qp->r_ack_psn = qp->r_psn - 1;
1785			goto send_ack;
1786		}
1787		/*
1788		 * Try to send a simple ACK to work around a Mellanox bug
1789		 * which doesn't accept a RDMA read response or atomic
1790		 * response as an ACK for earlier SENDs or RDMA writes.
1791		 */
1792		if (!(qp->s_flags & QIB_S_RESP_PENDING)) {
1793			spin_unlock_irqrestore(&qp->s_lock, flags);
1794			qp->r_nak_state = 0;
1795			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
1796			goto send_ack;
1797		}
1798		/*
1799		 * Resend the RDMA read or atomic op which
1800		 * ACKs this duplicate request.
1801		 */
1802		qp->s_tail_ack_queue = i;
1803		break;
1804	}
1805	qp->s_ack_state = OP(ACKNOWLEDGE);
1806	qp->s_flags |= QIB_S_RESP_PENDING;
1807	qp->r_nak_state = 0;
1808	qib_schedule_send(qp);
1809
1810unlock_done:
1811	spin_unlock_irqrestore(&qp->s_lock, flags);
1812done:
1813	return 1;
1814
1815send_ack:
1816	return 0;
1817}
1818
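/*
 * Move the QP to the error state and, if the last WQE was reached,
 * generate an IB_EVENT_QP_LAST_WQE_REACHED event for the consumer.
 */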
1819void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err)
1820{
1821	unsigned long flags;
1822	int lastwqe;
1823
1824	spin_lock_irqsave(&qp->s_lock, flags);
1825	lastwqe = qib_error_qp(qp, err);
1826	spin_unlock_irqrestore(&qp->s_lock, flags);
1827
1828	if (lastwqe) {
1829		struct ib_event ev;
1830
1831		ev.device = qp->ibqp.device;
1832		ev.element.qp = &qp->ibqp;
1833		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1834		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1835	}
1836}
1837
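/*
 * Advance the ACK queue tail past entry n, wrapping at the end of the
 * queue, and reset the ACK state to ACKNOWLEDGE.
 */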
1838static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n)
1839{
1840	unsigned next;
1841
1842	next = n + 1;
1843	if (next > QIB_MAX_RDMA_ATOMIC)
1844		next = 0;
1845	qp->s_tail_ack_queue = next;
1846	qp->s_ack_state = OP(ACKNOWLEDGE);
1847}
1848
1849/**
1850 * qib_rc_rcv - process an incoming RC packet
1851 * @rcd: the receive context for this packet
1852 * @hdr: the header of this packet
1853 * @has_grh: true if the header has a GRH
1854 * @data: the packet data
1855 * @tlen: the packet length
1856 * @qp: the QP for this packet
1857 *
1858 * This is called from qib_qp_rcv() to process an incoming RC packet
1859 * for the given QP.
1860 * Called at interrupt level.
1861 */
1862void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
1863		int has_grh, void *data, u32 tlen, struct qib_qp *qp)
1864{
1865	struct qib_ibport *ibp = &rcd->ppd->ibport_data;
1866	struct qib_other_headers *ohdr;
1867	u32 opcode;
1868	u32 hdrsize;
1869	u32 psn;
1870	u32 pad;
1871	struct ib_wc wc;
1872	u32 pmtu = qp->pmtu;
1873	int diff;
1874	struct ib_reth *reth;
1875	unsigned long flags;
1876	int ret;
1877
1878	/* Check for GRH */
1879	if (!has_grh) {
1880		ohdr = &hdr->u.oth;
1881		hdrsize = 8 + 12;       /* LRH + BTH */
1882	} else {
1883		ohdr = &hdr->u.l.oth;
1884		hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
1885	}
1886
1887	opcode = be32_to_cpu(ohdr->bth[0]);
1888	if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
1889		return;
1890
1891	psn = be32_to_cpu(ohdr->bth[2]);
1892	opcode >>= 24;
1893
1894	/*
1895	 * Process responses (ACKs) before anything else.  Note that the
1896	 * packet sequence number will be for something in the send work
1897	 * queue rather than the expected receive packet sequence number.
1898	 * In other words, this QP is the requester.
1899	 */
1900	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1901	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1902		qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
1903				hdrsize, pmtu, rcd);
1904		return;
1905	}
1906
1907	/* Compute 24 bits worth of difference. */
1908	diff = qib_cmp24(psn, qp->r_psn);
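	/*
	 * The comparison is signed modulo 2^24, so e.g. a psn of 0x000002
	 * against an expected r_psn of 0xfffffe yields +4 (packets were
	 * lost) rather than a large negative (duplicate) value.
	 */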
1909	if (unlikely(diff)) {
1910		if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
1911			return;
1912		goto send_ack;
1913	}
1914
1915	/* Check for opcode sequence errors. */
1916	switch (qp->r_state) {
1917	case OP(SEND_FIRST):
1918	case OP(SEND_MIDDLE):
1919		if (opcode == OP(SEND_MIDDLE) ||
1920		    opcode == OP(SEND_LAST) ||
1921		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1922			break;
1923		goto nack_inv;
1924
1925	case OP(RDMA_WRITE_FIRST):
1926	case OP(RDMA_WRITE_MIDDLE):
1927		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
1928		    opcode == OP(RDMA_WRITE_LAST) ||
1929		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1930			break;
1931		goto nack_inv;
1932
1933	default:
1934		if (opcode == OP(SEND_MIDDLE) ||
1935		    opcode == OP(SEND_LAST) ||
1936		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
1937		    opcode == OP(RDMA_WRITE_MIDDLE) ||
1938		    opcode == OP(RDMA_WRITE_LAST) ||
1939		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1940			goto nack_inv;
1941		/*
1942		 * Note that it is up to the requester to not send a new
1943		 * RDMA read or atomic operation before receiving an ACK
1944		 * for the previous operation.
1945		 */
1946		break;
1947	}
1948
1949	if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
1950		qp->r_flags |= QIB_R_COMM_EST;
1951		if (qp->ibqp.event_handler) {
1952			struct ib_event ev;
1953
1954			ev.device = qp->ibqp.device;
1955			ev.element.qp = &qp->ibqp;
1956			ev.event = IB_EVENT_COMM_EST;
1957			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1958		}
1959	}
1960
1961	/* OK, process the packet. */
1962	switch (opcode) {
1963	case OP(SEND_FIRST):
1964		ret = qib_get_rwqe(qp, 0);
1965		if (ret < 0)
1966			goto nack_op_err;
1967		if (!ret)
1968			goto rnr_nak;
1969		qp->r_rcv_len = 0;
1970		/* FALLTHROUGH */
1971	case OP(SEND_MIDDLE):
1972	case OP(RDMA_WRITE_MIDDLE):
1973send_middle:
1974		/* Payload must be exactly one PMTU and fit the posted rwqe. */
1975		if (unlikely(tlen != (hdrsize + pmtu + 4)))
1976			goto nack_inv;
1977		qp->r_rcv_len += pmtu;
1978		if (unlikely(qp->r_rcv_len > qp->r_len))
1979			goto nack_inv;
1980		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
1981		break;
1982
1983	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
1984		/* consume RWQE */
1985		ret = qib_get_rwqe(qp, 1);
1986		if (ret < 0)
1987			goto nack_op_err;
1988		if (!ret)
1989			goto rnr_nak;
1990		goto send_last_imm;
1991
1992	case OP(SEND_ONLY):
1993	case OP(SEND_ONLY_WITH_IMMEDIATE):
1994		ret = qib_get_rwqe(qp, 0);
1995		if (ret < 0)
1996			goto nack_op_err;
1997		if (!ret)
1998			goto rnr_nak;
1999		qp->r_rcv_len = 0;
2000		if (opcode == OP(SEND_ONLY))
2001			goto no_immediate_data;
2002		/* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
2003	case OP(SEND_LAST_WITH_IMMEDIATE):
2004send_last_imm:
2005		wc.ex.imm_data = ohdr->u.imm_data;
2006		hdrsize += 4;
2007		wc.wc_flags = IB_WC_WITH_IMM;
2008		goto send_last;
2009	case OP(SEND_LAST):
2010	case OP(RDMA_WRITE_LAST):
2011no_immediate_data:
2012		wc.wc_flags = 0;
2013		wc.ex.imm_data = 0;
2014send_last:
2015		/* Get the number of bytes the message was padded by. */
2016		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
2017		/* Check for invalid length. */
2018		/* XXX LAST len should be >= 1 */
2019		if (unlikely(tlen < (hdrsize + pad + 4)))
2020			goto nack_inv;
2021		/* Strip the header, pad and ICRC to leave just the payload. */
2022		tlen -= (hdrsize + pad + 4);
2023		wc.byte_len = tlen + qp->r_rcv_len;
2024		if (unlikely(wc.byte_len > qp->r_len))
2025			goto nack_inv;
2026		qib_copy_sge(&qp->r_sge, data, tlen, 1);
2027		qib_put_ss(&qp->r_sge);
2028		qp->r_msn++;
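		/*
		 * Only generate a receive completion if an RWQE was consumed;
		 * a plain RDMA write (no immediate) never claimed one.
		 */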
2029		if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
2030			break;
2031		wc.wr_id = qp->r_wr_id;
2032		wc.status = IB_WC_SUCCESS;
2033		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2034		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2035			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2036		else
2037			wc.opcode = IB_WC_RECV;
2038		wc.qp = &qp->ibqp;
2039		wc.src_qp = qp->remote_qpn;
2040		wc.slid = qp->remote_ah_attr.dlid;
2041		wc.sl = qp->remote_ah_attr.sl;
2042		/* zero fields that are N/A */
2043		wc.vendor_err = 0;
2044		wc.pkey_index = 0;
2045		wc.dlid_path_bits = 0;
2046		wc.port_num = 0;
2047		/* Signal completion event if the solicited bit is set. */
2048		qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
2049			     (ohdr->bth[0] &
2050			      cpu_to_be32(IB_BTH_SOLICITED)) != 0);
2051		break;
2052
2053	case OP(RDMA_WRITE_FIRST):
2054	case OP(RDMA_WRITE_ONLY):
2055	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2056		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2057			goto nack_inv;
2058		/* Parse the RETH; the immediate case consumes an RWQE below. */
2059		reth = &ohdr->u.rc.reth;
2060		hdrsize += sizeof(*reth);
2061		qp->r_len = be32_to_cpu(reth->length);
2062		qp->r_rcv_len = 0;
2063		qp->r_sge.sg_list = NULL;
2064		if (qp->r_len != 0) {
2065			u32 rkey = be32_to_cpu(reth->rkey);
2066			u64 vaddr = be64_to_cpu(reth->vaddr);
2067			int ok;
2068
2069			/* Check rkey & NAK */
2070			ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2071					 rkey, IB_ACCESS_REMOTE_WRITE);
2072			if (unlikely(!ok))
2073				goto nack_acc;
2074			qp->r_sge.num_sge = 1;
2075		} else {
2076			qp->r_sge.num_sge = 0;
2077			qp->r_sge.sge.mr = NULL;
2078			qp->r_sge.sge.vaddr = NULL;
2079			qp->r_sge.sge.length = 0;
2080			qp->r_sge.sge.sge_length = 0;
2081		}
2082		if (opcode == OP(RDMA_WRITE_FIRST))
2083			goto send_middle;
2084		else if (opcode == OP(RDMA_WRITE_ONLY))
2085			goto no_immediate_data;
2086		ret = qib_get_rwqe(qp, 1);
2087		if (ret < 0)
2088			goto nack_op_err;
2089		if (!ret)
2090			goto rnr_nak;
2091		wc.ex.imm_data = ohdr->u.rc.imm_data;
2092		hdrsize += 4;
2093		wc.wc_flags = IB_WC_WITH_IMM;
2094		goto send_last;
2095
2096	case OP(RDMA_READ_REQUEST): {
2097		struct qib_ack_entry *e;
2098		u32 len;
2099		u8 next;
2100
2101		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2102			goto nack_inv;
2103		next = qp->r_head_ack_queue + 1;
2104		/* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
2105		if (next > QIB_MAX_RDMA_ATOMIC)
2106			next = 0;
2107		spin_lock_irqsave(&qp->s_lock, flags);
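		/*
		 * If the ack queue would overflow, the requester has issued
		 * more outstanding RDMA reads/atomics than negotiated: NAK it
		 * unless the oldest entry was already sent, in which case its
		 * slot is simply reclaimed.
		 */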
2108		if (unlikely(next == qp->s_tail_ack_queue)) {
2109			if (!qp->s_ack_queue[next].sent)
2110				goto nack_inv_unlck;
2111			qib_update_ack_queue(qp, next);
2112		}
2113		e = &qp->s_ack_queue[qp->r_head_ack_queue];
2114		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2115			qib_put_mr(e->rdma_sge.mr);
2116			e->rdma_sge.mr = NULL;
2117		}
2118		reth = &ohdr->u.rc.reth;
2119		len = be32_to_cpu(reth->length);
2120		if (len) {
2121			u32 rkey = be32_to_cpu(reth->rkey);
2122			u64 vaddr = be64_to_cpu(reth->vaddr);
2123			int ok;
2124
2125			/* Check rkey & NAK */
2126			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr,
2127					 rkey, IB_ACCESS_REMOTE_READ);
2128			if (unlikely(!ok))
2129				goto nack_acc_unlck;
2130			/*
2131			 * Update the next expected PSN.  We add the final 1
2132			 * below, so only add the remainder here.
2133			 */
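			/*
			 * e.g. an 8192-byte read with a 2048-byte PMTU spans
			 * four response packets: add 3 here, and the final 1
			 * is added with r_psn below.
			 */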
2134			if (len > pmtu)
2135				qp->r_psn += (len - 1) / pmtu;
2136		} else {
2137			e->rdma_sge.mr = NULL;
2138			e->rdma_sge.vaddr = NULL;
2139			e->rdma_sge.length = 0;
2140			e->rdma_sge.sge_length = 0;
2141		}
2142		e->opcode = opcode;
2143		e->sent = 0;
2144		e->psn = psn;
2145		e->lpsn = qp->r_psn;
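		/* e->lpsn is now the PSN of the last read response packet. */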
2146		/*
2147		 * We need to increment the MSN here instead of when we
2148		 * finish sending the result since a duplicate request would
2149		 * increment it more than once.
2150		 */
2151		qp->r_msn++;
2152		qp->r_psn++;
2153		qp->r_state = opcode;
2154		qp->r_nak_state = 0;
2155		qp->r_head_ack_queue = next;
2156
2157		/* Schedule the send tasklet. */
2158		qp->s_flags |= QIB_S_RESP_PENDING;
2159		qib_schedule_send(qp);
2160
2161		goto sunlock;
2162	}
2163
2164	case OP(COMPARE_SWAP):
2165	case OP(FETCH_ADD): {
2166		struct ib_atomic_eth *ateth;
2167		struct qib_ack_entry *e;
2168		u64 vaddr;
2169		atomic64_t *maddr;
2170		u64 sdata;
2171		u32 rkey;
2172		u8 next;
2173
2174		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2175			goto nack_inv;
2176		next = qp->r_head_ack_queue + 1;
2177		if (next > QIB_MAX_RDMA_ATOMIC)
2178			next = 0;
2179		spin_lock_irqsave(&qp->s_lock, flags);
2180		if (unlikely(next == qp->s_tail_ack_queue)) {
2181			if (!qp->s_ack_queue[next].sent)
2182				goto nack_inv_unlck;
2183			qib_update_ack_queue(qp, next);
2184		}
2185		e = &qp->s_ack_queue[qp->r_head_ack_queue];
2186		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2187			qib_put_mr(e->rdma_sge.mr);
2188			e->rdma_sge.mr = NULL;
2189		}
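		/*
		 * The AtomicETH carries the target VA as two big-endian
		 * 32-bit words; atomics must address a naturally aligned u64.
		 */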
2190		ateth = &ohdr->u.atomic_eth;
2191		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
2192			be32_to_cpu(ateth->vaddr[1]);
2193		if (unlikely(vaddr & (sizeof(u64) - 1)))
2194			goto nack_inv_unlck;
2195		rkey = be32_to_cpu(ateth->rkey);
2196		/* Check rkey & NAK */
2197		if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2198					  vaddr, rkey,
2199					  IB_ACCESS_REMOTE_ATOMIC)))
2200			goto nack_acc_unlck;
2201		/* Perform atomic OP and save result. */
2202		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
2203		sdata = be64_to_cpu(ateth->swap_data);
2204		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2205			(u64) atomic64_add_return(sdata, maddr) - sdata :
2206			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
2207				      be64_to_cpu(ateth->compare_data),
2208				      sdata);
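		/*
		 * Either way e->atomic_data holds the original contents of
		 * the target: atomic64_add_return() gives the post-add value,
		 * so sdata is subtracted back out, while cmpxchg() returns
		 * the prior value whether or not it swapped.
		 */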
2209		qib_put_mr(qp->r_sge.sge.mr);
2210		qp->r_sge.num_sge = 0;
2211		e->opcode = opcode;
2212		e->sent = 0;
2213		e->psn = psn;
2214		e->lpsn = psn;
2215		qp->r_msn++;
2216		qp->r_psn++;
2217		qp->r_state = opcode;
2218		qp->r_nak_state = 0;
2219		qp->r_head_ack_queue = next;
2220
2221		/* Schedule the send tasklet. */
2222		qp->s_flags |= QIB_S_RESP_PENDING;
2223		qib_schedule_send(qp);
2224
2225		goto sunlock;
2226	}
2227
2228	default:
2229		/* NAK unknown opcodes. */
2230		goto nack_inv;
2231	}
2232	qp->r_psn++;
2233	qp->r_state = opcode;
2234	qp->r_ack_psn = psn;
2235	qp->r_nak_state = 0;
2236	/* Send an ACK if requested (the BTH AckReq bit) or required. */
2237	if (psn & IB_BTH_REQ_ACK)
2238		goto send_ack;
2239	return;
2240
2241rnr_nak:
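	/*
	 * The low 5 bits of the RNR NAK code carry the minimum RNR timer
	 * so the requester knows how long to back off before retrying.
	 */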
2242	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
2243	qp->r_ack_psn = qp->r_psn;
2244	/* Queue RNR NAK for later */
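	/*
	 * Hold a QP reference while it sits on the context's wait list;
	 * the deferred receive work sends the NAK and drops the reference
	 * when it takes the QP back off the list.
	 */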
2245	if (list_empty(&qp->rspwait)) {
2246		qp->r_flags |= QIB_R_RSP_NAK;
2247		atomic_inc(&qp->refcount);
2248		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2249	}
2250	return;
2251
2252nack_op_err:
2253	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2254	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2255	qp->r_ack_psn = qp->r_psn;
2256	/* Queue NAK for later */
2257	if (list_empty(&qp->rspwait)) {
2258		qp->r_flags |= QIB_R_RSP_NAK;
2259		atomic_inc(&qp->refcount);
2260		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2261	}
2262	return;
2263
2264nack_inv_unlck:
2265	spin_unlock_irqrestore(&qp->s_lock, flags);
2266nack_inv:
2267	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2268	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2269	qp->r_ack_psn = qp->r_psn;
2270	/* Queue NAK for later */
2271	if (list_empty(&qp->rspwait)) {
2272		qp->r_flags |= QIB_R_RSP_NAK;
2273		atomic_inc(&qp->refcount);
2274		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2275	}
2276	return;
2277
2278nack_acc_unlck:
2279	spin_unlock_irqrestore(&qp->s_lock, flags);
2280nack_acc:
2281	qib_rc_error(qp, IB_WC_LOC_PROT_ERR);
2282	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2283	qp->r_ack_psn = qp->r_psn;
2284send_ack:
2285	qib_send_rc_ack(qp);
2286	return;
2287
2288sunlock:
2289	spin_unlock_irqrestore(&qp->s_lock, flags);
2290}
2291