/*
 * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/io.h>

#include "qib.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x

static void rc_timeout(unsigned long arg);

static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ss->total_len = wqe->length;
	qib_skip_sge(ss, len, 0);
	return wqe->length - len;
}

static void start_timer(struct qib_qp *qp)
{
	qp->s_flags |= QIB_S_TIMER;
	qp->s_timer.function = rc_timeout;
	/* 4.096 usec. * (1 << qp->timeout) */
	qp->s_timer.expires = jiffies + qp->timeout_jiffies;
	add_timer(&qp->s_timer);
}

/**
 * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp,
			   struct qib_other_headers *ohdr, u32 pmtu)
{
	struct qib_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;

	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
		goto bail;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->rdma_sge.mr) {
			qib_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		/* FALLTHROUGH */
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
		/* FALLTHROUGH */
	case OP(SEND_ONLY):
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & QIB_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester resends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			qp->s_rdma_mr = e->rdma_sge.mr;
			if (qp->s_rdma_mr)
				qib_get_mr(qp->s_rdma_mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = qib_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = qib_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = e->psn & QIB_PSN_MASK;
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		qp->s_cur_sge = &qp->s_ack_rdma_sge;
		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
		if (qp->s_rdma_mr)
			qib_get_mr(qp->s_rdma_mr);
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu)
			len = pmtu;
		else {
			ohdr->u.aeth = qib_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
		break;

	default:
normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~QIB_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
					    (qp->s_nak_state <<
					     QIB_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = qib_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = qp->s_ack_psn & QIB_PSN_MASK;
	}
	qp->s_rdma_ack_cnt++;
	qp->s_hdrwords = hwords;
	qp->s_cur_size = len;
	qib_make_ruc_header(qp, ohdr, bth0, bth2);
	return 1;

bail:
	qp->s_ack_state = OP(ACKNOWLEDGE);
	qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING);
	return 0;
}

/**
 * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int qib_make_rc_req(struct qib_qp *qp)
{
	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
	struct qib_other_headers *ohdr;
	struct qib_sge_state *ss;
	struct qib_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	u32 pmtu = qp->pmtu;
	char newreq;
	unsigned long flags;
	int ret = 0;
	int delta;

	ohdr = &qp->s_hdr->u.oth;
	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
		ohdr = &qp->s_hdr->u.l.oth;

	/*
	 * The lock is needed to synchronize between the sending tasklet,
	 * the receive interrupt handler, and timeout resends.
	 */
	spin_lock_irqsave(&qp->s_lock, flags);

	/* Sending responses takes priority over sending requests. */
	if ((qp->s_flags & QIB_S_RESP_PENDING) &&
	    qib_make_rc_ack(dev, qp, ohdr, pmtu))
		goto done;

	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
		if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
			goto bail;
		/* We are in the error state, flush the work request. */
		if (qp->s_last == qp->s_head)
			goto bail;
		/* If DMAs are in progress, we can't flush immediately. */
		if (atomic_read(&qp->s_dma_busy)) {
			qp->s_flags |= QIB_S_WAIT_DMA;
			goto bail;
		}
		wqe = get_swqe_ptr(qp, qp->s_last);
		qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
		/* will get called again */
		goto done;
	}

	if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK))
		goto bail;

	if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
			qp->s_flags |= QIB_S_WAIT_PSN;
			goto bail;
		}
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
	}

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;
	bth0 = 0;

	/* Send a request. */
	wqe = get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK))
			goto bail;
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head)
				goto bail;
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= QIB_S_WAIT_FENCE;
				goto bail;
			}
			wqe->psn = qp->s_next_psn;
			newreq = 1;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = qp->s_psn & QIB_PSN_MASK;
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND)
				qp->s_state = OP(SEND_ONLY);
			else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
				qp->s_state = OP(RDMA_WRITE_ONLY);
			else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= IB_BTH_SOLICITED;
			}
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= QIB_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
				/*
				 * Adjust s_next_psn to count the
				 * expected number of responses.
				 */
				if (len > pmtu)
					qp->s_next_psn += (len - 1) / pmtu;
				wqe->lpsn = qp->s_next_psn++;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= QIB_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
				wqe->lpsn = wqe->psn;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.swap);
				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
			} else {
				qp->s_state = OP(FETCH_ADD);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
				ohdr->u.atomic_eth.compare_data = 0;
			}
			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr >> 32);
			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->wr.wr.atomic.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_sge.total_len = wqe->length;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else {
			qp->s_psn++;
			if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
				qp->s_next_psn = qp->s_psn;
		}
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
		 * thread to indicate a SEND needs to be restarted from an
		 * earlier PSN without interfering with the sending thread.
		 * See qib_restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = qp->s_psn++ & QIB_PSN_MASK;
		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND)
			qp->s_state = OP(SEND_LAST);
		else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= IB_BTH_SOLICITED;
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
		 * thread to indicate a RDMA write needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See qib_restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = qp->s_psn++ & QIB_PSN_MASK;
		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
			qp->s_state = OP(RDMA_WRITE_LAST);
		else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
		}
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
		 * thread to indicate a RDMA read needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See qib_restart_rc().
		 */
		len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->wr.wr.rdma.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	qp->s_sending_hpsn = bth2;
	delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
	if (delta && delta % QIB_PSN_CREDIT == 0)
		bth2 |= IB_BTH_REQ_ACK;
	if (qp->s_flags & QIB_S_SEND_ONE) {
		qp->s_flags &= ~QIB_S_SEND_ONE;
		qp->s_flags |= QIB_S_WAIT_ACK;
		bth2 |= IB_BTH_REQ_ACK;
	}
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
done:
	ret = 1;
	goto unlock;

bail:
	qp->s_flags &= ~QIB_S_BUSY;
unlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
	return ret;
}

/**
 * qib_send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from qib_rc_rcv() and qib_kreceive().
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
void qib_send_rc_ack(struct qib_qp *qp)
{
	struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
	u64 pbc;
	u16 lrh0;
	u32 bth0;
	u32 hwords;
	u32 pbufn;
	u32 __iomem *piobuf;
	struct qib_ib_header hdr;
	struct qib_other_headers *ohdr;
	u32 control;
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);

	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
		goto unlock;

	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
	if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
		goto queue_ack;

	/* Construct the header with s_lock held so APM doesn't change it. */
	ohdr = &hdr.u.oth;
	lrh0 = QIB_LRH_BTH;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += qib_make_grh(ibp, &hdr.u.l.grh,
				       &qp->remote_ah_attr.grh, hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = QIB_LRH_GRH;
	}
	/* read pkey_index w/o lock (it's atomic) */
	bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth0 |= IB_BTH_MIG_REQ;
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
					    (qp->r_nak_state <<
					     QIB_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = qib_compute_aeth(qp);
	lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
		qp->remote_ah_attr.sl << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);

	spin_unlock_irqrestore(&qp->s_lock, flags);

	/* Don't try to send ACKs if the link isn't ACTIVE */
	if (!(ppd->lflags & QIBL_LINKACTIVE))
		goto done;

	control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
				       qp->s_srate, lrh0 >> 12);
	/* length is + 1 for the control dword */
	pbc = ((u64) control << 32) | (hwords + 1);

	piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
	if (!piobuf) {
		/*
		 * We are out of PIO buffers at the moment.
		 * Pass responsibility for sending the ACK to the
		 * send tasklet so that when a PIO buffer becomes
		 * available, the ACK is sent ahead of other outgoing
		 * packets.
		 */
		spin_lock_irqsave(&qp->s_lock, flags);
		goto queue_ack;
	}

	/*
	 * Write the pbc.
	 * We have to flush after the PBC for correctness
	 * on some cpus or WC buffer can be written out of order.
	 */
	writeq(pbc, piobuf);

	if (dd->flags & QIB_PIO_FLUSH_WC) {
		u32 *hdrp = (u32 *) &hdr;

		qib_flush_wc();
		qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
		qib_flush_wc();
		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
	} else
		qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);

	if (dd->flags & QIB_USE_SPCL_TRIG) {
		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;

		qib_flush_wc();
		__raw_writel(0xaebecede, piobuf + spcl_off);
	}

	qib_flush_wc();
	qib_sendbuf_done(dd, pbufn);

	this_cpu_inc(ibp->pmastats->n_unicast_xmit);
	goto done;

queue_ack:
	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
		ibp->n_rc_qacks++;
		qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING;
		qp->s_nak_state = qp->r_nak_state;
		qp->s_ack_psn = qp->r_ack_psn;

		/* Schedule the send tasklet. */
		qib_schedule_send(qp);
	}
unlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return;
}

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from qib_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct qib_qp *qp, u32 psn)
{
	u32 n = qp->s_acked;
	struct qib_swqe *wqe = get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (qib_cmp24(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = get_swqe_ptr(qp, n);
		diff = qib_cmp24(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See qib_make_rc_req().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since there is only
		 * one PSN per request.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
	/*
	 * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer
	 * asynchronously before the send tasklet can get scheduled.
	 * Doing it in qib_make_rc_req() is too late.
	 */
	if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
	    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
		qp->s_flags |= QIB_S_WAIT_PSN;
}

/*
 * Back up requester to resend the last un-ACKed request.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 */
static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait)
{
	struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
	struct qib_ibport *ibp;

	if (qp->s_retry == 0) {
		if (qp->s_mig_state == IB_MIG_ARMED) {
			qib_migrate_qp(qp);
			qp->s_retry = qp->s_retry_cnt;
		} else if (qp->s_last == qp->s_acked) {
			qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
			qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			return;
		} else /* XXX need to handle delayed completion */
			return;
	} else
		qp->s_retry--;

	ibp = to_iport(qp->ibqp.device, qp->port_num);
	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		ibp->n_rc_resends++;
	else
		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;

	qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR |
			 QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN |
			 QIB_S_WAIT_ACK);
	if (wait)
		qp->s_flags |= QIB_S_SEND_ONE;
	reset_psn(qp, psn);
}

/*
 * This is called from s_timer for missing responses.
 */
static void rc_timeout(unsigned long arg)
{
	struct qib_qp *qp = (struct qib_qp *)arg;
	struct qib_ibport *ibp;
	unsigned long flags;

	spin_lock_irqsave(&qp->r_lock, flags);
	spin_lock(&qp->s_lock);
	if (qp->s_flags & QIB_S_TIMER) {
		ibp = to_iport(qp->ibqp.device, qp->port_num);
		ibp->n_rc_timeouts++;
		qp->s_flags &= ~QIB_S_TIMER;
		del_timer(&qp->s_timer);
		qib_restart_rc(qp, qp->s_last_psn + 1, 1);
		qib_schedule_send(qp);
	}
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, flags);
}

/*
 * This is called from s_timer for RNR timeouts.
 */
void qib_rc_rnr_retry(unsigned long arg)
{
	struct qib_qp *qp = (struct qib_qp *)arg;
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	if (qp->s_flags & QIB_S_WAIT_RNR) {
		qp->s_flags &= ~QIB_S_WAIT_RNR;
		del_timer(&qp->s_timer);
		qib_schedule_send(qp);
	}
	spin_unlock_irqrestore(&qp->s_lock, flags);
}

/*
 * Set qp->s_sending_psn to the next PSN after the given one.
 * This would be psn+1 except when RDMA reads are present.
 */
static void reset_sending_psn(struct qib_qp *qp, u32 psn)
{
	struct qib_swqe *wqe;
	u32 n = qp->s_last;

	/* Find the work request corresponding to the given PSN. */
	for (;;) {
		wqe = get_swqe_ptr(qp, n);
		if (qib_cmp24(psn, wqe->lpsn) <= 0) {
			if (wqe->wr.opcode == IB_WR_RDMA_READ)
				qp->s_sending_psn = wqe->lpsn + 1;
			else
				qp->s_sending_psn = psn + 1;
			break;
		}
		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
	}
}

/*
 * This should be called with the QP s_lock held and interrupts disabled.
 */
void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr)
{
	struct qib_other_headers *ohdr;
	struct qib_swqe *wqe;
	struct ib_wc wc;
	unsigned i;
	u32 opcode;
	u32 psn;

	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
		return;

	/* Find out where the BTH is */
	if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
		ohdr = &hdr->u.oth;
	else
		ohdr = &hdr->u.l.oth;

	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		WARN_ON(!qp->s_rdma_ack_cnt);
		qp->s_rdma_ack_cnt--;
		return;
	}

	psn = be32_to_cpu(ohdr->bth[2]);
	reset_sending_psn(qp, psn);

	/*
	 * Start timer after a packet requesting an ACK has been sent and
	 * there are still requests that haven't been acked.
	 */
	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
	    !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) &&
	    (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
		start_timer(qp);

	while (qp->s_last != qp->s_acked) {
		wqe = get_swqe_ptr(qp, qp->s_last);
		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
			break;
		for (i = 0; i < wqe->wr.num_sge; i++) {
			struct qib_sge *sge = &wqe->sg_list[i];

			qib_put_mr(sge->mr);
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			memset(&wc, 0, sizeof wc);
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.qp = &qp->ibqp;
			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		if (++qp->s_last >= qp->s_size)
			qp->s_last = 0;
	}
	/*
	 * If we were waiting for sends to complete before resending,
	 * and they are now complete, restart sending.
	 */
	if (qp->s_flags & QIB_S_WAIT_PSN &&
	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		qp->s_flags &= ~QIB_S_WAIT_PSN;
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
		qib_schedule_send(qp);
	}
}

static inline void update_last_psn(struct qib_qp *qp, u32 psn)
{
	qp->s_last_psn = psn;
}

/*
 * Generate a SWQE completion.
 * This is similar to qib_send_complete but has to check to be sure
 * that the SGEs are not being referenced if the SWQE is being resent.
 */
static struct qib_swqe *do_rc_completion(struct qib_qp *qp,
					 struct qib_swqe *wqe,
					 struct qib_ibport *ibp)
{
	struct ib_wc wc;
	unsigned i;

	/*
	 * Don't decrement refcount and don't generate a
	 * completion if the SWQE is being resent until the send
	 * is finished.
	 */
	if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		for (i = 0; i < wqe->wr.num_sge; i++) {
			struct qib_sge *sge = &wqe->sg_list[i];

			qib_put_mr(sge->mr);
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			memset(&wc, 0, sizeof wc);
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.qp = &qp->ibqp;
			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		if (++qp->s_last >= qp->s_size)
			qp->s_last = 0;
	} else
		ibp->n_rc_delayed_comp++;

	qp->s_retry = qp->s_retry_cnt;
	update_last_psn(qp, wqe->lpsn);

	/*
	 * If we are completing a request which is in the process of
	 * being resent, we can stop resending it since we know the
	 * responder has already seen it.
	 */
	if (qp->s_acked == qp->s_cur) {
		if (++qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		qp->s_acked = qp->s_cur;
		wqe = get_swqe_ptr(qp, qp->s_cur);
		if (qp->s_acked != qp->s_tail) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		}
	} else {
		if (++qp->s_acked >= qp->s_size)
			qp->s_acked = 0;
		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
			qp->s_draining = 0;
		wqe = get_swqe_ptr(qp, qp->s_acked);
	}
	return wqe;
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode,
		     u64 val, struct qib_ctxtdata *rcd)
{
	struct qib_ibport *ibp;
	enum ib_wc_status status;
	struct qib_swqe *wqe;
	int ret = 0;
	u32 ack_psn;
	int diff;

	/* Remove QP from retry timer */
	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
		del_timer(&qp->s_timer);
	}

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include any ACK'ed requests.
	 */
	ack_psn = psn;
	if (aeth >> 29)
		ack_psn--;
	wqe = get_swqe_ptr(qp, qp->s_acked);
	ibp = to_iport(qp->ibqp.device, qp->port_num);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail;
		}
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
			/* Retry this request. */
			if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) {
				qp->r_flags |= QIB_R_RDMAR_SEQ;
				qib_restart_rc(qp, qp->s_last_psn + 1, 0);
				if (list_empty(&qp->rspwait)) {
					qp->r_flags |= QIB_R_RSP_SEND;
					atomic_inc(&qp->refcount);
					list_add_tail(&qp->rspwait,
						      &rcd->qp_wait_list);
				}
			}
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail;
		}
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
			u64 *vaddr = wqe->sg_list[0].vaddr;
			*vaddr = val;
		}
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & QIB_S_WAIT_FENCE) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~(QIB_S_WAIT_FENCE |
						 QIB_S_WAIT_ACK);
				qib_schedule_send(qp);
			} else if (qp->s_flags & QIB_S_WAIT_RDMAR) {
				qp->s_flags &= ~(QIB_S_WAIT_RDMAR |
						 QIB_S_WAIT_ACK);
				qib_schedule_send(qp);
			}
		}
		wqe = do_rc_completion(qp, wqe, ibp);
		if (qp->s_acked == qp->s_tail)
			break;
	}

	switch (aeth >> 29) {
	case 0: /* ACK */
		ibp->n_rc_acks++;
		if (qp->s_acked != qp->s_tail) {
			/*
			 * We are expecting more ACKs so
			 * reset the retransmit timer.
			 */
			start_timer(qp);
			/*
			 * We can stop resending the earlier packets and
			 * continue with the next packet the receiver wants.
			 */
			if (qib_cmp24(qp->s_psn, psn) <= 0)
				reset_psn(qp, psn + 1);
		} else if (qib_cmp24(qp->s_psn, psn) <= 0) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = psn + 1;
		}
		if (qp->s_flags & QIB_S_WAIT_ACK) {
			qp->s_flags &= ~QIB_S_WAIT_ACK;
			qib_schedule_send(qp);
		}
		qib_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		ret = 1;
		goto bail;

	case 1: /* RNR NAK */
		ibp->n_rnr_naks++;
		if (qp->s_acked == qp->s_tail)
			goto bail;
		if (qp->s_flags & QIB_S_WAIT_RNR)
			goto bail;
		if (qp->s_rnr_retry == 0) {
			status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;

		reset_psn(qp, psn);

		qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK);
		qp->s_flags |= QIB_S_WAIT_RNR;
		qp->s_timer.function = qib_rc_rnr_retry;
		qp->s_timer.expires = jiffies + usecs_to_jiffies(
			ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) &
					 QIB_AETH_CREDIT_MASK]);
		add_timer(&qp->s_timer);
		goto bail;

	case 3: /* NAK */
		if (qp->s_acked == qp->s_tail)
			goto bail;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> QIB_AETH_CREDIT_SHIFT) &
			QIB_AETH_CREDIT_MASK) {
		case 0: /* PSN sequence error */
			ibp->n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			qib_restart_rc(qp, psn, 0);
			qib_schedule_send(qp);
			break;

		case 1: /* Invalid Request */
			status = IB_WC_REM_INV_REQ_ERR;
			ibp->n_other_naks++;
			goto class_b;

		case 2: /* Remote Access Error */
			status = IB_WC_REM_ACCESS_ERR;
			ibp->n_other_naks++;
			goto class_b;

		case 3: /* Remote Operation Error */
			status = IB_WC_REM_OP_ERR;
			ibp->n_other_naks++;
class_b:
			if (qp->s_last == qp->s_acked) {
				qib_send_complete(qp, wqe, status);
				qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			}
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_retry = qp->s_retry_cnt;
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail;

	default: /* 2: reserved */
reserved:
		/* Ignore reserved NAK codes. */
		goto bail;
	}

bail:
	return ret;
}

/*
 * We have seen an out of sequence RDMA read middle or last packet.
 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
 */
static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn,
			 struct qib_ctxtdata *rcd)
{
	struct qib_swqe *wqe;

	/* Remove QP from retry timer */
	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
		del_timer(&qp->s_timer);
	}

	wqe = get_swqe_ptr(qp, qp->s_acked);

	while (qib_cmp24(psn, wqe->lpsn) > 0) {
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			break;
		wqe = do_rc_completion(qp, wqe, ibp);
	}

	ibp->n_rdma_seq++;
	qp->r_flags |= QIB_R_RDMAR_SEQ;
	qib_restart_rc(qp, qp->s_last_psn + 1, 0);
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= QIB_R_RSP_SEND;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
}

/**
 * qib_rc_rcv_resp - process an incoming RC response packet
 * @ibp: the port this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 *
 * This is called from qib_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static void qib_rc_rcv_resp(struct qib_ibport *ibp,
			    struct qib_other_headers *ohdr,
			    void *data, u32 tlen,
			    struct qib_qp *qp,
			    u32 opcode,
			    u32 psn, u32 hdrsize, u32 pmtu,
			    struct qib_ctxtdata *rcd)
{
	struct qib_swqe *wqe;
	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
	enum ib_wc_status status;
	unsigned long flags;
	int diff;
	u32 pad;
	u32 aeth;
	u64 val;

	if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
		/*
		 * If ACK'd PSN on SDMA busy list try to make progress to
		 * reclaim SDMA credits.
		 */
		if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
		    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {

			/*
			 * If send tasklet not running attempt to progress
			 * SDMA queue.
			 */
			if (!(qp->s_flags & QIB_S_BUSY)) {
				/* Acquire SDMA Lock */
				spin_lock_irqsave(&ppd->sdma_lock, flags);
				/* Invoke sdma make progress */
				qib_sdma_make_progress(ppd);
				/* Release SDMA Lock */
				spin_unlock_irqrestore(&ppd->sdma_lock, flags);
			}
		}
	}

	spin_lock_irqsave(&qp->s_lock, flags);
	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
		goto ack_done;

	/* Ignore invalid responses. */
	if (qib_cmp24(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = qib_cmp24(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			aeth = be32_to_cpu(ohdr->u.aeth);
			if ((aeth >> 29) == 0)
				qib_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	/*
	 * Skip everything other than the PSN we expect, if we are waiting
	 * for a reply to a restarted RDMA read or atomic op.
	 */
	if (qp->r_flags & QIB_R_RDMAR_SEQ) {
		if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
			goto ack_done;
		qp->r_flags &= ~QIB_R_RDMAR_SEQ;
	}

	if (unlikely(qp->s_acked == qp->s_tail))
		goto ack_done;
	wqe = get_swqe_ptr(qp, qp->s_acked);
	status = IB_WC_SUCCESS;

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
			__be32 *p = ohdr->u.at.atomic_ack_eth;

			val = ((u64) be32_to_cpu(p[0]) << 32) |
				be32_to_cpu(p[1]);
		} else
			val = 0;
		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		hdrsize += 4;
		wqe = get_swqe_ptr(qp, qp->s_acked);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/*
		 * We got a response so update the timeout.
		 * 4.096 usec. * (1 << qp->timeout)
		 */
		qp->s_flags |= QIB_S_TIMER;
		mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
		if (qp->s_flags & QIB_S_WAIT_ACK) {
			qp->s_flags &= ~QIB_S_WAIT_ACK;
			qib_schedule_send(qp);
		}

		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
			qp->s_retry = qp->s_retry_cnt;

		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
			goto ack_done;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + pad + 8)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = get_swqe_ptr(qp, qp->s_acked);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 8)))
			goto ack_len_err;
read_last:
		tlen -= hdrsize + pad + 8;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		aeth = be32_to_cpu(ohdr->u.aeth);
		qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
		WARN_ON(qp->s_rdma_read_sge.num_sge);
		(void) do_rc_ack(qp, aeth, psn,
				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
		goto ack_done;
	}

ack_op_err:
	status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_seq_err:
	rdma_seq_err(qp, ibp, psn, rcd);
	goto ack_done;

ack_len_err:
	status = IB_WC_LOC_LEN_ERR;
ack_err:
	if (qp->s_last == qp->s_acked) {
		qib_send_complete(qp, wqe, status);
		qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	}
ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}

/**
 * qib_rc_rcv_error - process an incoming duplicate or error RC packet
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 *
 * This is called from qib_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static int qib_rc_rcv_error(struct qib_other_headers *ohdr,
			    void *data,
			    struct qib_qp *qp,
			    u32 opcode,
			    u32 psn,
			    int diff,
			    struct qib_ctxtdata *rcd)
{
	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct qib_ack_entry *e;
	unsigned long flags;
	u8 i, prev;
	int old_req;

	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			ibp->n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			if (list_empty(&qp->rspwait)) {
				qp->r_flags |= QIB_R_RSP_NAK;
				atomic_inc(&qp->refcount);
				list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
			}
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 * old_req is true if there is an older response that is scheduled
	 * to be sent before sending this one.
	 */
	e = NULL;
	old_req = 1;
	ibp->n_rc_dupreq++;

	spin_lock_irqsave(&qp->s_lock, flags);

	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = QIB_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (qib_cmp24(psn, e->psn) >= 0) {
			if (prev == qp->s_tail_ack_queue &&
			    qib_cmp24(psn, e->lpsn) <= 0)
				old_req = 0;
			break;
		}
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
			goto unlock_done;
		/* RETH comes after BTH */
		reth = &ohdr->u.rc.reth;
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = ((psn - e->psn) & QIB_PSN_MASK) *
			qp->pmtu;
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len != e->rdma_sge.sge_length))
			goto unlock_done;
		if (e->rdma_sge.mr) {
			qib_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
					 IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->psn = psn;
		if (old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8) opcode || old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		/*
		 * Ignore this operation if it doesn't request an ACK
		 * or an earlier RDMA read or atomic is going to be resent.
		 */
		if (!(psn & IB_BTH_REQ_ACK) || old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}
		/*
		 * Try to send a simple ACK to work around a Mellanox bug
		 * which doesn't accept a RDMA read response or atomic
		 * response as an ACK for earlier SENDs or RDMA writes.
		 */
		if (!(qp->s_flags & QIB_S_RESP_PENDING)) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
			goto send_ack;
		}
		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		qp->s_tail_ack_queue = i;
		break;
	}
	qp->s_ack_state = OP(ACKNOWLEDGE);
	qp->s_flags |= QIB_S_RESP_PENDING;
	qp->r_nak_state = 0;
	qib_schedule_send(qp);

unlock_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return 1;

send_ack:
	return 0;
}

void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err)
{
	unsigned long flags;
	int lastwqe;

	spin_lock_irqsave(&qp->s_lock, flags);
	lastwqe = qib_error_qp(qp, err);
	spin_unlock_irqrestore(&qp->s_lock, flags);

	if (lastwqe) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}

static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n)
{
	unsigned next;

	next = n + 1;
	if (next > QIB_MAX_RDMA_ATOMIC)
		next = 0;
	qp->s_tail_ack_queue = next;
	qp->s_ack_state = OP(ACKNOWLEDGE);
}

/**
 * qib_rc_rcv - process an incoming RC packet
 * @rcd: the context pointer
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from qib_qp_rcv() to process an incoming RC packet
 * for the given QP.
 * Called at interrupt level.
 */
void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
		int has_grh, void *data, u32 tlen, struct qib_qp *qp)
{
	struct qib_ibport *ibp = &rcd->ppd->ibport_data;
	struct qib_other_headers *ohdr;
	u32 opcode;
	u32 hdrsize;
	u32 psn;
	u32 pad;
	struct ib_wc wc;
	u32 pmtu = qp->pmtu;
	int diff;
	struct ib_reth *reth;
	unsigned long flags;
	int ret;

	/* Check for GRH */
	if (!has_grh) {
		ohdr = &hdr->u.oth;
		hdrsize = 8 + 12;       /* LRH + BTH */
	} else {
		ohdr = &hdr->u.l.oth;
		hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
	}

	opcode = be32_to_cpu(ohdr->bth[0]);
	if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
		return;

	psn = be32_to_cpu(ohdr->bth[2]);
	opcode >>= 24;

	/*
	 * Process responses (ACKs) before anything else.  Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
				hdrsize, pmtu, rcd);
		return;
	}

	/* Compute 24 bits worth of difference. */
	diff = qib_cmp24(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
			return;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
		qp->r_flags |= QIB_R_COMM_EST;
		if (qp->ibqp.event_handler) {
			struct ib_event ev;

			ev.device = qp->ibqp.device;
			ev.element.qp = &qp->ibqp;
			ev.event = IB_EVENT_COMM_EST;
			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
		}
	}

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		ret = qib_get_rwqe(qp, 0);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
send_middle:
		/* Check for invalid length PMTU or posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		ret = qib_get_rwqe(qp, 1);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		ret = qib_get_rwqe(qp, 0);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto no_immediate_data;
		/* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
	case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
		wc.ex.imm_data = ohdr->u.imm_data;
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;
	case OP(SEND_LAST):
	case OP(RDMA_WRITE_LAST):
no_immediate_data:
		wc.wc_flags = 0;
		wc.ex.imm_data = 0;
send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/* Check for invalid length. */
		/* XXX LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
                /* Don't count the CRC. */
                tlen -= (hdrsize + pad + 4);
                wc.byte_len = tlen + qp->r_rcv_len;
                if (unlikely(wc.byte_len > qp->r_len))
                        goto nack_inv;
                qib_copy_sge(&qp->r_sge, data, tlen, 1);
                qib_put_ss(&qp->r_sge);
                qp->r_msn++;
                if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
                        break;
                wc.wr_id = qp->r_wr_id;
                wc.status = IB_WC_SUCCESS;
                if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
                    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
                        wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
                else
                        wc.opcode = IB_WC_RECV;
                wc.qp = &qp->ibqp;
                wc.src_qp = qp->remote_qpn;
                wc.slid = qp->remote_ah_attr.dlid;
                wc.sl = qp->remote_ah_attr.sl;
                /* zero fields that are N/A */
                wc.vendor_err = 0;
                wc.pkey_index = 0;
                wc.dlid_path_bits = 0;
                wc.port_num = 0;
                /* Signal completion event if the solicited bit is set. */
                qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
                             (ohdr->bth[0] &
                              cpu_to_be32(IB_BTH_SOLICITED)) != 0);
                break;

        case OP(RDMA_WRITE_FIRST):
        case OP(RDMA_WRITE_ONLY):
        case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
                if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
                        goto nack_inv;
                /* consume RWQE */
                reth = &ohdr->u.rc.reth;
                hdrsize += sizeof(*reth);
                qp->r_len = be32_to_cpu(reth->length);
                qp->r_rcv_len = 0;
                qp->r_sge.sg_list = NULL;
                if (qp->r_len != 0) {
                        u32 rkey = be32_to_cpu(reth->rkey);
                        u64 vaddr = be64_to_cpu(reth->vaddr);
                        int ok;

                        /* Check rkey & NAK */
                        ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
                                         rkey, IB_ACCESS_REMOTE_WRITE);
                        if (unlikely(!ok))
                                goto nack_acc;
                        qp->r_sge.num_sge = 1;
                } else {
                        qp->r_sge.num_sge = 0;
                        qp->r_sge.sge.mr = NULL;
                        qp->r_sge.sge.vaddr = NULL;
                        qp->r_sge.sge.length = 0;
                        qp->r_sge.sge.sge_length = 0;
                }
                if (opcode == OP(RDMA_WRITE_FIRST))
                        goto send_middle;
                else if (opcode == OP(RDMA_WRITE_ONLY))
                        goto no_immediate_data;
                ret = qib_get_rwqe(qp, 1);
                if (ret < 0)
                        goto nack_op_err;
                if (!ret)
                        goto rnr_nak;
                wc.ex.imm_data = ohdr->u.rc.imm_data;
                hdrsize += 4;
                wc.wc_flags = IB_WC_WITH_IMM;
                goto send_last;

        case OP(RDMA_READ_REQUEST): {
                struct qib_ack_entry *e;
                u32 len;
                u8 next;

                if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
                        goto nack_inv;
                next = qp->r_head_ack_queue + 1;
                /* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
                if (next > QIB_MAX_RDMA_ATOMIC)
                        next = 0;
                spin_lock_irqsave(&qp->s_lock, flags);
                if (unlikely(next == qp->s_tail_ack_queue)) {
                        if (!qp->s_ack_queue[next].sent)
                                goto nack_inv_unlck;
                        qib_update_ack_queue(qp, next);
                }
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
                if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
                        qib_put_mr(e->rdma_sge.mr);
                        e->rdma_sge.mr = NULL;
                }
                reth = &ohdr->u.rc.reth;
                len = be32_to_cpu(reth->length);
                if (len) {
                        u32 rkey = be32_to_cpu(reth->rkey);
                        u64 vaddr = be64_to_cpu(reth->vaddr);
                        int ok;

                        /* Check rkey & NAK */
                        ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr,
                                         rkey, IB_ACCESS_REMOTE_READ);
                        if (unlikely(!ok))
                                goto nack_acc_unlck;
                        /*
                         * Update the next expected PSN. We add 1 later
                         * below, so only add the remainder here.
                         */
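                        /*
                         * A read of len bytes is answered with
                         * DIV_ROUND_UP(len, pmtu) response packets; e.g.
                         * len == 3 * pmtu yields FIRST/MIDDLE/LAST at
                         * psn, psn + 1 and psn + 2, so (len - 1) / pmtu == 2
                         * is added here and the increment below then moves
                         * r_psn past the last response.
                         */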
                        if (len > pmtu)
                                qp->r_psn += (len - 1) / pmtu;
                } else {
                        e->rdma_sge.mr = NULL;
                        e->rdma_sge.vaddr = NULL;
                        e->rdma_sge.length = 0;
                        e->rdma_sge.sge_length = 0;
                }
                e->opcode = opcode;
                e->sent = 0;
                e->psn = psn;
                e->lpsn = qp->r_psn;
                /*
                 * We need to increment the MSN here instead of when we
                 * finish sending the result since a duplicate request would
                 * increment it more than once.
                 */
                qp->r_msn++;
                qp->r_psn++;
                qp->r_state = opcode;
                qp->r_nak_state = 0;
                qp->r_head_ack_queue = next;

                /* Schedule the send tasklet. */
                qp->s_flags |= QIB_S_RESP_PENDING;
                qib_schedule_send(qp);

                goto sunlock;
        }

        case OP(COMPARE_SWAP):
        case OP(FETCH_ADD): {
                struct ib_atomic_eth *ateth;
                struct qib_ack_entry *e;
                u64 vaddr;
                atomic64_t *maddr;
                u64 sdata;
                u32 rkey;
                u8 next;

                if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
                        goto nack_inv;
                next = qp->r_head_ack_queue + 1;
                if (next > QIB_MAX_RDMA_ATOMIC)
                        next = 0;
                spin_lock_irqsave(&qp->s_lock, flags);
                if (unlikely(next == qp->s_tail_ack_queue)) {
                        if (!qp->s_ack_queue[next].sent)
                                goto nack_inv_unlck;
                        qib_update_ack_queue(qp, next);
                }
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
                if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
                        qib_put_mr(e->rdma_sge.mr);
                        e->rdma_sge.mr = NULL;
                }
                ateth = &ohdr->u.atomic_eth;
                vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
                        be32_to_cpu(ateth->vaddr[1]);
                if (unlikely(vaddr & (sizeof(u64) - 1)))
                        goto nack_inv_unlck;
                rkey = be32_to_cpu(ateth->rkey);
                /* Check rkey & NAK */
                if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
                                          vaddr, rkey,
                                          IB_ACCESS_REMOTE_ATOMIC)))
                        goto nack_acc_unlck;
                /* Perform atomic OP and save result. */
                maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
                sdata = be64_to_cpu(ateth->swap_data);
                e->atomic_data = (opcode == OP(FETCH_ADD)) ?
                        (u64) atomic64_add_return(sdata, maddr) - sdata :
                        (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
                                      be64_to_cpu(ateth->compare_data),
                                      sdata);
                qib_put_mr(qp->r_sge.sge.mr);
                qp->r_sge.num_sge = 0;
                e->opcode = opcode;
                e->sent = 0;
                e->psn = psn;
                e->lpsn = psn;
                qp->r_msn++;
                qp->r_psn++;
                qp->r_state = opcode;
                qp->r_nak_state = 0;
                qp->r_head_ack_queue = next;

                /* Schedule the send tasklet. */
                qp->s_flags |= QIB_S_RESP_PENDING;
                qib_schedule_send(qp);

                goto sunlock;
        }

        default:
                /* NAK unknown opcodes. */
                goto nack_inv;
        }
        qp->r_psn++;
        qp->r_state = opcode;
        qp->r_ack_psn = psn;
        qp->r_nak_state = 0;
        /* Send an ACK if requested or required. */
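        /*
         * Bit 31 of bth[2] is the BTH AckReq bit (tested as IB_BTH_REQ_ACK
         * elsewhere in this file); only the low 24 bits carry the PSN, so
         * the flag survives in psn and can be tested directly here.
         */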
        if (psn & (1 << 31))
                goto send_ack;
        return;

rnr_nak:
        qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
        qp->r_ack_psn = qp->r_psn;
        /* Queue RNR NAK for later */
        if (list_empty(&qp->rspwait)) {
                qp->r_flags |= QIB_R_RSP_NAK;
                atomic_inc(&qp->refcount);
                list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
        }
        return;

nack_op_err:
        qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
        qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
        qp->r_ack_psn = qp->r_psn;
        /* Queue NAK for later */
        if (list_empty(&qp->rspwait)) {
                qp->r_flags |= QIB_R_RSP_NAK;
                atomic_inc(&qp->refcount);
                list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
        }
        return;

nack_inv_unlck:
        spin_unlock_irqrestore(&qp->s_lock, flags);
nack_inv:
        qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
        qp->r_nak_state = IB_NAK_INVALID_REQUEST;
        qp->r_ack_psn = qp->r_psn;
        /* Queue NAK for later */
        if (list_empty(&qp->rspwait)) {
                qp->r_flags |= QIB_R_RSP_NAK;
                atomic_inc(&qp->refcount);
                list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
        }
        return;

nack_acc_unlck:
        spin_unlock_irqrestore(&qp->s_lock, flags);
nack_acc:
        qib_rc_error(qp, IB_WC_LOC_PROT_ERR);
        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
        qp->r_ack_psn = qp->r_psn;
send_ack:
        qib_send_rc_ack(qp);
        return;

sunlock:
        spin_unlock_irqrestore(&qp->s_lock, flags);
}