[openib-general] [PATCH] for review to timeout send MADs

Sean Hefty
Wed Oct 6 16:01:44 PDT 2004


Here are some modifications to support timing out send MADs in the access layer.  I haven't tested this code beyond building it, but I wanted to make it available for review.  There are a few race conditions that need to be avoided when handling timeouts, so if it looks like something was missed, let me know.

Hal, can you either send me your test code or check it into svn somewhere?  I want to verify that this doesn't break your current tests, then expand the tests to check that the timeout code is working properly.

Additional comments:
A couple of structure fields were renamed to better reflect their new usage.

mad_agents now have two lists: send_list and wait_list.  A mad_send_wr on the send_list has active work requests posted for it.  Once all of its work requests have completed, if the mad_send_wr has a timeout and hasn't been canceled, it is moved to the wait_list.
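
In pseudo-C, the lifecycle boils down to the following (condensed from the patch below; error and cancel paths omitted):

        /* ib_post_send_mad: one reference for the work request posted to
         * the QP, plus one if a response (and thus a timeout) is expected. */
        mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
        list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->send_list);

        /* ib_mad_complete_send_wr: a send work request completed.  If a
         * reference remains for the response, move the request from the
         * send_list to the wait_list; otherwise report completion. */
        if (--mad_send_wr->refcount > 0) {
                if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
                    mad_send_wr->status == IB_WC_SUCCESS)
                        wait_for_response(mad_agent_priv, mad_send_wr);
        } else
                list_del(&mad_send_wr->agent_list);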

A workqueue is used to schedule delayed processing of MAD timeouts.  The scheduling delay of the timeout work item is adjusted whenever a send completes or is canceled.  If anyone sees an issue with my usage of the workqueue, just let me know.
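
For reference, the workqueue usage is the standard delayed-work pattern (again condensed from the patch).  The wait_list is kept sorted by absolute expiration time in jiffies, so a reschedule is only needed when a request with a new earliest timeout shows up:

        /* ib_register_mad_agent: one delayed work item per agent. */
        INIT_WORK(&mad_agent_priv->work, timeout_sends, mad_agent_priv);

        /* wait_for_response: only reschedule if the new request landed at
         * the head of the sorted wait_list, i.e. it expires soonest. */
        if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) {
                cancel_delayed_work(&mad_agent_priv->work);
                schedule_delayed_work(&mad_agent_priv->work, delay);
        }

        /* ib_unregister_mad_agent: make sure the work item is not running
         * before tearing down the agent. */
        cancel_delayed_work(&mad_agent_priv->work);
        flush_scheduled_work();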

- Sean

Index: access/ib_mad_priv.h
===================================================================
--- access/ib_mad_priv.h        (revision 946)
+++ access/ib_mad_priv.h        (working copy)
@@ -58,6 +58,8 @@
 
 #include <linux/pci.h>
 #include <linux/kthread.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
 #include <ib_mad.h>
 #include <ib_smi.h>
 
@@ -106,8 +108,11 @@
        struct ib_mad_reg_req *reg_req;
        struct ib_mad_port_private *port_priv;
 
-       spinlock_t send_list_lock;
+       spinlock_t lock;
        struct list_head send_list;
+       struct list_head wait_list;
+       struct work_struct work;
+       unsigned long timeout;
 
        atomic_t refcount;
        wait_queue_head_t wait;
@@ -116,11 +121,11 @@
 
 struct ib_mad_send_wr_private {
        struct list_head send_list;
-       struct list_head agent_send_list;
+       struct list_head agent_list;
        struct ib_mad_agent *agent;
        u64 wr_id;                      /* client WR ID */
        u64 tid;
-       int timeout_ms;
+       unsigned long timeout;
        int refcount;
        enum ib_wc_status status;
 };
Index: access/ib_mad.c
===================================================================
--- access/ib_mad.c     (revision 946)
+++ access/ib_mad.c     (working copy)
@@ -87,6 +87,7 @@
 static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
 static void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
                                    struct ib_mad_send_wc *mad_send_wc);
+static void timeout_sends(void *data);
 
 /*
  * ib_register_mad_agent - Register to send/receive MADs
@@ -225,8 +226,10 @@
        list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
        spin_unlock_irqrestore(&port_priv->reg_lock, flags);
 
-       spin_lock_init(&mad_agent_priv->send_list_lock);
+       spin_lock_init(&mad_agent_priv->lock);
        INIT_LIST_HEAD(&mad_agent_priv->send_list);
+       INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+       INIT_WORK(&mad_agent_priv->work, timeout_sends, mad_agent_priv);
        atomic_set(&mad_agent_priv->refcount, 1);
        init_waitqueue_head(&mad_agent_priv->wait);
        mad_agent_priv->port_priv = port_priv;
@@ -254,17 +257,26 @@
        mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
                                      agent);
 
-       /* Cleanup pending receives for this agent !!! */
+       /* Note that we could still be handling received MADs. */
+
+       /*
+        * Canceling all sends results in dropping received response MADs,
+        * preventing us from queuing additional work.
+        */
        cancel_mads(mad_agent_priv);
 
+       cancel_delayed_work(&mad_agent_priv->work);
+       flush_scheduled_work();
+
        spin_lock_irqsave(&mad_agent_priv->port_priv->reg_lock, flags);
        remove_mad_reg_req(mad_agent_priv);
        list_del(&mad_agent_priv->agent_list);
        spin_unlock_irqrestore(&mad_agent_priv->port_priv->reg_lock, flags);
+
+       /* Cleanup pending RMPP receives for this agent !!! */
 
        atomic_dec(&mad_agent_priv->refcount);
-       wait_event(mad_agent_priv->wait,
-                  !atomic_read(&mad_agent_priv->refcount));
+       wait_event(mad_agent_priv->wait, !atomic_read(&mad_agent_priv->refcount));
 
        if (mad_agent_priv->reg_req)
                kfree(mad_agent_priv->reg_req);
@@ -346,19 +358,19 @@
 
                mad_send_wr->tid = send_wr->wr.ud.mad_hdr->tid;
                mad_send_wr->agent = mad_agent;
-               mad_send_wr->timeout_ms = cur_send_wr->wr.ud.timeout_ms;
-               if (mad_send_wr->timeout_ms)
-                       mad_send_wr->refcount = 2;
-               else
-                       mad_send_wr->refcount = 1;
+               /* Timeout will be updated after send completes. */
+               mad_send_wr->timeout = msecs_to_jiffies(
+                                       cur_send_wr->wr.ud.timeout_ms);
+               /* One reference for each work request to QP + response. */
+               mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
                mad_send_wr->status = IB_WC_SUCCESS;
 
                /* Reference MAD agent until send completes */
                atomic_inc(&mad_agent_priv->refcount);
-               spin_lock_irqsave(&mad_agent_priv->send_list_lock, flags);
-               list_add_tail(&mad_send_wr->agent_send_list,
+               spin_lock_irqsave(&mad_agent_priv->lock, flags);
+               list_add_tail(&mad_send_wr->agent_list,
                              &mad_agent_priv->send_list);
-               spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+               spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
                cur_send_wr->next = NULL;
                ret = ib_send_mad(mad_agent_priv, mad_send_wr,
@@ -367,16 +379,12 @@
                        /* Handle QP overrun separately... -ENOMEM */
 
                        /* Fail send request */
-                       spin_lock_irqsave(&mad_agent_priv->send_list_lock,
-                                         flags);
-                       list_del(&mad_send_wr->agent_send_list);
-                       spin_unlock_irqrestore(&mad_agent_priv->send_list_lock,
-                                              flags);
+                       spin_lock_irqsave(&mad_agent_priv->lock, flags);
+                       list_del(&mad_send_wr->agent_list);
+                       spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
                        *bad_send_wr = cur_send_wr;
-                       if (atomic_dec_and_test(&mad_agent_priv->refcount))
-                               wake_up(&mad_agent_priv->wait);
-
+                       atomic_dec(&mad_agent_priv->refcount);
                        return ret;             
                }
                cur_send_wr= next_send_wr;
@@ -690,28 +698,35 @@
                        }
                }
                if (!mad_agent) {
-                       printk(KERN_ERR "No client 0x%x for received MAD on port %d\n",
-                              hi_tid, port_priv->port_num);
+                       printk(KERN_ERR "No client 0x%x for received MAD on "
+                              "port %d\n", hi_tid, port_priv->port_num);
                        goto ret;
                }
        } else {
                /* Routing is based on version, class, and method */
                if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION) {
-                       printk(KERN_ERR "MAD received with unsupported class version %d on port %d\n",
+                       printk(KERN_ERR "MAD received with unsupported class "
+                              "version %d on port %d\n",
                               mad->mad_hdr.class_version, port_priv->port_num);
                        goto ret;
                }
                version = port_priv->version[mad->mad_hdr.class_version];
                if (!version) {
-                       printk(KERN_ERR "MAD received on port %d for class version %d with no client\n", port_priv->port_num, mad->mad_hdr.class_version);
+                       printk(KERN_ERR "MAD received on port %d for class "
+                              "version %d with no client\n",
+                              port_priv->port_num, mad->mad_hdr.class_version);
                        goto ret;
                }
-               class = version->method_table[convert_mgmt_class(mad->mad_hdr.mgmt_class)];       
+               class = version->method_table[convert_mgmt_class(
+                                             mad->mad_hdr.mgmt_class)];
                if (!class) {
-                       printk(KERN_ERR "MAD received on port %d for class %d with no client\n", port_priv->port_num, mad->mad_hdr.mgmt_class);
+                       printk(KERN_ERR "MAD received on port %d for class %d "
+                              "with no client\n", port_priv->port_num,
+                              mad->mad_hdr.mgmt_class);
                        goto ret;
                }
-               mad_agent = class->agent[mad->mad_hdr.method & ~IB_MGMT_METHOD_RESP];         
+               mad_agent = class->agent[mad->mad_hdr.method & 
+                                        ~IB_MGMT_METHOD_RESP];         
        }
 
 ret:
@@ -724,8 +739,8 @@
 
        /* Make sure MAD base version is understood */
        if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
-               printk(KERN_ERR "MAD received with unsupported base version %d\n",
-                      mad->mad_hdr.base_version);
+               printk(KERN_ERR "MAD received with unsupported base "
+                      "version %d\n", mad->mad_hdr.base_version);
                goto ret;
        }
 
@@ -761,16 +776,24 @@
 {
        struct ib_mad_send_wr_private *mad_send_wr;
 
+       list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+                           agent_list) {
+
+               if (mad_send_wr->tid == tid)
+                       return mad_send_wr;
+       }
+
+       /*
+        * It's possible to receive the response before we've been notified
+        * that the send has completed.
+        */
        list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
-                           agent_send_list) {
+                           agent_list) {
 
-               if (mad_send_wr->tid == tid) {
-                       /* Verify request is still valid */
-                       if (mad_send_wr->status == IB_WC_SUCCESS &&
-                           mad_send_wr->timeout_ms)
-                               return mad_send_wr;
-                       else
-                               return NULL;
+               if (mad_send_wr->tid == tid && mad_send_wr->timeout) {
+                       /* Verify request has not been canceled. */
+                       return (mad_send_wr->status == IB_WC_SUCCESS) ?
+                               mad_send_wr : NULL;
                }
        }
        return NULL;
@@ -791,17 +814,17 @@
 
        /* Complete corresponding request */
        if (solicited) {
-               spin_lock_irqsave(&mad_agent_priv->send_list_lock, flags);
+               spin_lock_irqsave(&mad_agent_priv->lock, flags);
                mad_send_wr = find_send_req(mad_agent_priv,
                                            recv->mad.mad.mad_hdr.tid);
                if (!mad_send_wr) {
-                       spin_unlock_irqrestore(&mad_agent_priv->send_list_lock,
-                                              flags);
+                       spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
                        ib_free_recv_mad(&recv->header.recv_wc);
                        return;
                }
-               mad_send_wr->timeout_ms = 0;
-               spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+               /* Timeout = 0 means that we won't wait for a response. */
+               mad_send_wr->timeout = 0;
+               spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
                /* Defined behavior is to complete response before request */
                mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
@@ -849,20 +872,20 @@
        spin_lock_irqsave(&port_priv->recv_list_lock, flags);
        if (!list_empty(&port_priv->recv_posted_mad_list[qpn])) {
                rbuf = list_entry(&port_priv->recv_posted_mad_list[qpn],
-                                struct ib_mad_recv_buf,
-                                list);
-               rbuf = (struct ib_mad_recv_buf *)rbuf->list.next;
+                                 struct ib_mad_recv_buf,
+                                 list);
+               rbuf = (struct ib_mad_recv_buf*)rbuf->list.next;
                mad_priv_hdr = container_of(rbuf, struct ib_mad_private_header,
                                            recv_buf);
-               recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+               recv = container_of(mad_priv_hdr,struct ib_mad_private,header);
        
                /* Remove from posted receive MAD list */
                list_del(&recv->header.recv_buf.list);
                port_priv->recv_posted_mad_count[qpn]--;
 
        } else {
-               printk(KERN_ERR "Receive completion WR ID 0x%Lx on QP %d with no"
-                      "posted receive\n", wc->wr_id, qp_num);
+               printk(KERN_ERR "Receive completion WR ID 0x%Lx on QP %d with "
+                      "no posted receive\n", wc->wr_id, qp_num);
                spin_unlock_irqrestore(&port_priv->recv_list_lock, flags);
                ib_mad_post_receive_mad(port_priv, port_priv->qp[qp_num]);
                return;
@@ -893,7 +916,8 @@
                                   solicited);
        if (!mad_agent) {
                spin_unlock_irqrestore(&port_priv->reg_lock, flags);
-               printk(KERN_NOTICE "No matching mad agent found for received MAD on port %d\n", port_priv->port_num);      
+               printk(KERN_NOTICE "No matching mad agent found for received "
+                      "MAD on port %d\n", port_priv->port_num);   
        } else {
                atomic_inc(&mad_agent->refcount);
                spin_unlock_irqrestore(&port_priv->reg_lock, flags);
@@ -911,6 +935,59 @@
        return;
 }
 
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+       struct ib_mad_send_wr_private *mad_send_wr;
+       unsigned long delay;
+
+       if (list_empty(&mad_agent_priv->wait_list)) {
+               cancel_delayed_work(&mad_agent_priv->work);
+       } else {
+
+               mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+                                        struct ib_mad_send_wr_private,
+                                        agent_list);
+
+               if (time_after(mad_agent_priv->timeout, mad_send_wr->timeout)) {
+
+                       mad_agent_priv->timeout = mad_send_wr->timeout;
+                       cancel_delayed_work(&mad_agent_priv->work);
+                       delay = mad_send_wr->timeout - jiffies;
+                       if ((long)delay <= 0)
+                               delay = 1;
+                       schedule_delayed_work(&mad_agent_priv->work, delay);
+               }
+       }
+}
+
+static void wait_for_response(struct ib_mad_agent_private *mad_agent_priv,
+                             struct ib_mad_send_wr_private *mad_send_wr )
+{
+       struct ib_mad_send_wr_private *temp_mad_send_wr;
+       struct list_head *list_item;
+       unsigned long delay;
+
+       list_del(&mad_send_wr->agent_list);
+
+       delay = mad_send_wr->timeout;
+       mad_send_wr->timeout += jiffies;
+
+       list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+               temp_mad_send_wr = list_entry(list_item,
+                                             struct ib_mad_send_wr_private,
+                                             agent_list);
+               if (time_after(mad_send_wr->timeout, temp_mad_send_wr->timeout))
+                       break;
+       }
+       list_add(&mad_send_wr->agent_list, list_item);
+
+       /* Re-schedule a work item if we have a shorter timeout. */
+       if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) {
+               cancel_delayed_work(&mad_agent_priv->work);
+               schedule_delayed_work(&mad_agent_priv->work, delay);
+       }
+}
+
 /*
  * Process a send work completion.
  */
@@ -923,30 +1000,27 @@
        mad_agent_priv = container_of(mad_send_wr->agent,
                                      struct ib_mad_agent_private, agent);
 
-       spin_lock_irqsave(&mad_agent_priv->send_list_lock, flags);
+       spin_lock_irqsave(&mad_agent_priv->lock, flags);
        if (mad_send_wc->status != IB_WC_SUCCESS &&
            mad_send_wr->status == IB_WC_SUCCESS) {
 
                mad_send_wr->status = mad_send_wc->status;
-               if (mad_send_wr->timeout_ms) {
-                       mad_send_wr->timeout_ms = 0;
-                       mad_send_wr->refcount--;
-               }
+               mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
        }
 
-       /*
-        * Leave sends with timeouts on the send list
-        * until either matching response is received
-        * or timeout occurs
-        */
        if (--mad_send_wr->refcount > 0) {
-               spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+               if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+                   mad_send_wr->status == IB_WC_SUCCESS) {
+                       wait_for_response(mad_agent_priv, mad_send_wr);
+               }
+               spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
                return;
        }
 
        /* Remove send from MAD agent and notify client of completion */
-       list_del(&mad_send_wr->agent_send_list);
-       spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+       list_del(&mad_send_wr->agent_list);
+       adjust_timeout(mad_agent_priv);
+       spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
        
        if (mad_send_wr->status != IB_WC_SUCCESS )
                mad_send_wc->status = mad_send_wr->status;
@@ -1045,40 +1119,33 @@
 
        INIT_LIST_HEAD(&cancel_list);
 
-       spin_lock_irqsave(&mad_agent_priv->send_list_lock, flags);
+       spin_lock_irqsave(&mad_agent_priv->lock, flags);
        list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
-                                &mad_agent_priv->send_list, agent_send_list) {
+                                &mad_agent_priv->send_list, agent_list) {
 
-               if (mad_send_wr->status == IB_WC_SUCCESS)
+               if (mad_send_wr->status == IB_WC_SUCCESS) {
                        mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
-
-               if (mad_send_wr->timeout_ms) {
-                       mad_send_wr->timeout_ms = 0;
-                       mad_send_wr->refcount--;
-               }
-
-               if (mad_send_wr->refcount == 0) {
-                       list_del(&mad_send_wr->agent_send_list);
-                       list_add_tail(&mad_send_wr->agent_send_list,
-                                     &cancel_list);
+                       mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
                }
        }
-       spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+
+       /* Empty wait list to prevent receives from finding a request. */
+       list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+       spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
        /* Report all cancelled requests */
        mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
        mad_send_wc.vendor_err = 0;
 
        list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
-                                &cancel_list, agent_send_list) {
+                                &cancel_list, agent_list) {
 
                mad_send_wc.wr_id = mad_send_wr->wr_id;
                mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
                                                   &mad_send_wc);
 
-               list_del(&mad_send_wr->agent_send_list);
+               list_del(&mad_send_wr->agent_list);
                kfree(mad_send_wr);
-
                atomic_dec(&mad_agent_priv->refcount);
        }
 }
@@ -1089,11 +1156,18 @@
 {
        struct ib_mad_send_wr_private *mad_send_wr;
 
+       list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+                           agent_list) {
+               if (mad_send_wr->wr_id == wr_id)
+                       return mad_send_wr;
+       }
+
        list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
-                           agent_send_list) {
+                           agent_list) {
                if (mad_send_wr->wr_id == wr_id)
                        return mad_send_wr;
        }
+
        return NULL;
 }
 
@@ -1107,28 +1181,25 @@
 
        mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
                                      agent);
-       spin_lock_irqsave(&mad_agent_priv->send_list_lock, flags);
+       spin_lock_irqsave(&mad_agent_priv->lock, flags);
        mad_send_wr = find_send_by_wr_id(mad_agent_priv, wr_id);
        if (!mad_send_wr) {
-               spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+               spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
                goto ret;
        }
 
        if (mad_send_wr->status == IB_WC_SUCCESS)
-               mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
-
-       if (mad_send_wr->timeout_ms) {
-               mad_send_wr->timeout_ms = 0;
-               mad_send_wr->refcount--;
-       }
+               mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
 
        if (mad_send_wr->refcount != 0) {
-               spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+               mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+               spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
                goto ret;
        }
 
-       list_del(&mad_send_wr->agent_send_list);
-       spin_unlock_irqrestore(&mad_agent_priv->send_list_lock, flags);
+       list_del(&mad_send_wr->agent_list);
+       adjust_timeout(mad_agent_priv);
+       spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
        mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
        mad_send_wc.vendor_err = 0;
@@ -1145,6 +1216,47 @@
 }
 EXPORT_SYMBOL(ib_cancel_mad);
 
+static void timeout_sends(void *data)
+{
+       struct ib_mad_agent_private *mad_agent_priv;
+       struct ib_mad_send_wr_private *mad_send_wr;
+       struct ib_mad_send_wc mad_send_wc;
+       unsigned long flags, delay;
+
+       mad_agent_priv = (struct ib_mad_agent_private*)data;
+
+       mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+       mad_send_wc.vendor_err = 0;
+
+       spin_lock_irqsave(&mad_agent_priv->lock, flags);
+       while (!list_empty(&mad_agent_priv->wait_list)) {
+
+               mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+                                        struct ib_mad_send_wr_private,
+                                        agent_list);
+
+               if (time_after(mad_send_wr->timeout, jiffies)) {
+                       delay = mad_send_wr->timeout - jiffies;
+                       if ((long)delay <= 0)
+                               delay = 1;
+                       schedule_delayed_work(&mad_agent_priv->work, delay);
+                       break;
+               }
+
+               list_del(&mad_send_wr->agent_list);
+               spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+               mad_send_wc.wr_id = mad_send_wr->wr_id;
+               mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+                                                  &mad_send_wc);
+
+               kfree(mad_send_wr);
+               atomic_dec(&mad_agent_priv->refcount);
+               spin_lock_irqsave(&mad_agent_priv->lock, flags);
+       }
+       spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
 /*
  * IB MAD thread
  */
@@ -1815,6 +1927,8 @@
 
 static int __init ib_mad_init_module(void)
 {
+       int ret;
+
        ib_mad_cache = kmem_cache_create("ib_mad",
                                         sizeof(struct ib_mad_private),
                                         0,
@@ -1830,10 +1944,14 @@
 
        if (ib_register_client(&mad_client)) {
                printk(KERN_ERR "Couldn't register ib_mad client\n");
-               return -EINVAL;
+               ret = -EINVAL;
+               goto error;
        }
-
        return 0;
+
+error:
+       kmem_cache_destroy(ib_mad_cache);
+       return ret;
 }
 
 static void __exit ib_mad_cleanup_module(void)


