lots of useless recvfrom() calls

Sun Jun 3 00:52:06 MSK 2018

On Tue, May 08, 2018 at 12:12:54PM +0300, Lennert Buytenhek wrote:

> > > I am using the patch below to avoid one useless recvfrom() call (that
> > > returns EAGAIN) for every call to dns_ioevent().  Under low load, where
> > > every call to dns_ioevent() processes one DNS reply, this saves half of
> > > the total number of recvfrom() calls.
> > 
> > Is it really a problem under low load? If the load is low, it doesn't
> > really matter how many recvfrom() system calls are made, I think.
> 
> 'low load' here is 'a low expected number of reply packets per POLLIN
> cycle', but even for a large number of queries per second and multiple
> parallel queries, I would not expect there to be more than one reply
> packet per POLLIN cycle very often.
> 
> Is it a 'problem' per se, well, probably not, since it's not a
> correctness issue.  But it looks ugly in strace, and the difference in
> (system) CPU time used is visible in a microbenchmark (kPTI woohoo),
> and it's basically just stealing some CPU cycles from more useful tasks.
> 
> (Unfortunately, my only benchmark is synthetic, but I've attached it
> below.  It needs ivykis, which should be in your distro, and iv_udns
> which I CCd you on.  Run it as:
> 
> 	$ ./bench <parallel_queries> <seconds_to_run_for>
> 
> )

I've been testing with the ugly patch below, which adds dns_ioevent_lt(),
which is an alternative version of dns_ioevent() that only ever polls
the receive fd once.  (The naming is horrible, I agree.  Maybe something
like dns_ioevent_once() would be better?)

My benchmark is between two Skylake machines, both connected to a
gigabit LAN, one running bind serving an authoritative zone, and the
other running the benchmark program I sent earlier, with 15 second runs
of DNS queries at 1 concurrent outstanding query, and using dns_ioevent()
("stock") and dns_ioevent_lt() ("earlyout") on alternating runs, for a
whole bunch of runs, where I measure the amount of user time used in
every 15 second run.  (The local network is slightly noisy, and
sometimes it drops queries, so I discard data for runs where we had to
retransmit.)

The benchmark is still running, but so far I have this:

$ ministat -c 99.5 -w 73 -q stock.user earlyout.user
x stock.user
+ earlyout.user
    N           Min           Max        Median           Avg        Stddev
x 747          0.17          0.94          0.57    0.55676037    0.10852262
+ 743          0.19          0.86          0.54     0.5276716    0.10745821
Difference at 99.5% confidence
        -0.0290888 +/- 0.0172899
        -5.22465% +/- 3.10545%
        (Student's t, pooled s = 0.107993)
$

This shows a small but statistically significant difference in user time
between the recv-until-EAGAIN and recv-only-once approaches, where the
latter uses on average 5.2% less user CPU time (+/- 3.1%).  This is on
kernel 4.16.12, with kPTI enabled.



> > Also, if the number of system calls really is a concern, maybe using
> > recvmmsg() instead of recvfrom(), on linux, will be a good idea.
> 
> Sounds good!  But it's probably worth microbenchmarking to see how
> much slower recvmmsg() is compared to recvfrom() for the case where
> there is one reply, I'll do this if you don't beat me to it.

I'll do this next.

diff --git a/udns.h b/udns.h
index 371e697..e9e283e 100644
--- a/udns.h
+++ b/udns.h
@@ -460,6 +460,8 @@ dns_setstatus(struct dns_ctx *ctx, int status);
 /* handle I/O event on UDP socket */
 UDNS_API void
 dns_ioevent(struct dns_ctx *ctx, time_t now);
+UDNS_API void
+dns_ioevent_lt(struct dns_ctx *ctx, time_t now);
 
 /* process any timeouts, return time in secounds to the
  * next timeout (or -1 if none) but not greather than maxwait */
diff --git a/udns_resolver.c b/udns_resolver.c
index b8f899a..a626fa6 100644
--- a/udns_resolver.c
+++ b/udns_resolver.c
@@ -947,18 +947,7 @@ dns_submit_p(struct dns_ctx *ctx,
     dns_submit_dn(ctx, ctx->dnsc_pbuf, qcls, qtyp, flags, parse, cbck, data);
 }
 
-/* process readable fd condition.
- * To be usable in edge-triggered environment, the routine
- * should consume all input so it should loop over.
- * Note it isn't really necessary to loop here, because
- * an application may perform the loop just fine by it's own,
- * but in this case we should return some sensitive result,
- * to indicate when to stop calling and error conditions.
- * Note also we may encounter all sorts of recvfrom()
- * errors which aren't fatal, and at the same time we may
- * loop forever if an error IS fatal.
- */
-void dns_ioevent(struct dns_ctx *ctx, time_t now) {
+static int dns_ioevent_internal(struct dns_ctx *ctx, time_t now) {
   int r;
   unsigned servi;
   struct dns_query *q;
@@ -968,16 +957,8 @@ void dns_ioevent(struct dns_ctx *ctx, time_t now) {
   union sockaddr_ns sns;
   socklen_t slen;
 
-  SETCTX(ctx);
-  if (!CTXOPEN(ctx))
-    return;
-  dns_assert_ctx(ctx);
   pbuf = ctx->dnsc_pbuf;
 
-  if (!now) now = time(NULL);
-
-again: /* receive the reply */
-
   slen = sizeof(sns);
   r = recvfrom(ctx->dnsc_udpsock, (void*)pbuf, ctx->dnsc_udpbuf,
                MSG_DONTWAIT, &sns.sa, &slen);
@@ -996,11 +977,8 @@ again: /* receive the reply */
 #else
     if (errno == EAGAIN)
 #endif
-    {
-      dns_request_utm(ctx, now);
-      return;
-    }
-    goto again;
+      return 0;
+    return 1;
   }
 
   pend = pbuf + r;
@@ -1009,7 +987,7 @@ again: /* receive the reply */
   /* check reply header */
   if (pcur > pend || dns_numqd(pbuf) > 1 || dns_opcode(pbuf) != 0) {
     DNS_DBG(ctx, -1/*bad reply*/, &sns.sa, slen, pbuf, r);
-    goto again;
+    return 1;
   }
 
   /* find the matching query, by qID */
@@ -1017,7 +995,7 @@ again: /* receive the reply */
     if (!q) {
       /* no more requests: old reply? */
       DNS_DBG(ctx, -5/*no matching query*/, &sns.sa, slen, pbuf, r);
-      goto again;
+      return 1;
     }
     if (pbuf[DNS_H_QID1] == q->dnsq_id[0] &&
         pbuf[DNS_H_QID2] == q->dnsq_id[1])
@@ -1031,13 +1009,13 @@ again: /* receive the reply */
     if (dns_getdn(pbuf, &pcur, pend, dn, sizeof(dn)) < 0 ||
         pcur + 4 > pend) {
       DNS_DBG(ctx, -1/*bad reply*/, &sns.sa, slen, pbuf, r);
-      goto again;
+      return 1;
     }
     if (!dns_dnequal(dn, q->dnsq_dn) ||
         memcmp(pcur, q->dnsq_typcls, 4) != 0) {
       /* not this query */
       DNS_DBG(ctx, -5/*no matching query*/, &sns.sa, slen, pbuf, r);
-      goto again;
+      return 1;
     }
     /* here, query match, and pcur points past qDN in query section in pbuf */
   }
@@ -1045,7 +1023,7 @@ again: /* receive the reply */
   else if (dns_rcode(pbuf) != DNS_R_FORMERR) {
     /* treat it as bad reply if !FORMERR */
     DNS_DBG(ctx, -1/*bad reply*/, &sns.sa, slen, pbuf, r);
-    goto again;
+    return 1;
   }
   else {
     /* else it's FORMERR, handled below */
@@ -1072,7 +1050,7 @@ again: /* receive the reply */
    * Note we can receive reply from first try if we're already at next */
   if (!(q->dnsq_servwait & (1 << servi))) { /* if ever asked this NS */
     DNS_DBG(ctx, -2/*wrong server*/, &sns.sa, slen, pbuf, r);
-    goto again;
+    return 1;
   }
 
   /* we got (some) reply for our query */
@@ -1125,7 +1103,7 @@ again: /* receive the reply */
       memcpy(result, pbuf, r);
       dns_end_query(ctx, q, r, result);
     }
-    goto again;
+    return 1;
 
   case DNS_R_NXDOMAIN:	/* Non-existing domain. */
     if (dns_next_srch(ctx, q))
@@ -1136,7 +1114,7 @@ again: /* receive the reply */
        * if we've seen it before, or NXDOMAIN if not. */
       dns_end_query(ctx, q,
            q->dnsq_flags & DNS_SEEN_NODATA ? DNS_E_NODATA : DNS_E_NXDOMAIN, 0);
-    goto again;
+    return 1;
 
   case DNS_R_FORMERR:
   case DNS_R_NOTIMPL:
@@ -1155,7 +1133,7 @@ again: /* receive the reply */
        */
       q->dnsq_servnEDNS0 |= 1 << servi;
       dns_send_this(ctx, q, servi, now);
-      goto again;
+      return 1;
     }
     /* else we handle it the same as SERVFAIL etc */
 
@@ -1195,8 +1173,44 @@ again: /* receive the reply */
   else {
     /* else don't do anything - not all servers replied yet */
   }
-  goto again;
+  return 1;
+}
+
+/* process readable fd condition.
+ * To be usable in edge-triggered environment, the routine
+ * should consume all input so it should loop over.
+ * Note it isn't really necessary to loop here, because
+ * an application may perform the loop just fine by it's own,
+ * but in this case we should return some sensitive result,
+ * to indicate when to stop calling and error conditions.
+ * Note also we may encounter all sorts of recvfrom()
+ * errors which aren't fatal, and at the same time we may
+ * loop forever if an error IS fatal.
+ */
+void dns_ioevent(struct dns_ctx *ctx, time_t now) {
+  SETCTX(ctx);
+  if (!CTXOPEN(ctx))
+    return;
+  dns_assert_ctx(ctx);
+
+  if (!now) now = time(NULL);
+
+  while (dns_ioevent_internal(ctx, now));
+
+  dns_request_utm(ctx, now);
+}
+
+void dns_ioevent_lt(struct dns_ctx *ctx, time_t now) {
+  SETCTX(ctx);
+  if (!CTXOPEN(ctx))
+    return;
+  dns_assert_ctx(ctx);
+
+  if (!now) now = time(NULL);
+
+  dns_ioevent_internal(ctx, now);
 
+  dns_request_utm(ctx, now);
 }
 
 /* handle all timeouts */