How To DTrace poll(2) System Calls

Max Bruning has been teaching Unix Internals, Device Drivers, and various other courses since 1981. This is a new weekly column to answer your SmartOS, DTrace, and MDB questions. You can submit questions to Max by email at MrBruning@joyent.com, ask on twitter at hashtag #BruningQuestions, or by attending one of his courses.

Question: How To DTrace poll(2) System Calls

I received a question from Mohamed A. Khalfella (@khalfella) about a DTrace script he wrote to trace poll(2) system calls. Basically, his script was not working, and he wanted to know why not.

Here is his script:

syscall::pollsys:entry/execname == "xxx" && pid == 11528/{        self->pfds_user = arg0;        self->fds_cnt = arg1;        self->pfds= (struct pollfd*) copyin(self->pfds_user, self->fds_cnt* sizeof(structpollfds));        self->fds_print = arg1;        self->in_pollsys = 1;        printf("pollsys thread = %d arg1 = %d\n", tid, self->fds_cnt);}syscall::pollsys:entry/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}/* return probes */syscall::pollsys:return/self->in_pollsys/{        printf ("RETURN Pollsys thread = %d returns %d:%d\n", tid, arg0, arg1);        self->pfds= (struct pollfd*) copyin(self->pfds_user, self->fds_cnt*sizeof(structpollfds));        self->fds_print = self->fds_cnt;        self->in_pollsys = 0;}syscall::pollsys:return/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{        self->fd=self->fds_cnt - self->fds_print;        self->pr_fd = (struct pollfd*) &self->pfds[self->fd];        printf ("thread = %d FD[%d]  fd=%d events=%x revents=%x\n",                tid,                self->fd,                self->pfds[self->fd].fd,                self->pfds[self->fd].events,                self->pfds[self->fd].revents);        self->fds_print--;}

So, the script is tracing a specific process (name is "xxx" and pid is 11528). The poll(2 system call calls pollsys in the kernel. For those not familiar with poll(2), it is basically equivalent to select(2), but arguments are different. The select(3c) call actually calls pollsys in the kernel. An advantage to poll(2) is that it can handle an arbitrary number of descriptors. select is limited to FD_SETSIZE descriptors (see /usr/include/sys/select.h for details).

Here is some example (pseudo-) code using poll(2):

struct pollfd pfd[...];   /* one pollfd per file descriptor and/or socket being polled */int npfds;                    /* the number of pollfd entries */pfd[0].fd = open(file, mode);   /* or socket(...) */pfd[0].events = POLLIN;           /* see poll(2) man page, may also poll for other events */pfd[1].fd = open(file1, mode);  /* again, may be a socket call instead of open */pfd[1].events = POLLIN;...  /* repeat for each file/socket you are interested in */nevents = poll(pfd, npfds, timeout);   /* npfds is the number of entries in pfd.  See the man page for timeout. */if (nevents > 0) {        for (i = 0; i < npfds; i++) {            if (pfd[i].revents & POLLIN)                handle input event on pfd[i].fd;        ...    }}

Mohamed wants to list the pollfd structures in the array passed into the system call, and list them again when the system call returns (that way, he can see which file descriptors/sockets had events set in revents). The number of pollfd structures passed in as an array to the system call (npfds) may vary. There is no way to do a "for" loop in DTrace, so the script uses a predicate. It sets a counter for the number of pollfd structures passed to poll(2), and has multiple clauses for entry and return that use a predicate to determine when the script is finished enumerating the array. Note that there is a separate clausefor each pollfd structure to be printed. Since there are only 7 clauses that print the structure, this script will print up to the first seven structures passed to poll(2). If he expects more than seven, he would have to have the same clause that many more times. But for the app he is tracing, 7 is more than enough.

Now let's take a look at the output from the script:

pollsys thread = 1 arg1 = 3thread = 1 FD[0]  fd=2 events=0 revents=1thread = 1 FD[1]  fd=0 events=0 revents=0thread = 1 FD[2]  fd=2 events=0 revents=1RETURN Pollsys thread = 1 returns 1:1thread = 1 FD[0]  fd=10 events=0 revents=1thread = 1 FD[1]  fd=0 events=0 revents=0thread = 1 FD[2]  fd=10 events=0 revents=1pollsys thread = 1 arg1 = 3thread = 1 FD[0]  fd=2 events=0 revents=1thread = 1 FD[1]  fd=0 events=0 revents=0thread = 1 FD[2]  fd=2 events=0 revents=1RETURN Pollsys thread = 1 returns 1:1thread = 1 FD[0]  fd=10 events=0 revents=1thread = 1 FD[1]  fd=0 events=0 revents=0thread = 1 FD[2]  fd=10 events=0 revents=1

According to the above output, poll(2) is being called with 3 pollfd structures every time. But 2 of the file descriptors are being used twice. Also, revents is 1 in 2 of the returned structures. This implies that the return value (1:1 in the output) is not correct. When I tried the script on my machine, I also got strange output:

pollsys thread = 1 arg1 = 5thread = 1 FD[0]  fd=2 events=3f10 revents=2thread = 1 FD[1]  fd=1532892142 events=1 revents=0thread = 1 FD[2]  fd=0 events=0 revents=0thread = 1 FD[3]  fd=2 events=3f10 revents=2thread = 1 FD[4]  fd=-1649627104 events=ff01 revents=ffffRETURN pid: 2548 Pollsys thread = 1 returns 0:0thread = 1 FD[0]  fd=10 events=3f10 revents=2thread = 1 FD[1]  fd=1532993475 events=1 revents=0thread = 1 FD[2]  fd=0 events=0 revents=0thread = 1 FD[3]  fd=10 events=3f10 revents=2thread = 1 FD[4]  fd=11 events=3f10 revents=2

This simply looks wrong. A file descriptor of 1532892142??? And another file descriptor that is anegative number. Also, the return value from the system call is 0, but revents is set in 3 of the 5pollfd structures.

So, what output should I expect?

I don't have the "xxx" program on my machine, so first I'll use DTrace to see what processes use poll(2).

# dtrace -n 'syscall::pollsys:entry{@[execname] = count();} tick-30s{exit(0);}'dtrace: description 'syscall::pollsys:entry' matched 2 probesCPU     ID                    FUNCTION:NAME  0  75153                        :tick-30s  fmd                                                                  3  sendmail                                                          6  sshd                                                                 8  mdnsd                                                            25  bridged                                                           30#

Let's take a look at sshd. First, we'll pick a specific sshd. I have two ssh sessions opened on the machine. Here is one of them:

# ptree $$2594  /usr/lib/ssh/sshd  5897  /usr/lib/ssh/sshd    5898  /usr/lib/ssh/sshd      5901  -bash        7666  ptree 5901#

Let's take a look at the pollfd structure passed into the poll system call.

# pargs 58985898:   /usr/lib/ssh/sshdargv[0]: /usr/lib/ssh/sshd### mdb /usr/lib/ssh/sshd> 0t5898:A   <-- attach to the process, this stops the process.  The "0t" is because the numberis decimal.Loading modules: [ ld.so.1 libc.so.1 libproc.so.1 ]>> __pollsys::bp   <-- set a breakpoint at the user level library code that calls poll(2)>> :c   <-- continue the process

Now, in the window where pid 5898 is running bash, I'll type a carriage return, and back in the window where I'm running mdb, I see:

mdb: stop at libc_hwcap1.so.1`__pollsysmdb: target stopped at:libc_hwcap1.so.1`__pollsys:     call   +0x0     >> $c   <-- let's get a stack tracelibc_hwcap1.so.1`__pollsys(8047380, 5, 0, 0, 4, 80b0040)libc_hwcap1.so.1`pselect+0x1bf(a, 80d5530, 80b8f50, fef79f00, 0, 0)libc_hwcap1.so.1`select+0x8e(a, 80d5530, 80b8f50, 0, 0, c7)wait_until_can_do_something+0x233(80474f0, 0, 8047518, 806fafb, 4, 9)server_loop2+0x190(80d4ba8, 80bd560, 8047548, 808506c)do_authenticated2+0xc(80d4ba8, 80d4ba8, 8047e18, 806282c, 80d4ba8, 31)do_authenticated+0x31(80d4ba8, 31)main+0x14c4(809035a, feffb0a4, 8047e48, 805fbcf, 1, 8047e54)_start+0x83(1, 8047ef4, 0, 8047f06, 8047f20, 8047f31)>

The pollfd array is at virtual address 0x8047380 in the process. There are 5 structures in the array (the second argument to __pollsys()). In passing, you can see that sshd uses select(3C), not poll(2).

> 8047380,5::print -t struct pollfdstruct pollfd {    int fd = 0x4    short events = 0x40    short revents = 0x80b}struct pollfd {    int fd = 0x5    short events = 0x40    short revents = 0xfef7}struct pollfd {    int fd = 0x7    short events = 0x40    short revents = 0x80d}struct pollfd {    int fd = 0x8    short events = 0x4    short revents = 0xfee9}struct pollfd {    int fd = 0x9    short events = 0x40    short revents = 0xfed9}>

Now we'll set a breakpoint at the place where __pollsys() returns. At this point, therevents field should be set. The return value (i.e., number of events that occurred) should be in the %eax register. We'll check that, and look at the returned pollfd structures. Why does the revents field have strange values? I have a good idea why this occurs, but it is left to the reader as an exercise.

> pselect+1bf::bp  <-- from the stack trace above> :c  <-- continuemdb: stop at libc_hwcap1.so.1`pselect+0x1bfmdb: target stopped at:libc_hwcap1.so.1`pselect+0x1bf: movl   %eax,%esi>> > 8047380,5::print -t struct pollfdstruct pollfd {    int fd = 0x4    short events = 0x40    short revents = 0}struct pollfd {    int fd = 0x5    short events = 0x40    short revents = 0}struct pollfd {    int fd = 0x7    short events = 0x40    short revents = 0}struct pollfd {    int fd = 0x8    short events = 0x4    short revents = 0x4  <-- here is the one event}struct pollfd {    int fd = 0x9    short events = 0x40    short revents = 0}> $q  <-- and exit mdb#

So, now we know what we should expect to see from the DTrace script.

With illumos, a short DTrace script that does what is wanted is:

#!/usr/sbin/dtrace -qssyscall::pollsys:entry/pid==5898/{        printf("\npolling %d file descriptors/sockets\n", arg1);        self->pfds = (struct pollfd *) copyin(arg0, sizeof(struct pollfd)*arg1);        self->savedpfds = (struct pollfd *)arg0;    self->pfdcnt = arg1;    print(self->pfds[0]);    print(self->pfds[1]);    print(self->pfds[2]);    print(self->pfds[3]);    print(self->pfds[4]);}syscall::pollsys:return/self->savedpfds/{        printf("\npoll returning %d events\n", arg1);        self->pfds = (struct pollfd *) copyin((uintptr_t)self->savedpfds, sizeof(structpollfd)*self->pfdcnt);    print(self->pfds[0]);    print(self->pfds[1]);    print(self->pfds[2]);    print(self->pfds[3]);    print(self->pfds[4]);}

And the output:

# ./shortpoll.dpolling 5 file descriptors/socketsstruct pollfd {    int fd = 0x4    short events = 0x40    short revents = 0x80b}struct pollfd {    int fd = 0x5    short events = 0x40    short revents = 0xfef7}struct pollfd {    int fd = 0x7    short events = 0x40    short revents = 0x80d}struct pollfd {    int fd = 0x8    short events = 0x4    short revents = 0xfee9}struct pollfd {    int fd = 0x9    short events = 0x40    short revents = 0xfed9}poll returning 1 events  <-- poll returns 1struct pollfd {    int fd = 0x4    short events = 0x40    short revents = 0}struct pollfd {    int fd = 0x5    short events = 0x40    short revents = 0}struct pollfd {    int fd = 0x7    short events = 0x40    short revents = 0}struct pollfd {    int fd = 0x8    short events = 0x4    short revents = 0x4  <-- here is the 1 event}struct pollfd {    int fd = 0x9    short events = 0x40    short revents = 0}

A problem with this script is that if there is less than 5 descriptors being polled, it prints garbage for the extra ones, and, of course, it won't print more than 5.

But Mohamed is running on Solaris 11, which does not have the print() action I am using in the above script. Here is a script that works similarily to Mohamed's, but gives the correct data.

#!/usr/sbin/dtrace -qssyscall::pollsys:entry/arg0 && arg1/{    self->in_pollsys = 1;        self->pfds_user = arg0;        self->fds_cnt = (int)arg1;        this->pfds= (struct pollfd*) copyin(self->pfds_user, sizeof(struct pollfd));        printf("ENTRY pid: %d Pollsys thread = %d npfds = %d\n",          pid, tid, arg1);        printf("%s:%s fd = %d, events = %x, revents = %x\n",          probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print = (int)arg1 - 1;}syscall::pollsys:entry/self->fds_print/{        self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:entry/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:return/self->in_pollsys/{        printf ("RETURN pid: %d Pollsys thread = %d returns %d:%d\n", pid, tid, arg0, arg1);        this->pfds= (struct pollfd*) copyin(self->pfds_user, sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print = (int)self->fds_cnt - 1;        self->in_pollsys = 0;}syscall::pollsys:return/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}syscall::pollsys:return/self->fds_print/{    self->next = self->fds_cnt-self->fds_print;    this->pfds= (struct pollfd*) copyin((uintptr_t)((char *)self->pfds_user +      self->next * sizeof(struct pollfd)),      sizeof(struct pollfd));    printf("%s:%s fd = %d, events = %x, revents = %x\n",      probefunc, probename,      this->pfds->fd, this->pfds->events, this->pfds->revents);        self->fds_print--;}

Here is output:

# ./poll.dENTRY pid: 2475 Pollsys thread = 1 npfds = 3pollsys:entry fd = 7, events = 1, revents = 0pollsys:entry fd = 5, events = 1, revents = 0pollsys:entry fd = 10, events = 1, revents = 0ENTRY pid: 2548 Pollsys thread = 1 npfds = 5pollsys:entry fd = 3, events = 40, revents = 0pollsys:entry fd = 4, events = 40, revents = 0pollsys:entry fd = 5, events = 40, revents = 0pollsys:entry fd = 6, events = 40, revents = 0pollsys:entry fd = 7, events = 40, revents = 0ENTRY pid: 6218 Pollsys thread = 1 npfds = 4pollsys:entry fd = 4, events = 44, revents = 0pollsys:entry fd = 5, events = 40, revents = 807pollsys:entry fd = 7, events = 40, revents = 804pollsys:entry fd = 9, events = 40, revents = fedaRETURN pid: 6218 Pollsys thread = 1 returns 1:1pollsys:return fd = 4, events = 44, revents = 4pollsys:return fd = 5, events = 40, revents = 0pollsys:return fd = 7, events = 40, revents = 0pollsys:return fd = 9, events = 40, revents = 0...

The only difference between this script and Mohamed's script is that the copyin() action is called for each pollfd structure in each clause, instead of doing the copyin at the beginning of the entry and return probes. Mohamed's script looks like it should have worked. However, copyin() data goes into scratch space which is not saved across clauses.

I'll take this time to wish everyone a happy holiday. See you next week!



Post written by Mr. Max Bruning