cdp_avp: added README file
[sip-router] / io_wait.h
1 /*
2  * $Id$
3  *
4  * Copyright (C) 2005 iptelorg GmbH
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 /*
19  * tcp io wait common stuff used by tcp_main.c & tcp_read.c
20  * All the functions are inline because of speed reasons and because they are
21  * used only from 2 places.
22  * You also have to define:
23  *     int handle_io(struct fd_map* fm, short events, int idx) (see below)
24  *     (this could be trivially replaced by a callback pointer entry attached
 *      to the io_wait handler if more flexibility rather than performance
26  *      is needed)
 *      fd_type - define to some enum of your choice and define also
28  *                FD_TYPE_DEFINED (if you don't do it fd_type will be defined
29  *                to int). 0 has a special not set/not init. meaning
30  *                (a lot of sanity checks and the sigio_rt code are based on
31  *                 this assumption)
32  *     local_malloc (defaults to pkg_malloc)
33  *     local_free   (defaults to pkg_free)
34  *
35  */
36 /*
37  * History:
38  * --------
39  *  2005-06-13  created by andrei
40  *  2005-06-26  added kqueue (andrei)
41  *  2005-07-01  added /dev/poll (andrei)
 *  2006-05-30  sigio 64 bit workaround enabled for kernels < 2.6.5 (andrei)
43  *  2007-11-22  when handle_io() is called in a loop check & stop if the fd was
44  *               removed inside handle_io() (andrei)
45  *  2007-11-29  support for write (POLLOUT); added io_watch_chg() (andrei)
46  *  2008-02-04  POLLRDHUP & EPOLLRDHUP support (automatically enabled if POLLIN
47  *               is set) (andrei)
48  *  2010-06-17  re-enabled & enhanced the EV_ERROR for kqueue (andrei)
49  */
50
51
52
53 #ifndef _io_wait_h
54 #define _io_wait_h
55
56 #include <errno.h>
57 #include <string.h>
58 #ifdef HAVE_SIGIO_RT
59 #define __USE_GNU /* or else F_SETSIG won't be included */
60 #include <sys/types.h> /* recv */
61 #include <sys/socket.h> /* recv */
62 #include <signal.h> /* sigprocmask, sigwait a.s.o */
63 #endif
64
65 #define _GNU_SOURCE  /* for POLLRDHUP on linux */
66 #include <sys/poll.h>
67 #include <fcntl.h>
68
69 #ifdef HAVE_EPOLL
70 #include <sys/epoll.h>
71 #endif
72 #ifdef HAVE_KQUEUE
73 #include <sys/types.h> /* needed on freebsd */
74 #include <sys/event.h>
75 #include <sys/time.h>
76 #endif
77 #ifdef HAVE_DEVPOLL
78 #include <sys/devpoll.h>
79 #endif
80 #ifdef HAVE_SELECT
81 /* needed on openbsd for select*/
82 #include <sys/time.h>
83 #include <sys/types.h>
84 #include <unistd.h>
85 /* needed according to POSIX for select*/
86 #include <sys/select.h>
87 #endif
88
89 #include "dprint.h"
90
91 #include "poll_types.h" /* poll_types*/
92 #ifdef HAVE_SIGIO_RT
93 #include "pt.h" /* mypid() */
94 #endif
95
96 #include "compiler_opt.h"
97
98
99 #ifdef HAVE_EPOLL
100 /* fix defines for EPOLL */
101 #if defined POLLRDHUP && ! defined EPOLLRDHUP
102 #define EPOLLRDHUP POLLRDHUP  /* should work on all linuxes */
103 #endif /* POLLRDHUP && EPOLLRDHUP */
104 #endif /* HAVE_EPOLL */
105
106
extern int _os_ver; /* os version number, needed to select bug workarounds */
108
109
#if 0
enum fd_types; /* this should be defined from the including file,
				  see tcp_main.c for an example,
				  0 has a special meaning: not used/empty*/
#endif

/* fallback: if the including file did not supply its own fd type enum
 * (signalled by defining FD_TYPE_DEFINED), fall back to a plain int */
#ifndef FD_TYPE_DEFINED
typedef int fd_type;
#define FD_TYPE_DEFINED
#endif
120
/* maps a fd to some other structure; used in almost all cases
 * except epoll and maybe kqueue or /dev/poll */
struct fd_map{
	int fd;               /* fd no (-1 when the entry is unhashed) */
	fd_type type;         /* "data" type; 0 means free/not used (see the
	                         fd_types note above and unhash_fd_map) */
	void* data;           /* pointer to the corresponding structure */
	short events;         /* events we are interested in (POLLIN/POLLOUT) */
};
129
130
#ifdef HAVE_KQUEUE
/* default size of the kevent changes array (overridable at build time) */
#ifndef KQ_CHANGES_ARRAY_SIZE
#define KQ_CHANGES_ARRAY_SIZE 256
#endif

/* KEV_UDATA_CAST must be defined unconditionally: kq_ev_change() always
 * uses it in EV_SET(). In the previous nesting it lived inside the
 * "#ifndef KQ_CHANGES_ARRAY_SIZE" block, so pre-defining
 * KQ_CHANGES_ARRAY_SIZE left KEV_UDATA_CAST undefined => compile error.
 * On netbsd the kevent udata field is an integer type, hence the cast. */
#ifdef __OS_netbsd
#define KEV_UDATA_CAST (intptr_t)
#else
#define KEV_UDATA_CAST
#endif

#endif
143
144
/* handler structure: holds all the per-instance io_wait state; the
 * poll-method-specific members coexist so the same handler code can
 * drive poll/select/sigio_rt/epoll/kqueue//dev/poll (selected at runtime
 * via poll_method) */
struct io_wait_handler{
	enum poll_types poll_method; /* the poll method in use */
	int flags;
	struct fd_map* fd_hash; /* fd -> fd_map table, indexed directly by fd */
	int fd_no; /*  current index used in fd_array and the passed size for
				   ep_array (for kq_array at least
				    max(twice the size, kq_changes_size) should
				   be passed). */
	int max_fd_no; /* maximum fd no, is also the size of fd_array,
					   fd_hash  and ep_array*/
	/* common stuff for POLL, SIGIO_RT and SELECT
	 * since poll support is always compiled => this will always be compiled */
	struct pollfd* fd_array; /* used also by devpoll as devpoll array */
	int crt_fd_array_idx; /*  crt idx for which handle_io is called
							 (updated also by del -> internal optimization) */
	/* end of common stuff */
#ifdef HAVE_EPOLL
	int epfd; /* epoll ctrl fd */
	struct epoll_event* ep_array; /* epoll_wait() eventlist */
#endif
#ifdef HAVE_SIGIO_RT
	sigset_t sset; /* signal mask for sigio & sigrtmin */
	int signo;     /* real time signal used */
#endif
#ifdef HAVE_KQUEUE
	int kq_fd; /* kqueue fd */
	struct kevent* kq_array;   /* used for the eventlist*/
	struct kevent* kq_changes; /* used for the changelist */
	size_t kq_nchanges; /* no. of changes queued but not yet flushed */
	size_t kq_array_size;   /* array size */
	size_t kq_changes_size; /* size of the changes array */
#endif
#ifdef HAVE_DEVPOLL
	int dpoll_fd; /* /dev/poll device fd */
#endif
#ifdef HAVE_SELECT
	fd_set master_rset; /* read set */
	fd_set master_wset; /* write set */
	int max_fd_select; /* maximum select used fd */
#endif
};

typedef struct io_wait_handler io_wait_h;
189
190
/* get the corresponding fd_map structure pointer; the "hash" is a
 * direct-indexed array, so the lookup is O(1) and never fails for
 * fd in [0, max_fd_no) */
#define get_fd_map(h, fd)		(&(h)->fd_hash[(fd)])
/* remove a fd_map structure from the hash; the pointer must be returned
 * by get_fd_map or hash_fd_map; marks the slot free by resetting type
 * to 0 (the "not used" value) and fd to -1 */
#define unhash_fd_map(pfm)	\
	do{ \
		(pfm)->type=0 /*F_NONE */; \
		(pfm)->fd=-1; \
	}while(0)
200
201 /* add a fd_map structure to the fd hash */
202 static inline struct fd_map* hash_fd_map(       io_wait_h* h,
203                                                                                         int fd,
204                                                                                         short events,
205                                                                                         fd_type type,
206                                                                                         void* data)
207 {
208         h->fd_hash[fd].fd=fd;
209         h->fd_hash[fd].events=events;
210         h->fd_hash[fd].type=type;
211         h->fd_hash[fd].data=data;
212         return &h->fd_hash[fd];
213 }
214
215
216
#ifdef HANDLE_IO_INLINE
/* generic handle io routine, this must be defined in the including file
 * (faster than registering a callback pointer)
 *
 * params:  fm     - pointer to a fd hash entry
 *          events - combinations of POLLIN, POLLOUT, POLLERR & POLLHUP
 *          idx    - index in the fd_array (or -1 if not known)
 * return: -1 on error
 *          0 on EAGAIN or when by some other way it is known that no more
 *            io events are queued on the fd (the receive buffer is empty).
 *            Useful to detect when there are no more io events queued for
 *            sigio_rt, epoll_et, kqueue.
 *         >0 on successful read from the fd (when there might be more io
 *            queued -- the receive buffer might still be non-empty)
 */
inline static int handle_io(struct fd_map* fm, short events, int idx);
#else
int handle_io(struct fd_map* fm, short events, int idx);
#endif
236
237
238
#ifdef HAVE_KQUEUE
/*
 * kqueue specific function: register a change
 * (adds a change to the kevent change array, and if full flushes it first
 *  with a non-blocking kevent() call; on a flush error it retries the
 *  queued changes one-by-one so a single bad fd cannot drop the rest)
 *
 * TODO: check if the event already exists in the change list or if it's
 *       complementary to an event in the list (e.g. EVFILT_WRITE, EV_DELETE
 *       and EVFILT_WRITE, EV_ADD for the same fd).
 * returns: -1 on error, 0 on success
 */
static inline int kq_ev_change(io_wait_h* h, int fd, int filter, int flag,
								void* data)
{
	int n;
	size_t r; /* index into kq_changes; size_t to match kq_nchanges
	             (was int => signed/unsigned comparison in the loop) */
	struct timespec tspec;

	if (h->kq_nchanges>=h->kq_changes_size){
		/* changes array full ! */
		LOG(L_WARN, "WARNING: kq_ev_change: kqueue changes array full"
					" trying to flush...\n");
		/* zero timeout => the flushing kevent() calls never block */
		tspec.tv_sec=0;
		tspec.tv_nsec=0;
again:
		n=kevent(h->kq_fd, h->kq_changes, h->kq_nchanges, 0, 0, &tspec);
		if (unlikely(n == -1)){
			if (unlikely(errno == EINTR)) goto again;
			else {
				/* for a detailed explanation of what follows see
				   io_wait_loop_kqueue EV_ERROR case */
				if (unlikely(!(errno == EBADF || errno == ENOENT))){
					BUG("kq_ev_change: kevent flush changes failed"
							" (unexpected error): %s [%d]\n",
							strerror(errno), errno);
				}
				/* ignore error even if it's not a EBADF/ENOENT */
				/* one of the file descriptors is bad, probably already
				   closed => try to apply changes one-by-one */
				for (r = 0; r < h->kq_nchanges; r++) {
retry2:
					n = kevent(h->kq_fd, &h->kq_changes[r], 1, 0, 0, &tspec);
					if (n==-1) {
						if (unlikely(errno == EINTR))
							goto retry2;
						/* for a detailed explanation of what follows see
						   io_wait_loop_kqueue EV_ERROR case */
						if (unlikely(!(errno == EBADF || errno == ENOENT))){
							BUG("kq_ev_change: kevent flush changes failed:"
									" (unexpected error) %s [%d] (%lu/%lu)\n",
										strerror(errno), errno,
										(unsigned long)r,
										(unsigned long)h->kq_nchanges);
						}
						continue; /* skip over it */
					}
				}
			}
		}
		h->kq_nchanges=0; /* changes array is empty */
	}
	EV_SET(&h->kq_changes[h->kq_nchanges], fd, filter, flag, 0, 0,
			KEV_UDATA_CAST data);
	h->kq_nchanges++;
	return 0;
}
#endif
302
303
304
/* generic io_watch_add function
 * Params:
 *     h      - pointer to initialized io_wait handle
 *     fd     - fd to watch
 *     events - bitmap with the fd events for which the fd should be watched
 *              (combination of POLLIN and POLLOUT)
 *     type   - fd type (non 0 value, returned in the call to handle_io)
 *     data   - pointer/private data returned in the handle_io call
 * returns 0 on success, -1 on error
 *
 * WARNING: handle_io() can be called immediately (from io_watch_add()) so
 *  make sure that any dependent init. (e.g. data stuff) is made before
 *  calling io_watch_add
 *
 * this version should be faster than pointers to poll_method specific
 * functions (it avoids functions calls, the overhead being only an extra
 *  switch())*/
inline static int io_watch_add(	io_wait_h* h,
								int fd,
								short events,
								fd_type type,
								void* data)
{

	/* helper macros */
	/* append fd at the end of fd_array (poll/select/sigio_rt bookkeeping) */
#define fd_array_setup(ev) \
	do{ \
		h->fd_array[h->fd_no].fd=fd; \
		h->fd_array[h->fd_no].events=(ev); /* useless for select */ \
		h->fd_array[h->fd_no].revents=0;     /* useless for select */ \
	}while(0)
	
	/* OR the given flag(s) into the fd's fcntl() status flags;
	 * jumps to error on failure (uses the local `flags` variable) */
#define set_fd_flags(f) \
	do{ \
			flags=fcntl(fd, F_GETFL); \
			if (flags==-1){ \
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: GETFL failed:" \
						" %s [%d]\n", strerror(errno), errno); \
				goto error; \
			} \
			if (fcntl(fd, F_SETFL, flags|(f))==-1){ \
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETFL" \
							" failed: %s [%d]\n", strerror(errno), errno); \
				goto error; \
			} \
	}while(0)
	
	
	struct fd_map* e;
	int flags;
#ifdef HAVE_EPOLL
	struct epoll_event ep_event;
#endif
#ifdef HAVE_DEVPOLL
	struct pollfd pfd;
#endif
#if defined(HAVE_SIGIO_RT) || defined (HAVE_EPOLL)
	int n;
#endif
#if defined(HAVE_SIGIO_RT)
	int idx;
	int check_io;
	struct pollfd pf;
	
	check_io=0; /* set to 1 if we need to check for pre-existing queued
				   io/data on the fd */
	idx=-1;
#endif
	e=0;
	/* sanity checks */
	if (unlikely(fd==-1)){
		LOG(L_CRIT, "BUG: io_watch_add: fd is -1!\n");
		goto error;
	}
	if (unlikely((events&(POLLIN|POLLOUT))==0)){
		LOG(L_CRIT, "BUG: io_watch_add: invalid events: 0x%0x\n", events);
		goto error;
	}
	/* check if not too big */
	if (unlikely(h->fd_no>=h->max_fd_no)){
		LOG(L_CRIT, "ERROR: io_watch_add: maximum fd number exceeded:"
				" %d/%d\n", h->fd_no, h->max_fd_no);
		goto error;
	}
	DBG("DBG: io_watch_add(%p, %d, %d, %p), fd_no=%d\n",
			h, fd, type, data, h->fd_no);
	/*  hash sanity check: refuse to overwrite a live entry (type!=0) */
	e=get_fd_map(h, fd);
	if (unlikely(e && (e->type!=0 /*F_NONE*/))){
		LOG(L_ERR, "ERROR: io_watch_add: trying to overwrite entry %d"
				" watched for %x in the hash(%d, %d, %p) with (%d, %d, %p)\n",
				fd, events, e->fd, e->type, e->data, fd, type, data);
		e=0; /* don't unhash the existing entry on the error path */
		goto error;
	}
	
	if (unlikely((e=hash_fd_map(h, fd, events, type, data))==0)){
		LOG(L_ERR, "ERROR: io_watch_add: failed to hash the fd %d\n", fd);
		goto error;
	}
	switch(h->poll_method){ /* faster than pointers to functions */
		case POLL_POLL:
#ifdef POLLRDHUP
			/* listen to POLLRDHUP by default (if POLLIN);
			 * ((int)!(events & POLLIN) - 1) is an all-ones mask when
			 * POLLIN is set and 0 otherwise (branchless conditional) */
			events|=((int)!(events & POLLIN) - 1) & POLLRDHUP;
#endif /* POLLRDHUP */
			fd_array_setup(events);
			set_fd_flags(O_NONBLOCK);
			break;
#ifdef HAVE_SELECT
		case POLL_SELECT:
			fd_array_setup(events);
			if (likely(events & POLLIN))
				FD_SET(fd, &h->master_rset);
			if (unlikely(events & POLLOUT))
				FD_SET(fd, &h->master_wset);
			/* keep track of the highest watched fd (select() needs it) */
			if (h->max_fd_select<fd) h->max_fd_select=fd;
			break;
#endif
#ifdef HAVE_SIGIO_RT
		case POLL_SIGIO_RT:
			fd_array_setup(events);
			/* re-set O_ASYNC might be needed, if not done from
			 * io_watch_del (or if somebody wants to add a fd which has
			 * already O_ASYNC/F_SETSIG set on a duplicate)
			 */
			/* set async & signal */
			if (fcntl(fd, F_SETOWN, my_pid())==-1){
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETOWN"
				" failed: %s [%d]\n", strerror(errno), errno);
				goto error;
			}
			if (fcntl(fd, F_SETSIG, h->signo)==-1){
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETSIG"
					" failed: %s [%d]\n", strerror(errno), errno);
				goto error;
			}
			/* set both non-blocking and async */
			set_fd_flags(O_ASYNC| O_NONBLOCK);
#ifdef EXTRA_DEBUG
			DBG("io_watch_add: sigio_rt on f %d, signal %d to pid %d\n",
					fd,  h->signo, my_pid());
#endif
			/* empty socket receive buffer, if buffer is already full
			 * no more space to put packets
			 * => no more signals are ever generated
			 * also when moving fds, the freshly moved fd might have
			 *  already some bytes queued, we want to get them now
			 *  and not later -- andrei */
			idx=h->fd_no;
			check_io=1;
			break;
#endif
#ifdef HAVE_EPOLL
		case POLL_EPOLL_LT:
			/* same branchless mask trick as above: each (mask & X)
			 * term contributes X only if the matching POLL* bit is set */
			ep_event.events=
#ifdef POLLRDHUP
						/* listen for EPOLLRDHUP too */
						((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
#else /* POLLRDHUP */
						(EPOLLIN & ((int)!(events & POLLIN)-1) ) |
#endif /* POLLRDHUP */
						(EPOLLOUT & ((int)!(events & POLLOUT)-1) );
			ep_event.data.ptr=e;
again1:
			n=epoll_ctl(h->epfd, EPOLL_CTL_ADD, fd, &ep_event);
			if (unlikely(n==-1)){
				if (errno==EAGAIN) goto again1;
				LOG(L_ERR, "ERROR: io_watch_add: epoll_ctl failed: %s [%d]\n",
					strerror(errno), errno);
				goto error;
			}
			break;
		case POLL_EPOLL_ET:
			/* edge-triggered mode requires a non-blocking fd */
			set_fd_flags(O_NONBLOCK);
			ep_event.events=
#ifdef POLLRDHUP
						/* listen for EPOLLRDHUP too */
						((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
#else /* POLLRDHUP */
						(EPOLLIN & ((int)!(events & POLLIN)-1) ) |
#endif /* POLLRDHUP */
						(EPOLLOUT & ((int)!(events & POLLOUT)-1) ) |
						EPOLLET;
			ep_event.data.ptr=e;
again2:
			n=epoll_ctl(h->epfd, EPOLL_CTL_ADD, fd, &ep_event);
			if (unlikely(n==-1)){
				if (errno==EAGAIN) goto again2;
				LOG(L_ERR, "ERROR: io_watch_add: epoll_ctl failed: %s [%d]\n",
					strerror(errno), errno);
				goto error;
			}
			break;
#endif
#ifdef HAVE_KQUEUE
		case POLL_KQUEUE:
			if (likely( events & POLLIN)){
				if (unlikely(kq_ev_change(h, fd, EVFILT_READ, EV_ADD, e)==-1))
					goto error;
			}
			if (unlikely( events & POLLOUT)){
				if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE, EV_ADD, e)==-1))
				{
					/* roll back the read filter queued above */
					if (likely(events & POLLIN)){
						kq_ev_change(h, fd, EVFILT_READ, EV_DELETE, 0);
					}
					goto error;
				}
			}
			break;
#endif
#ifdef HAVE_DEVPOLL
		case POLL_DEVPOLL:
			/* /dev/poll is driven by writing pollfd records to it */
			pfd.fd=fd;
			pfd.events=events;
			pfd.revents=0;
again_devpoll:
			if (write(h->dpoll_fd, &pfd, sizeof(pfd))==-1){
				if (errno==EAGAIN) goto again_devpoll;
				LOG(L_ERR, "ERROR: io_watch_add: /dev/poll write failed:"
							"%s [%d]\n", strerror(errno), errno);
				goto error;
			}
			break;
#endif
			
		default:
			LOG(L_CRIT, "BUG: io_watch_add: no support for poll method "
					" %s (%d)\n", poll_method_str[h->poll_method],
					h->poll_method);
			goto error;
	}
	
	h->fd_no++; /* "activate" changes, for epoll/kqueue/devpoll it
				   has only informative value */
#if defined(HAVE_SIGIO_RT)
	if (check_io){
		/* handle possible pre-existing events: drain anything already
		 * queued on the fd, since sigio only signals *new* io;
		 * stop as soon as the fd is deleted inside handle_io()
		 * (detected via e->type==0) */
		pf.fd=fd;
		pf.events=events;
check_io_again:
		n=0;
		while(e->type && ((n=poll(&pf, 1, 0))>0) &&
				(handle_io(e, pf.revents, idx)>0) &&
				(pf.revents & (e->events|POLLERR|POLLHUP)));
		if (unlikely(e->type && (n==-1))){
			if (errno==EINTR) goto check_io_again;
			LOG(L_ERR, "ERROR: io_watch_add: check_io poll: %s [%d]\n",
						strerror(errno), errno);
		}
	}
#endif
	return 0;
error:
	if (e) unhash_fd_map(e);
	return -1;
#undef fd_array_setup
#undef set_fd_flags
}
565
566
567
568 #define IO_FD_CLOSING 16
569 /* parameters:    h - handler
570  *               fd - file descriptor
571  *            index - index in the fd_array if known, -1 if not
572  *                    (if index==-1 fd_array will be searched for the
573  *                     corresponding fd* entry -- slower but unavoidable in
574  *                     some cases). index is not used (no fd_array) for epoll,
575  *                     /dev/poll and kqueue
576  *            flags - optimization flags, e.g. IO_FD_CLOSING, the fd was
577  *                    or will shortly be closed, in some cases we can avoid
578  *                    extra remove operations (e.g.: epoll, kqueue, sigio)
579  * returns 0 if ok, -1 on error */
580 inline static int io_watch_del(io_wait_h* h, int fd, int idx, int flags)
581 {
582         
583 #define fix_fd_array \
584         do{\
585                         if (unlikely(idx==-1)){ \
586                                 /* fix idx if -1 and needed */ \
587                                 for (idx=0; (idx<h->fd_no) && \
588                                                         (h->fd_array[idx].fd!=fd); idx++); \
589                         } \
590                         if (likely(idx<h->fd_no)){ \
591                                 memmove(&h->fd_array[idx], &h->fd_array[idx+1], \
592                                         (h->fd_no-(idx+1))*sizeof(*(h->fd_array))); \
593                                 if ((idx<=h->crt_fd_array_idx) && (h->crt_fd_array_idx>=0)) \
594                                         h->crt_fd_array_idx--; \
595                         } \
596         }while(0)
597         
598         struct fd_map* e;
599         int events;
600 #ifdef HAVE_EPOLL
601         int n;
602         struct epoll_event ep_event;
603 #endif
604 #ifdef HAVE_DEVPOLL
605         struct pollfd pfd;
606 #endif
607 #ifdef HAVE_SIGIO_RT
608         int fd_flags;
609 #endif
610         
611         if (unlikely((fd<0) || (fd>=h->max_fd_no))){
612                 LOG(L_CRIT, "BUG: io_watch_del: invalid fd %d, not in [0, %d) \n",
613                                                 fd, h->fd_no);
614                 goto error;
615         }
616         DBG("DBG: io_watch_del (%p, %d, %d, 0x%x) fd_no=%d called\n",
617                         h, fd, idx, flags, h->fd_no);
618         e=get_fd_map(h, fd);
619         /* more sanity checks */
620         if (unlikely(e==0)){
621                 LOG(L_CRIT, "BUG: io_watch_del: no corresponding hash entry for %d\n",
622                                         fd);
623                 goto error;
624         }
625         if (unlikely(e->type==0 /*F_NONE*/)){
626                 LOG(L_ERR, "ERROR: io_watch_del: trying to delete already erased"
627                                 " entry %d in the hash(%d, %d, %p) flags %x)\n",
628                                 fd, e->fd, e->type, e->data, flags);
629                 goto error;
630         }
631         events=e->events;
632         
633         switch(h->poll_method){
634                 case POLL_POLL:
635                         fix_fd_array;
636                         break;
637 #ifdef HAVE_SELECT
638                 case POLL_SELECT:
639                         if (likely(events & POLLIN))
640                                 FD_CLR(fd, &h->master_rset);
641                         if (unlikely(events & POLLOUT))
642                                 FD_CLR(fd, &h->master_wset);
643                         if (unlikely(h->max_fd_select && (h->max_fd_select==fd)))
644                                 /* we don't know the prev. max, so we just decrement it */
645                                 h->max_fd_select--;
646                         fix_fd_array;
647                         break;
648 #endif
649 #ifdef HAVE_SIGIO_RT
650                 case POLL_SIGIO_RT:
651                         /* the O_ASYNC flag must be reset all the time, the fd
652                          *  can be changed only if  O_ASYNC is reset (if not and
653                          *  the fd is a duplicate, you will get signals from the dup. fd
654                          *  and not from the original, even if the dup. fd was closed
655                          *  and the signals re-set on the original) -- andrei
656                          */
657                         /*if (!(flags & IO_FD_CLOSING)){*/
658                                 /* reset ASYNC */
659                                 fd_flags=fcntl(fd, F_GETFL);
660                                 if (unlikely(fd_flags==-1)){
661                                         LOG(L_ERR, "ERROR: io_watch_del: fnctl: GETFL failed:"
662                                                         " %s [%d]\n", strerror(errno), errno);
663                                         goto error;
664                                 }
665                                 if (unlikely(fcntl(fd, F_SETFL, fd_flags&(~O_ASYNC))==-1)){
666                                         LOG(L_ERR, "ERROR: io_watch_del: fnctl: SETFL"
667                                                                 " failed: %s [%d]\n", strerror(errno), errno);
668                                         goto error;
669                                 }
670                         fix_fd_array; /* only on success */
671                         break;
672 #endif
673 #ifdef HAVE_EPOLL
674                 case POLL_EPOLL_LT:
675                 case POLL_EPOLL_ET:
676                         /* epoll doesn't seem to automatically remove sockets,
677                          * if the socket is a duplicate/moved and the original
678                          * is still open. The fd is removed from the epoll set
679                          * only when the original (and all the  copies?) is/are
680                          * closed. This is probably a bug in epoll. --andrei */
681 #ifdef EPOLL_NO_CLOSE_BUG
682                         if (!(flags & IO_FD_CLOSING)){
683 #endif
684 again_epoll:
685                                 n=epoll_ctl(h->epfd, EPOLL_CTL_DEL, fd, &ep_event);
686                                 if (unlikely(n==-1)){
687                                         if (errno==EAGAIN) goto again_epoll;
688                                         LOG(L_ERR, "ERROR: io_watch_del: removing fd from epoll "
689                                                         "list failed: %s [%d]\n", strerror(errno), errno);
690                                         goto error;
691                                 }
692 #ifdef EPOLL_NO_CLOSE_BUG
693                         }
694 #endif
695                         break;
696 #endif
697 #ifdef HAVE_KQUEUE
698                 case POLL_KQUEUE:
699                         if (!(flags & IO_FD_CLOSING)){
700                                 if (likely(events & POLLIN)){
701                                         if (unlikely(kq_ev_change(h, fd, EVFILT_READ,
702                                                                                                         EV_DELETE, 0) ==-1)){
703                                                 /* try to delete the write filter anyway */
704                                                 if (events & POLLOUT){
705                                                         kq_ev_change(h, fd, EVFILT_WRITE, EV_DELETE, 0);
706                                                 }
707                                                 goto error;
708                                         }
709                                 }
710                                 if (unlikely(events & POLLOUT)){
711                                         if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE,
712                                                                                                         EV_DELETE, 0) ==-1))
713                                                 goto error;
714                                 }
715                         }
716                         break;
717 #endif
718 #ifdef HAVE_DEVPOLL
719                 case POLL_DEVPOLL:
720                                 /* for /dev/poll the closed fds _must_ be removed
721                                    (they are not removed automatically on close()) */
722                                 pfd.fd=fd;
723                                 pfd.events=POLLREMOVE;
724                                 pfd.revents=0;
725 again_devpoll:
726                                 if (write(h->dpoll_fd, &pfd, sizeof(pfd))==-1){
727                                         if (errno==EINTR) goto again_devpoll;
728                                         LOG(L_ERR, "ERROR: io_watch_del: removing fd from "
729                                                                 "/dev/poll failed: %s [%d]\n",
730                                                                 strerror(errno), errno);
731                                         goto error;
732                                 }
733                                 break;
734 #endif
735                 default:
736                         LOG(L_CRIT, "BUG: io_watch_del: no support for poll method "
737                                         " %s (%d)\n", poll_method_str[h->poll_method],
738                                         h->poll_method);
739                         goto error;
740         }
741         unhash_fd_map(e); /* only on success */
742         h->fd_no--;
743         return 0;
744 error:
745         return -1;
746 #undef fix_fd_array
747 }
748
749
750
751 /* parameters:    h - handler
752  *               fd - file descriptor
753  *           events - new events to watch for
754  *              idx - index in the fd_array if known, -1 if not
755  *                    (if index==-1 fd_array will be searched for the
756  *                     corresponding fd* entry -- slower but unavoidable in
757  *                     some cases). index is not used (no fd_array) for epoll,
758  *                     /dev/poll and kqueue
759  * returns 0 if ok, -1 on error */
760 inline static int io_watch_chg(io_wait_h* h, int fd, short events, int idx )
761 {
762         
763 #define fd_array_chg(ev) \
764         do{\
765                         if (unlikely(idx==-1)){ \
766                                 /* fix idx if -1 and needed */ \
767                                 for (idx=0; (idx<h->fd_no) && \
768                                                         (h->fd_array[idx].fd!=fd); idx++); \
769                         } \
770                         if (likely(idx<h->fd_no)){ \
771                                 h->fd_array[idx].events=(ev); \
772                         } \
773         }while(0)
774         
775         struct fd_map* e;
776         int add_events;
777         int del_events;
778 #ifdef HAVE_DEVPOLL
779         struct pollfd pfd;
780 #endif
781 #ifdef HAVE_EPOLL
782         int n;
783         struct epoll_event ep_event;
784 #endif
785         
786         if (unlikely((fd<0) || (fd>=h->max_fd_no))){
787                 LOG(L_CRIT, "BUG: io_watch_chg: invalid fd %d, not in [0, %d) \n",
788                                                 fd, h->fd_no);
789                 goto error;
790         }
791         if (unlikely((events&(POLLIN|POLLOUT))==0)){
792                 LOG(L_CRIT, "BUG: io_watch_chg: invalid events: 0x%0x\n", events);
793                 goto error;
794         }
795         DBG("DBG: io_watch_chg (%p, %d, 0x%x, 0x%x) fd_no=%d called\n",
796                         h, fd, events, idx, h->fd_no);
797         e=get_fd_map(h, fd);
798         /* more sanity checks */
799         if (unlikely(e==0)){
800                 LOG(L_CRIT, "BUG: io_watch_chg: no corresponding hash entry for %d\n",
801                                         fd);
802                 goto error;
803         }
804         if (unlikely(e->type==0 /*F_NONE*/)){
805                 LOG(L_ERR, "ERROR: io_watch_chg: trying to change an already erased"
806                                 " entry %d in the hash(%d, %d, %p) )\n",
807                                 fd, e->fd, e->type, e->data);
808                 goto error;
809         }
810         
811         add_events=events & ~e->events;
812         del_events=e->events & ~events;
813         switch(h->poll_method){
814                 case POLL_POLL:
815 #ifdef POLLRDHUP
816                         fd_array_chg(events |
817                                                         /* listen to POLLRDHUP by default (if POLLIN) */
818                                                         (((int)!(events & POLLIN) - 1) & POLLRDHUP)
819                                                 );
820 #else /* POLLRDHUP */
821                         fd_array_chg(events);
822 #endif /* POLLRDHUP */
823                         break;
824 #ifdef HAVE_SELECT
825                 case POLL_SELECT:
826                         fd_array_chg(events);
827                         if (unlikely(del_events & POLLIN))
828                                 FD_CLR(fd, &h->master_rset);
829                         else if (unlikely(add_events & POLLIN))
830                                 FD_SET(fd, &h->master_rset);
831                         if (likely(del_events & POLLOUT))
832                                 FD_CLR(fd, &h->master_wset);
833                         else if (likely(add_events & POLLOUT))
834                                 FD_SET(fd, &h->master_wset);
835                         break;
836 #endif
837 #ifdef HAVE_SIGIO_RT
838                 case POLL_SIGIO_RT:
839                         fd_array_chg(events);
840                         /* no need for check_io, since SIGIO_RT listens by default for all
841                          * the events */
842                         break;
843 #endif
844 #ifdef HAVE_EPOLL
845                 case POLL_EPOLL_LT:
846                                 ep_event.events=
847 #ifdef POLLRDHUP
848                                                 /* listen for EPOLLRDHUP too */
849                                                 ((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
850 #else /* POLLRDHUP */
851                                                 (EPOLLIN & ((int)!(events & POLLIN)-1) ) |
852 #endif /* POLLRDHUP */
853                                                 (EPOLLOUT & ((int)!(events & POLLOUT)-1) );
854                                 ep_event.data.ptr=e;
855 again_epoll_lt:
856                                 n=epoll_ctl(h->epfd, EPOLL_CTL_MOD, fd, &ep_event);
857                                 if (unlikely(n==-1)){
858                                         if (errno==EAGAIN) goto again_epoll_lt;
859                                         LOG(L_ERR, "ERROR: io_watch_chg: modifying epoll events"
860                                                         " failed: %s [%d]\n", strerror(errno), errno);
861                                         goto error;
862                                 }
863                         break;
864                 case POLL_EPOLL_ET:
865                                 ep_event.events=
866 #ifdef POLLRDHUP
867                                                 /* listen for EPOLLRDHUP too */
868                                                 ((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
869 #else /* POLLRDHUP */
870                                                 (EPOLLIN & ((int)!(events & POLLIN)-1) ) |
871 #endif /* POLLRDHUP */
872                                                 (EPOLLOUT & ((int)!(events & POLLOUT)-1) ) |
873                                                 EPOLLET;
874                                 ep_event.data.ptr=e;
875 again_epoll_et:
876                                 n=epoll_ctl(h->epfd, EPOLL_CTL_MOD, fd, &ep_event);
877                                 if (unlikely(n==-1)){
878                                         if (errno==EAGAIN) goto again_epoll_et;
879                                         LOG(L_ERR, "ERROR: io_watch_chg: modifying epoll events"
880                                                         " failed: %s [%d]\n", strerror(errno), errno);
881                                         goto error;
882                                 }
883                         break;
884 #endif
885 #ifdef HAVE_KQUEUE
886                 case POLL_KQUEUE:
887                         if (unlikely(del_events & POLLIN)){
888                                 if (unlikely(kq_ev_change(h, fd, EVFILT_READ,
889                                                                                                                 EV_DELETE, 0) ==-1))
890                                                 goto error;
891                         }else if (unlikely(add_events & POLLIN)){
892                                 if (unlikely(kq_ev_change(h, fd, EVFILT_READ, EV_ADD, e) ==-1))
893                                         goto error;
894                         }
895                         if (likely(del_events & POLLOUT)){
896                                 if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE,
897                                                                                                                 EV_DELETE, 0) ==-1))
898                                                 goto error;
899                         }else if (likely(add_events & POLLOUT)){
900                                 if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE, EV_ADD, e)==-1))
901                                         goto error;
902                         }
903                         break;
904 #endif
905 #ifdef HAVE_DEVPOLL
906                 case POLL_DEVPOLL:
907                                 /* for /dev/poll the closed fds _must_ be removed
908                                    (they are not removed automatically on close()) */
909                                 pfd.fd=fd;
910                                 pfd.events=POLLREMOVE;
911                                 pfd.revents=0;
912 again_devpoll1:
913                                 if (unlikely(write(h->dpoll_fd, &pfd, sizeof(pfd))==-1)){
914                                         if (errno==EINTR) goto again_devpoll1;
915                                         LOG(L_ERR, "ERROR: io_watch_chg: removing fd from "
916                                                                 "/dev/poll failed: %s [%d]\n",
917                                                                 strerror(errno), errno);
918                                         goto error;
919                                 }
920 again_devpoll2:
921                                 pfd.events=events;
922                                 pfd.revents=0;
923                                 if (unlikely(write(h->dpoll_fd, &pfd, sizeof(pfd))==-1)){
924                                         if (errno==EINTR) goto again_devpoll2;
925                                         LOG(L_ERR, "ERROR: io_watch_chg: re-adding fd to "
926                                                                 "/dev/poll failed: %s [%d]\n",
927                                                                 strerror(errno), errno);
928                                         /* error re-adding the fd => mark it as removed/unhash */
929                                         unhash_fd_map(e);
930                                         goto error;
931                                 }
932                                 break;
933 #endif
934                 default:
935                         LOG(L_CRIT, "BUG: io_watch_chg: no support for poll method "
936                                         " %s (%d)\n", poll_method_str[h->poll_method],
937                                         h->poll_method);
938                         goto error;
939         }
940         e->events=events; /* only on success */
941         return 0;
942 error:
943         return -1;
944 #undef fix_fd_array
945 }
946
947
948
949 /* io_wait_loop_x style function.
950  * wait for io using poll()
951  * params: h      - io_wait handle
952  *         t      - timeout in s
953  *         repeat - if !=0 handle_io will be called until it returns <=0
954  * returns: number of IO events handled on success (can be 0), -1 on error
955  */
956 inline static int io_wait_loop_poll(io_wait_h* h, int t, int repeat)
957 {
958         int n, r;
959         int ret;
960         struct fd_map* fm;
961         
962 again:
963                 ret=n=poll(h->fd_array, h->fd_no, t*1000);
964                 if (n==-1){
965                         if (errno==EINTR) goto again; /* signal, ignore it */
966                         else{
967                                 LOG(L_ERR, "ERROR:io_wait_loop_poll: poll: %s [%d]\n",
968                                                 strerror(errno), errno);
969                                 goto error;
970                         }
971                 }
972                 for (r=0; (r<h->fd_no) && n; r++){
973                         fm=get_fd_map(h, h->fd_array[r].fd);
974                         if (h->fd_array[r].revents & (fm->events|POLLERR|POLLHUP)){
975                                 n--;
976                                 /* sanity checks */
977                                 if (unlikely((h->fd_array[r].fd >= h->max_fd_no)||
978                                                                 (h->fd_array[r].fd < 0))){
979                                         LOG(L_CRIT, "BUG: io_wait_loop_poll: bad fd %d "
980                                                         "(no in the 0 - %d range)\n",
981                                                         h->fd_array[r].fd, h->max_fd_no);
982                                         /* try to continue anyway */
983                                         h->fd_array[r].events=0; /* clear the events */
984                                         continue;
985                                 }
986                                 h->crt_fd_array_idx=r;
987                                 /* repeat handle_io if repeat, fd still watched (not deleted
988                                  *  inside handle_io), handle_io returns that there's still
989                                  *  IO and the fd is still watched for the triggering event */
990                                 while(fm->type &&
991                                                 (handle_io(fm, h->fd_array[r].revents, r) > 0) &&
992                                                 repeat && ((fm->events|POLLERR|POLLHUP) &
993                                                                                                         h->fd_array[r].revents));
994                                 r=h->crt_fd_array_idx; /* can change due to io_watch_del(fd)
995                                                                                   array shifting */
996                         }
997                 }
998 error:
999         return ret;
1000 }
1001
1002
1003
1004 #ifdef HAVE_SELECT
/* wait for io using select */
/* io_wait_loop_x style function: waits up to t seconds with select() and
 * dispatches each ready fd to handle_io() (repeatedly if repeat!=0).
 * The fd_set results are translated back into poll-style revents bits.
 * Returns select()'s return value: number of ready fds (can be 0 on
 * timeout) or -1 on a non-EINTR error. */
inline static int io_wait_loop_select(io_wait_h* h, int t, int repeat)
{
	fd_set sel_rset;
	fd_set sel_wset;
	int n, ret;
	struct timeval timeout;
	int r;
	struct fd_map* fm;
	int revents;
	
again:
		/* select() modifies the fd_sets in place, so work on copies of
		 * the master sets each iteration */
		sel_rset=h->master_rset;
		sel_wset=h->master_wset;
		timeout.tv_sec=t;
		timeout.tv_usec=0;
		ret=n=select(h->max_fd_select+1, &sel_rset, &sel_wset, 0, &timeout);
		if (n<0){
			if (errno==EINTR) goto again; /* just a signal */
			LOG(L_ERR, "ERROR: io_wait_loop_select: select: %s [%d]\n",
					strerror(errno), errno);
			/* n is forced to 0 so the dispatch loop below is skipped;
			 * ret (-1) is still returned to the caller */
			n=0;
			/* continue */
		}
		/* use poll fd array */
		/* n counts the remaining ready fds; the scan stops early once all
		 * of them have been found */
		for(r=0; (r<h->fd_no) && n; r++){
			revents=0;
			if (likely(FD_ISSET(h->fd_array[r].fd, &sel_rset)))
				revents|=POLLIN;
			if (unlikely(FD_ISSET(h->fd_array[r].fd, &sel_wset)))
				revents|=POLLOUT;
			if (unlikely(revents)){
				h->crt_fd_array_idx=r;
				fm=get_fd_map(h, h->fd_array[r].fd);
				/* keep calling handle_io while the fd is still watched
				 * (fm->type!=0), the event is still wanted and handle_io
				 * reports more pending IO (only if repeat!=0) */
				while(fm->type && (fm->events & revents) &&
						(handle_io(fm, revents, r)>0) && repeat);
				r=h->crt_fd_array_idx; /* can change due to io_watch_del(fd)
										  array shifting */
				n--;
			}
		};
	return ret;
}
1048 #endif
1049
1050
1051
1052 #ifdef HAVE_EPOLL
/* io_wait_loop_x style function: waits up to t seconds with epoll_wait()
 * and dispatches each reported fd to handle_io() (repeatedly if repeat!=0).
 * epoll event bits are translated branch-free into poll-style revents.
 * Returns the number of events reported by epoll_wait() (can be 0 on
 * timeout) or -1 on a non-EINTR error. */
inline static int io_wait_loop_epoll(io_wait_h* h, int t, int repeat)
{
	int n, r;
	struct fd_map* fm;
	int revents;
	
again:
		n=epoll_wait(h->epfd, h->ep_array, h->fd_no, t*1000);
		if (unlikely(n==-1)){
			if (errno==EINTR) goto again; /* signal, ignore it */
			else{
				LOG(L_ERR, "ERROR:io_wait_loop_epoll: "
						"epoll_wait(%d, %p, %d, %d): %s [%d]\n",
						h->epfd, h->ep_array, h->fd_no, t*1000,
						strerror(errno), errno);
				goto error;
			}
		}
#if 0
		if (n>1){
			for(r=0; r<n; r++){
				LOG(L_ERR, "WARNING: ep_array[%d]= %x, %p\n",
						r, h->ep_array[r].events, h->ep_array[r].data.ptr);
			}
		}
#endif
		for (r=0; r<n; r++){
			/* (!(x)-1) is all-ones when bit x is set and 0 otherwise:
			 * a branch-free translation of EPOLL* bits into POLL* bits */
			revents= (POLLIN & (!(h->ep_array[r].events & (EPOLLIN|EPOLLPRI))
						-1)) |
					 (POLLOUT & (!(h->ep_array[r].events & EPOLLOUT)-1)) |
					 (POLLERR & (!(h->ep_array[r].events & EPOLLERR)-1)) |
					 (POLLHUP & (!(h->ep_array[r].events & EPOLLHUP)-1))
#ifdef POLLRDHUP
					| (POLLRDHUP & (!(h->ep_array[r].events & EPOLLRDHUP)-1))
#endif
					;
			if (likely(revents)){
				/* data.ptr was set to the fd_map entry when the fd was
				 * added/changed (see io_watch_add/io_watch_chg) */
				fm=(struct fd_map*)h->ep_array[r].data.ptr;
				/* keep calling handle_io while the fd is still watched
				 * (fm->type!=0), the event is still relevant (POLLERR and
				 * POLLHUP are always accepted) and handle_io reports more
				 * pending IO (only if repeat!=0) */
				while(fm->type && ((fm->events|POLLERR|POLLHUP) & revents) &&
						(handle_io(fm, revents, -1)>0) && repeat);
			}else{
				LOG(L_ERR, "ERROR:io_wait_loop_epoll: unexpected event %x"
							" on %d/%d, data=%p\n", h->ep_array[r].events,
							r+1, n, h->ep_array[r].data.ptr);
			}
		}
error:
	return n;
}
1102 #endif
1103
1104
1105
1106 #ifdef HAVE_KQUEUE
/* io_wait_loop_x style function: waits up to t seconds with kevent(),
 * applying the pending filter changes queued in h->kq_changes and
 * dispatching the returned events to handle_io() (repeatedly if repeat!=0).
 * If the change list is larger than the event array can absorb (possible
 * when kevent() errors out on bad/closed fds in the changes), the changes
 * are applied in kq_array_size-sized windows until all are consumed.
 * Returns the result of the last kevent() call (number of events, can be
 * 0 on timeout). */
inline static int io_wait_loop_kqueue(io_wait_h* h, int t, int repeat)
{
	int n, r;
	struct timespec tspec;
	struct fd_map* fm;
	int orig_changes;   /* changes still to be applied across iterations */
	int apply_changes;  /* changes submitted to the current kevent() call */
	int revents;
	
	tspec.tv_sec=t;
	tspec.tv_nsec=0;
	orig_changes=h->kq_nchanges;
	apply_changes=orig_changes;
	do {
again:
		n=kevent(h->kq_fd, h->kq_changes, apply_changes,  h->kq_array,
					h->kq_array_size, &tspec);
		if (unlikely(n==-1)){
			if (unlikely(errno==EINTR)) goto again; /* signal, ignore it */
			else {
				/* for a detailed explanation of what follows see below
				   the EV_ERROR case */
				if (unlikely(!(errno==EBADF || errno==ENOENT)))
					BUG("io_wait_loop_kqueue: kevent: unexpected error"
						" %s [%d]\n", strerror(errno), errno);
				/* some of the FDs in kq_changes are bad (already closed)
				   and there is not enough space in kq_array to return all
				   of them back */
				apply_changes = h->kq_array_size;
				goto again;
			}
		}
		/* remove applied changes */
		h->kq_nchanges -= apply_changes;
		if (unlikely(apply_changes < orig_changes)) {
			/* only a window of the changes was applied: shift the
			 * remaining ones to the front and size the next window */
			orig_changes -= apply_changes;
			memmove(&h->kq_changes[0], &h->kq_changes[apply_changes],
											sizeof(h->kq_changes[0])*h->kq_nchanges);
			apply_changes = (orig_changes < h->kq_array_size) ? orig_changes :
								h->kq_array_size;
		} else {
			orig_changes = 0;
			apply_changes = 0;
		}
		for (r=0; r<n; r++){
#ifdef EXTRA_DEBUG
			DBG("DBG: kqueue: event %d/%d: fd=%d, udata=%lx, flags=0x%x\n",
					r, n, h->kq_array[r].ident, (long)h->kq_array[r].udata,
					h->kq_array[r].flags);
#endif
			if (unlikely((h->kq_array[r].flags & EV_ERROR) ||
							 h->kq_array[r].udata == 0)){
				/* error in changes: we ignore it if it has to do with a
				   bad fd or update==0. It can be caused by trying to remove an
				   already closed fd: race between adding something to the
				   changes array, close() and applying the changes (EBADF).
				   E.g. for ser tcp: tcp_main sends a fd to child for reading
				    => deletes it from the watched fds => the changes array
					will contain an EV_DELETE for it. Before the changes
					are applied (they are at the end of the main io_wait loop,
					after all the fd events were processed), a CON_ERR sent
					to tcp_main by a sender (send fail) is processed and causes
					the fd to be closed. When the changes are applied =>
					error for the EV_DELETE attempt of a closed fd.
					Something similar can happen when a fd is scheduled
					for removal, is close()'ed before being removed and
					re-opened(a new sock. get the same fd). When the
					watched fd changes will be applied the fd will be valid
					(so no EBADF), but it's not already watch => ENOENT.
					We report a BUG for the other errors (there's nothing
					constructive we can do if we get an error we don't know
					how to handle), but apart from that we ignore it in the
					idea that it is better apply the rest of the changes,
					rather then dropping all of them.
				*/
				/*
					example EV_ERROR for trying to delete a read watched fd,
					that was already closed:
					{
						ident = 63,  [fd]
						filter = -1, [EVFILT_READ]
						flags = 16384, [EV_ERROR]
						fflags = 0,
						data = 9, [errno = EBADF]
						udata = 0x0
					}
				*/
				if (h->kq_array[r].data != EBADF &&
						h->kq_array[r].data != ENOENT)
					BUG("io_wait_loop_kqueue: kevent unexpected error on "
							"fd %ld udata %lx: %s [%ld]\n",
							(long)h->kq_array[r].ident,
							(long)h->kq_array[r].udata,
							strerror(h->kq_array[r].data),
							(long)h->kq_array[r].data);
			}else{
				/* udata was set to the fd_map entry when the filter was
				 * added (see kq_ev_change callers) */
				fm=(struct fd_map*)h->kq_array[r].udata;
				if (likely(h->kq_array[r].filter==EVFILT_READ)){
					/* ((int)!x-1) is all-ones when x is set, 0 otherwise:
					 * EV_EOF => POLLHUP, EV_EOF with a non-zero fflags
					 * (socket error code) => POLLERR as well */
					revents=POLLIN |
						(((int)!(h->kq_array[r].flags & EV_EOF)-1)&POLLHUP) |
						(((int)!((h->kq_array[r].flags & EV_EOF) &&
									h->kq_array[r].fflags != 0) - 1)&POLLERR);
					/* dispatch while the fd is still watched, the event is
					 * still wanted and handle_io reports more pending IO
					 * (only if repeat!=0) */
					while(fm->type && (fm->events & revents) &&
							(handle_io(fm, revents, -1)>0) && repeat);
				}else if (h->kq_array[r].filter==EVFILT_WRITE){
					revents=POLLOUT |
						(((int)!(h->kq_array[r].flags & EV_EOF)-1)&POLLHUP) |
						(((int)!((h->kq_array[r].flags & EV_EOF) &&
									h->kq_array[r].fflags != 0) - 1)&POLLERR);
					while(fm->type && (fm->events & revents) &&
							(handle_io(fm, revents, -1)>0) && repeat);
				}else{
					BUG("io_wait_loop_kqueue: unknown filter: kqueue: event "
							"%d/%d: fd=%d, filter=%d, flags=0x%x, fflags=0x%x,"
							" data=%lx, udata=%lx\n",
					r, n, (int)h->kq_array[r].ident, (int)h->kq_array[r].filter,
					h->kq_array[r].flags, h->kq_array[r].fflags,
					(unsigned long)h->kq_array[r].data,
					(unsigned long)h->kq_array[r].udata);
				}
			}
		}
	} while(unlikely(orig_changes));
	return n;
}
1232 #endif
1233
1234
1235
#ifdef HAVE_SIGIO_RT
/* sigio rt version has no repeat (it doesn't make sense)*/
/* waits up to t seconds for one queued real-time io signal and dispatches
 * the corresponding event to handle_io().
 * params: h - initialized io_wait handler; h->sset must contain both
 *             h->signo (the rt io signal) and SIGIO (overflow indicator)
 *         t - timeout in seconds
 * returns: number of handled events (normally 1; 0 on timeout) or -1 on
 *          error.
 * If the rt signal queue overflowed (plain SIGIO delivered instead of
 * h->signo), the queue is flushed and one io_wait_loop_poll() pass is used
 * as a fallback. */
inline static int io_wait_loop_sigio_rt(io_wait_h* h, int t)
{
	int n;
	int ret;
	struct timespec ts;
	siginfo_t siginfo;
	int sigio_band;
	int sigio_fd;
	struct fd_map* fm;
	int revents;
#ifdef SIGINFO64_WORKARROUND
	int* pi;
#endif
	
	
	ret=1; /* 1 event per call normally */
	ts.tv_sec=t;
	ts.tv_nsec=0;
	/* sanity check: both signals must be in the waited-for set, otherwise
	 * sigtimedwait() below would never return them */
	if (unlikely(!sigismember(&h->sset, h->signo) ||
					!sigismember(&h->sset, SIGIO))) {
		LOG(L_CRIT, "BUG: io_wait_loop_sigio_rt: the signal mask"
				" is not properly set!\n");
		goto error;
	}
again:
	n=sigtimedwait(&h->sset, &siginfo, &ts);
	if (unlikely(n==-1)){
		if (errno==EINTR) goto again; /* some other signal, ignore it */
		else if (errno==EAGAIN){ /* timeout */
			ret=0;
			goto end;
		}else{
			LOG(L_ERR, "ERROR: io_wait_loop_sigio_rt: sigtimed_wait"
					" %s [%d]\n", strerror(errno), errno);
			goto error;
		}
	}
	/* n is the received signal: h->signo means a queued rt io event with
	 * full siginfo; SIGIO means the rt signal queue overflowed */
	if (likely(n!=SIGIO)){
#ifdef SIGINFO64_WORKARROUND
		/* on linux siginfo.si_band is defined as long in userspace
		 * and as int in kernel (< 2.6.5) => on 64 bits things will break!
		 * (si_band will include si_fd, and si_fd will contain
		 *  garbage).
		 *  see /usr/src/linux/include/asm-generic/siginfo.h and
		 *      /usr/include/bits/siginfo.h
		 *  On newer kernels this is fixed (si_band is long in the kernel too).
		 * -- andrei */
		if  ((_os_ver<0x020605) && (sizeof(siginfo.si_band)>sizeof(int))){
			pi=(int*)(void*)&siginfo.si_band; /* avoid type punning warnings */
			sigio_band=*pi;
			sigio_fd=*(pi+1); /* the kernel packed si_fd right after the
			                     32 bit si_band */
		}else
#endif
		{
			sigio_band=siginfo.si_band;
			sigio_fd=siginfo.si_fd;
		}
		if (unlikely(siginfo.si_code==SI_SIGIO)){
			/* old style, we don't know the event (linux 2.2.?) */
			LOG(L_WARN, "WARNING: io_wait_loop_sigio_rt: old style sigio"
					" interface\n");
			fm=get_fd_map(h, sigio_fd);
			/* we can have queued signals generated by fds not watched
			 * any more, or by fds in transition, to a child => ignore them*/
			if (fm->type)
				handle_io(fm, POLLIN|POLLOUT, -1);
		}else{
			/* si_code contains the SIGPOLL reason: POLL_IN, POLL_OUT,
			 *  POLL_MSG, POLL_ERR, POLL_PRI or POLL_HUP
			 * and si_band the translated poll event bitmap:
			 *  POLLIN|POLLRDNORM  (=POLL_IN),
			 *  POLLOUT|POLLWRNORM|POLLWRBAND (=POLL_OUT),
			 *  POLLIN|POLLRDNORM|POLLMSG (=POLL_MSG),
			 *  POLLERR (=POLL_ERR),
			 *  POLLPRI|POLLRDBAND (=POLL_PRI),
			 *  POLLHUP|POLLERR (=POLL_HUP)
			 *  [linux 2.6.22 fs/fcntl.c:447]
			 */
#ifdef EXTRA_DEBUG
			DBG("io_wait_loop_sigio_rt: siginfo: signal=%d (%d),"
					" si_code=%d, si_band=0x%x,"
					" si_fd=%d\n",
					siginfo.si_signo, n, siginfo.si_code,
					(unsigned)sigio_band,
					sigio_fd);
#endif
			/* on some errors (e.g. when receiving TCP RST), sigio_band will
			 * be set to 0x08 (POLLERR) or 0x18 (POLLERR|POLLHUP - on stream
			 *  unix socket close) , so better catch all events --andrei */
			if (likely(sigio_band)){
				fm=get_fd_map(h, sigio_fd);
				revents=sigio_band;
				/* fix revents==POLLPRI case: branchless or-in of POLLIN
				 * (the mask below is all-ones iff POLLPRI is set, 0
				 *  otherwise) */
				revents |= (!(revents & POLLPRI)-1) & POLLIN;
				/* we can have queued signals generated by fds not watched
				 * any more, or by fds in transition, to a child
				 * => ignore them */
				if (fm->type && ((fm->events|POLLERR|POLLHUP) & revents))
					handle_io(fm, revents, -1);
				else
					DBG("WARNING: io_wait_loop_sigio_rt: ignoring event"
							" %x on fd %d, watching for %x, si_code=%x "
							"(fm->type=%d, fm->fd=%d, fm->data=%p)\n",
							sigio_band, sigio_fd, fm->events, siginfo.si_code,
							fm->type, fm->fd, fm->data);
			}else{
				LOG(L_ERR, "ERROR: io_wait_loop_sigio_rt: unexpected event"
							" on fd %d: %x\n", sigio_fd, sigio_band);
			}
		}
	}else{
		/* signal queue overflow
		 * TODO: increase signal queue size: 2.4x /proc/.., 2.6x -rlimits */
		LOG(L_WARN, "WARNING: io_wait_loop_sigio_rt: signal queue overflowed"
					"- falling back to poll\n");
		/* clear real-time signal queue
		 * both SIG_IGN and SIG_DFL are needed , it doesn't work
		 * only with SIG_DFL  */
		if (signal(h->signo, SIG_IGN)==SIG_ERR){
			LOG(L_CRIT, "BUG: do_poll: couldn't reset signal to IGN\n");
		}
		
		if (signal(h->signo, SIG_DFL)==SIG_ERR){
			LOG(L_CRIT, "BUG: do_poll: couldn't reset signal to DFL\n");
		}
		/* falling back to normal poll */
		ret=io_wait_loop_poll(h, -1, 1);
	}
end:
	return ret;
error:
	return -1;
}
#endif
1372
1373
1374
#ifdef HAVE_DEVPOLL
/* /dev/poll (solaris) based io wait loop.
 * params: h      - initialized io_wait handler (h->dpoll_fd open on
 *                  /dev/poll, h->fd_array registered with it)
 *         t      - timeout in seconds (converted to ms for DP_POLL)
 *         repeat - if non-zero, keep calling handle_io() on the same fd
 *                  while it reports more work (returns >0)
 * returns: number of ready fds reported by DP_POLL or -1 on error. */
inline static int io_wait_loop_devpoll(io_wait_h* h, int t, int repeat)
{
	int nready;
	int i;
	int ret;
	struct dvpoll dpoll;
	struct fd_map* e;

	dpoll.dp_timeout=t*1000; /* DP_POLL wants milliseconds */
	dpoll.dp_nfds=h->fd_no;
	dpoll.dp_fds=h->fd_array;
retry:
	ret=nready=ioctl(h->dpoll_fd, DP_POLL, &dpoll);
	if (unlikely(nready==-1)){
		if (errno==EINTR)
			goto retry; /* signal, ignore it */
		LOG(L_ERR, "ERROR:io_wait_loop_devpoll: ioctl: %s [%d]\n",
				strerror(errno), errno);
		goto error;
	}
	for (i=0; i<nready; i++){
		/* POLLNVAL/POLLERR are only logged; the event is still handed to
		 * handle_io() below so the owner can clean up */
		if (h->fd_array[i].revents & (POLLNVAL|POLLERR)){
			LOG(L_ERR, "ERROR: io_wait_loop_devpoll: pollinval returned"
						" for fd %d, revents=%x\n",
						h->fd_array[i].fd, h->fd_array[i].revents);
		}
		/* POLLIN|POLLHUP just go through */
		e=get_fd_map(h, h->fd_array[i].fd);
		while(e->type && (e->events & h->fd_array[i].revents) &&
				(handle_io(e, h->fd_array[i].revents, i) > 0) && repeat)
			;
	}
error:
	return ret;
}
#endif
1411
1412
1413
1414 /* init */
1415
1416
/* initializes the static vars/arrays
 * params:           h - pointer to the io_wait_h that will be initialized
 *              max_fd - maximum allowed fd number
 *         poll_method - poll method (0 for automatic best fit)
 */
1422 int init_io_wait(io_wait_h* h, int max_fd, enum poll_types poll_method);
1423
1424 /* destroys everything init_io_wait allocated */
1425 void destroy_io_wait(io_wait_h* h);
1426
1427
1428 #endif