/* [sip-router] io_wait.h (commit f3ce32400e6ed73255fe5e78145dd6acc950aaf7) */
1 /*
2  * $Id$
3  *
4  * Copyright (C) 2005 iptelorg GmbH
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 /*
19  * tcp io wait common stuff used by tcp_main.c & tcp_read.c
20  * All the functions are inline because of speed reasons and because they are
21  * used only from 2 places.
22  * You also have to define:
23  *     int handle_io(struct fd_map* fm, short events, int idx) (see below)
24  *     (this could be trivially replaced by a callback pointer entry attached
 *      to the io_wait handler if more flexibility rather than performance
 *      is needed)
 *      fd_type - define to some enum of your choice and define also
28  *                FD_TYPE_DEFINED (if you don't do it fd_type will be defined
29  *                to int). 0 has a special not set/not init. meaning
30  *                (a lot of sanity checks and the sigio_rt code are based on
31  *                 this assumption)
32  *     local_malloc (defaults to pkg_malloc)
33  *     local_free   (defaults to pkg_free)
34  *
35  */
36 /*
37  * History:
38  * --------
39  *  2005-06-13  created by andrei
40  *  2005-06-26  added kqueue (andrei)
41  *  2005-07-01  added /dev/poll (andrei)
 *  2006-05-30  sigio 64 bit workaround enabled for kernels < 2.6.5 (andrei)
43  *  2007-11-22  when handle_io() is called in a loop check & stop if the fd was
44  *               removed inside handle_io() (andrei)
45  *  2007-11-29  support for write (POLLOUT); added io_watch_chg() (andrei)
46  *  2008-02-04  POLLRDHUP & EPOLLRDHUP support (automatically enabled if POLLIN
47  *               is set) (andrei)
48  *  2010-06-17  re-enabled & enhanced the EV_ERROR for kqueue (andrei)
49  */
50
51
52
53 #ifndef _io_wait_h
54 #define _io_wait_h
55
56 #include <errno.h>
57 #include <string.h>
58 #ifdef HAVE_SIGIO_RT
59 #define __USE_GNU /* or else F_SETSIG won't be included */
60 #include <sys/types.h> /* recv */
61 #include <sys/socket.h> /* recv */
62 #include <signal.h> /* sigprocmask, sigwait a.s.o */
63 #endif
64
65 #define _GNU_SOURCE  /* for POLLRDHUP on linux */
66 #include <sys/poll.h>
67 #include <fcntl.h>
68
69 #ifdef HAVE_EPOLL
70 #include <sys/epoll.h>
71 #endif
72 #ifdef HAVE_KQUEUE
73 #include <sys/types.h> /* needed on freebsd */
74 #include <sys/event.h>
75 #include <sys/time.h>
76 #endif
77 #ifdef HAVE_DEVPOLL
78 #include <sys/devpoll.h>
79 #endif
80 #ifdef HAVE_SELECT
81 /* needed on openbsd for select*/
82 #include <sys/time.h>
83 #include <sys/types.h>
84 #include <unistd.h>
85 /* needed according to POSIX for select*/
86 #include <sys/select.h>
87 #endif
88
89 #include "dprint.h"
90
91 #include "poll_types.h" /* poll_types*/
92 #ifdef HAVE_SIGIO_RT
93 #include "pt.h" /* mypid() */
94 #endif
95
96 #include "compiler_opt.h"
97
98
99 #ifdef HAVE_EPOLL
100 /* fix defines for EPOLL */
101 #if defined POLLRDHUP && ! defined EPOLLRDHUP
102 #define EPOLLRDHUP POLLRDHUP  /* should work on all linuxes */
103 #endif /* POLLRDHUP && EPOLLRDHUP */
104 #endif /* HAVE_EPOLL */
105
106
extern int _os_ver; /* os version number, needed to select bug workarounds */
108
109
110 #if 0
111 enum fd_types; /* this should be defined from the including file,
112                                   see tcp_main.c for an example,
113                                   0 has a special meaning: not used/empty*/
114 #endif
115
116 #ifndef FD_TYPE_DEFINED
117 typedef int fd_type;
118 #define FD_TYPE_DEFINED
119 #endif
120
/* maps a fd to some other structure; used in almost all cases
 * except epoll and maybe kqueue or /dev/poll
 * (the fd itself is used as the hash index => O(1) lookup) */
struct fd_map{
	int fd;               /* fd number; set to -1 when the entry is unhashed */
	fd_type type;         /* "data" type; 0 has the special not-set/empty meaning */
	void* data;           /* pointer to the corresponding structure */
	short events;         /* events we are interested in (POLLIN and/or POLLOUT) */
};
129
130
131 #ifdef HAVE_KQUEUE
132 #ifndef KQ_CHANGES_ARRAY_SIZE
133 #define KQ_CHANGES_ARRAY_SIZE 256
134
135 #ifdef __OS_netbsd
136 #define KEV_UDATA_CAST (intptr_t)
137 #else
138 #define KEV_UDATA_CAST
139 #endif
140
141 #endif
142 #endif
143
144
/* handler structure: per-process state for one io wait loop instance */
struct io_wait_handler{
	enum poll_types poll_method; /* which poll backend is in use */
	int flags;
	struct fd_map* fd_hash;      /* fd -> fd_map hash (indexed by fd) */
	int fd_no; /*  current index used in fd_array and the passed size for
				   ep_array (for kq_array at least
					max(twice the size, kq_changes_size) should be
				   be passed). */
	int max_fd_no; /* maximum fd no, is also the size of fd_array,
						   fd_hash  and ep_array*/
	/* common stuff for POLL, SIGIO_RT and SELECT
	 * since poll support is always compiled => this will always be compiled */
	struct pollfd* fd_array; /* used also by devpoll as devpoll array */
	int crt_fd_array_idx; /*  crt idx for which handle_io is called
							 (updated also by del -> internal optimization) */
	/* end of common stuff */
#ifdef HAVE_EPOLL
	int epfd; /* epoll ctrl fd */
	struct epoll_event* ep_array; /* epoll_wait() eventlist */
#endif
#ifdef HAVE_SIGIO_RT
	sigset_t sset; /* signal mask for sigio & sigrtmin */
	int signo;     /* real time signal used */
#endif
#ifdef HAVE_KQUEUE
	int kq_fd;                 /* kqueue() fd */
	struct kevent* kq_array;   /* used for the eventlist*/
	struct kevent* kq_changes; /* used for the changelist */
	size_t kq_nchanges;        /* number of pending (unflushed) changes */
	size_t kq_array_size;   /* array size */
	size_t kq_changes_size; /* size of the changes array */
#endif
#ifdef HAVE_DEVPOLL
	int dpoll_fd; /* /dev/poll fd */
#endif
#ifdef HAVE_SELECT
	fd_set master_rset; /* read set */
	fd_set master_wset; /* write set */
	int max_fd_select; /* maximum select used fd */
#endif
};
187
188 typedef struct io_wait_handler io_wait_h;
189
190
/* get the corresponding fd_map structure pointer for a fd
 * (the fd is used directly as the index in the hash => O(1)) */
#define get_fd_map(h, fd)		(&(h)->fd_hash[(fd)])
/* remove a fd_map structure from the hash; the pointer must be returned
 * by get_fd_map or hash_fd_map
 * (marks the entry as free: type==0 has the "empty" meaning, fd==-1) */
#define unhash_fd_map(pfm)	\
	do{ \
		(pfm)->type=0 /*F_NONE */; \
		(pfm)->fd=-1; \
	}while(0)
200
201 /* add a fd_map structure to the fd hash */
202 static inline struct fd_map* hash_fd_map(       io_wait_h* h,
203                                                                                         int fd,
204                                                                                         short events,
205                                                                                         fd_type type,
206                                                                                         void* data)
207 {
208         h->fd_hash[fd].fd=fd;
209         h->fd_hash[fd].events=events;
210         h->fd_hash[fd].type=type;
211         h->fd_hash[fd].data=data;
212         return &h->fd_hash[fd];
213 }
214
215
216
217 #ifdef HANDLE_IO_INLINE
218 /* generic handle io routine, this must be defined in the including file
219  * (faster then registering a callback pointer)
220  *
221  * params:  fm     - pointer to a fd hash entry
222  *          events - combinations of POLLIN, POLLOUT, POLLERR & POLLHUP
223  *          idx    - index in the fd_array (or -1 if not known)
224  * return: -1 on error
225  *          0 on EAGAIN or when by some other way it is known that no more
226  *            io events are queued on the fd (the receive buffer is empty).
 *            Useful to detect when there are no more io events queued for
 *            sigio_rt, epoll_et, kqueue.
 *         >0 on successful read from the fd (when there might be more io
230  *            queued -- the receive buffer might still be non-empty)
231  */
232 inline static int handle_io(struct fd_map* fm, short events, int idx);
233 #else
234 int handle_io(struct fd_map* fm, short events, int idx);
235 #endif
236
237
238
239 #ifdef HAVE_KQUEUE
240 /*
241  * kqueue specific function: register a change
242  * (adds a change to the kevent change array, and if full flushes it first)
243  *
244  * TODO: check if the event already exists in the change list or if it's
245  *       complementary to an event in the list (e.g. EVFILT_WRITE, EV_DELETE
246  *       and EVFILT_WRITE, EV_ADD for the same fd).
247  * returns: -1 on error, 0 on success
248  */
249 static inline int kq_ev_change(io_wait_h* h, int fd, int filter, int flag,
250                                                                 void* data)
251 {
252         int n;
253         int r;
254         struct timespec tspec;
255
256         if (h->kq_nchanges>=h->kq_changes_size){
257                 /* changes array full ! */
258                 LOG(L_WARN, "WARNING: kq_ev_change: kqueue changes array full"
259                                         " trying to flush...\n");
260                 tspec.tv_sec=0;
261                 tspec.tv_nsec=0;
262 again:
263                 n=kevent(h->kq_fd, h->kq_changes, h->kq_nchanges, 0, 0, &tspec);
264                 if (unlikely(n == -1)){
265                         if (unlikely(errno == EINTR)) goto again;
266                         else {
267                                 /* for a detailed explanation of what follows see
268                                    io_wait_loop_kqueue EV_ERROR case */
269                                 if (unlikely(!(errno == EBADF || errno == ENOENT)))
270                                         BUG("kq_ev_change: kevent flush changes failed"
271                                                         " (unexpected error): %s [%d]\n",
272                                                         strerror(errno), errno);
273                                         /* ignore error even if it's not a EBADF/ENOENT */
274                                 /* one of the file descriptors is bad, probably already
275                                    closed => try to apply changes one-by-one */
276                                 for (r = 0; r < h->kq_nchanges; r++) {
277 retry2:
278                                         n = kevent(h->kq_fd, &h->kq_changes[r], 1, 0, 0, &tspec);
279                                         if (n==-1) {
280                                                 if (unlikely(errno == EINTR))
281                                                         goto retry2;
282                                         /* for a detailed explanation of what follows see
283                                                 io_wait_loop_kqueue EV_ERROR case */
284                                                 if (unlikely(!(errno == EBADF || errno == ENOENT)))
285                                                         BUG("kq_ev_change: kevent flush changes failed:"
286                                                                         " (unexpected error) %s [%d] (%d/%d)\n",
287                                                                                 strerror(errno), errno,
288                                                                                 r, h->kq_nchanges);
289                                                 continue; /* skip over it */
290                                         }
291                                 }
292                         }
293                 }
294                 h->kq_nchanges=0; /* changes array is empty */
295         }
296         EV_SET(&h->kq_changes[h->kq_nchanges], fd, filter, flag, 0, 0,
297                         KEV_UDATA_CAST data);
298         h->kq_nchanges++;
299         return 0;
300 }
301 #endif
302
303
304
/* generic io_watch_add function
 * Params:
 *     h      - pointer to initialized io_wait handle
 *     fd     - fd to watch
 *     events - bitmap with the fd events for which the fd should be watched
 *              (combination of POLLIN and POLLOUT)
 *     type   - fd type (non 0 value, returned in the call to handle_io)
 *     data   - pointer/private data returned in the handle_io call
 * returns 0 on success, -1 on error
 *
 * WARNING: handle_io() can be called immediately (from io_watch_add()) so
 *  make sure that any dependent init. (e.g. data stuff) is made before
 *  calling io_watch_add
 *
 * this version should be faster than pointers to poll_method specific
 * functions (it avoids functions calls, the overhead being only an extra
 *  switch())*/
inline static int io_watch_add(	io_wait_h* h,
								int fd,
								short events,
								fd_type type,
								void* data)
{

	/* helper macros */
	/* append fd at the end of fd_array (slot h->fd_no) watching (ev) */
#define fd_array_setup(ev) \
	do{ \
		h->fd_array[h->fd_no].fd=fd; \
		h->fd_array[h->fd_no].events=(ev); /* useless for select */ \
		h->fd_array[h->fd_no].revents=0;     /* useless for select */ \
	}while(0)

	/* OR the flags (f) into the fd's file status flags via fcntl;
	 * on failure logs and jumps to error (uses the local "flags" var) */
#define set_fd_flags(f) \
	do{ \
			flags=fcntl(fd, F_GETFL); \
			if (flags==-1){ \
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: GETFL failed:" \
						" %s [%d]\n", strerror(errno), errno); \
				goto error; \
			} \
			if (fcntl(fd, F_SETFL, flags|(f))==-1){ \
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETFL" \
							" failed: %s [%d]\n", strerror(errno), errno); \
				goto error; \
			} \
	}while(0)


	struct fd_map* e;
	int flags;
#ifdef HAVE_EPOLL
	struct epoll_event ep_event;
#endif
#ifdef HAVE_DEVPOLL
	struct pollfd pfd;
#endif
#if defined(HAVE_SIGIO_RT) || defined (HAVE_EPOLL)
	int n;
#endif
#if defined(HAVE_SIGIO_RT)
	int idx;
	int check_io;
	struct pollfd pf;

	check_io=0; /* set to 1 if we need to check for pre-existing queued
				   io/data on the fd */
	idx=-1;
#endif
	e=0;
	/* sanity checks */
	if (unlikely(fd==-1)){
		LOG(L_CRIT, "BUG: io_watch_add: fd is -1!\n");
		goto error;
	}
	if (unlikely((events&(POLLIN|POLLOUT))==0)){
		LOG(L_CRIT, "BUG: io_watch_add: invalid events: 0x%0x\n", events);
		goto error;
	}
	/* check if not too big */
	if (unlikely(h->fd_no>=h->max_fd_no)){
		LOG(L_CRIT, "ERROR: io_watch_add: maximum fd number exceeded:"
				" %d/%d\n", h->fd_no, h->max_fd_no);
		goto error;
	}
	DBG("DBG: io_watch_add(%p, %d, %d, %p), fd_no=%d\n",
			h, fd, type, data, h->fd_no);
	/*  hash sanity check: refuse to overwrite a live entry (type!=0) */
	e=get_fd_map(h, fd);
	if (unlikely(e && (e->type!=0 /*F_NONE*/))){
		LOG(L_ERR, "ERROR: io_watch_add: trying to overwrite entry %d"
				" watched for %x in the hash(%d, %d, %p) with (%d, %d, %p)\n",
				fd, events, e->fd, e->type, e->data, fd, type, data);
		e=0; /* don't unhash the pre-existing entry on the error path */
		goto error;
	}

	if (unlikely((e=hash_fd_map(h, fd, events, type, data))==0)){
		LOG(L_ERR, "ERROR: io_watch_add: failed to hash the fd %d\n", fd);
		goto error;
	}
	switch(h->poll_method){ /* faster than pointer to functions */
		case POLL_POLL:
#ifdef POLLRDHUP
			/* listen to POLLRDHUP by default (if POLLIN)
			 * (branchless: (int)!(events&POLLIN)-1 is ~0 when POLLIN is
			 *  set and 0 otherwise) */
			events|=((int)!(events & POLLIN) - 1) & POLLRDHUP;
#endif /* POLLRDHUP */
			fd_array_setup(events);
			set_fd_flags(O_NONBLOCK);
			break;
#ifdef HAVE_SELECT
		case POLL_SELECT:
			fd_array_setup(events);
			if (likely(events & POLLIN))
				FD_SET(fd, &h->master_rset);
			if (unlikely(events & POLLOUT))
				FD_SET(fd, &h->master_wset);
			if (h->max_fd_select<fd) h->max_fd_select=fd;
			break;
#endif
#ifdef HAVE_SIGIO_RT
		case POLL_SIGIO_RT:
			fd_array_setup(events);
			/* re-set O_ASYNC might be needed, if not done from
			 * io_watch_del (or if somebody wants to add a fd which has
			 * already O_ASYNC/F_SETSIG set on a duplicate)
			 */
			/* set async & signal */
			if (fcntl(fd, F_SETOWN, my_pid())==-1){
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETOWN"
				" failed: %s [%d]\n", strerror(errno), errno);
				goto error;
			}
			if (fcntl(fd, F_SETSIG, h->signo)==-1){
				LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETSIG"
					" failed: %s [%d]\n", strerror(errno), errno);
				goto error;
			}
			/* set both non-blocking and async */
			set_fd_flags(O_ASYNC| O_NONBLOCK);
#ifdef EXTRA_DEBUG
			DBG("io_watch_add: sigio_rt on f %d, signal %d to pid %d\n",
					fd,  h->signo, my_pid());
#endif
			/* empty socket receive buffer, if buffer is already full
			 * no more space to put packets
			 * => no more signals are ever generated
			 * also when moving fds, the freshly moved fd might have
			 *  already some bytes queued, we want to get them now
			 *  and not later -- andrei */
			idx=h->fd_no;
			check_io=1;
			break;
#endif
#ifdef HAVE_EPOLL
		case POLL_EPOLL_LT:
			/* branchless event mask build: the (int)!(...)-1 term is ~0
			 * when the corresponding POLLIN/POLLOUT bit is set, 0 if not */
			ep_event.events=
#ifdef POLLRDHUP
						/* listen for EPOLLRDHUP too */
						((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
#else /* POLLRDHUP */
						(EPOLLIN & ((int)!(events & POLLIN)-1) ) |
#endif /* POLLRDHUP */
						(EPOLLOUT & ((int)!(events & POLLOUT)-1) );
			ep_event.data.ptr=e;
again1:
			n=epoll_ctl(h->epfd, EPOLL_CTL_ADD, fd, &ep_event);
			if (unlikely(n==-1)){
				if (errno==EAGAIN) goto again1;
				LOG(L_ERR, "ERROR: io_watch_add: epoll_ctl failed: %s [%d]\n",
					strerror(errno), errno);
				goto error;
			}
			break;
		case POLL_EPOLL_ET:
			/* edge-triggered => the fd must be non-blocking */
			set_fd_flags(O_NONBLOCK);
			ep_event.events=
#ifdef POLLRDHUP
						/* listen for EPOLLRDHUP too */
						((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
#else /* POLLRDHUP */
						(EPOLLIN & ((int)!(events & POLLIN)-1) ) |
#endif /* POLLRDHUP */
						(EPOLLOUT & ((int)!(events & POLLOUT)-1) ) |
						EPOLLET;
			ep_event.data.ptr=e;
again2:
			n=epoll_ctl(h->epfd, EPOLL_CTL_ADD, fd, &ep_event);
			if (unlikely(n==-1)){
				if (errno==EAGAIN) goto again2;
				LOG(L_ERR, "ERROR: io_watch_add: epoll_ctl failed: %s [%d]\n",
					strerror(errno), errno);
				goto error;
			}
			break;
#endif
#ifdef HAVE_KQUEUE
		case POLL_KQUEUE:
			if (likely( events & POLLIN)){
				if (unlikely(kq_ev_change(h, fd, EVFILT_READ, EV_ADD, e)==-1))
					goto error;
			}
			if (unlikely( events & POLLOUT)){
				if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE, EV_ADD, e)==-1))
				{
					/* roll back the read filter added above */
					if (likely(events & POLLIN)){
						kq_ev_change(h, fd, EVFILT_READ, EV_DELETE, 0);
					}
					goto error;
				}
			}
			break;
#endif
#ifdef HAVE_DEVPOLL
		case POLL_DEVPOLL:
			pfd.fd=fd;
			pfd.events=events;
			pfd.revents=0;
again_devpoll:
			if (write(h->dpoll_fd, &pfd, sizeof(pfd))==-1){
				if (errno==EAGAIN) goto again_devpoll;
				LOG(L_ERR, "ERROR: io_watch_add: /dev/poll write failed:"
							"%s [%d]\n", strerror(errno), errno);
				goto error;
			}
			break;
#endif

		default:
			LOG(L_CRIT, "BUG: io_watch_add: no support for poll method "
					" %s (%d)\n", poll_method_str[h->poll_method],
					h->poll_method);
			goto error;
	}

	h->fd_no++; /* "activate" changes, for epoll/kqueue/devpoll it
				   has only informative value */
#if defined(HAVE_SIGIO_RT)
	if (check_io){
		/* handle possible pre-existing events: poll with a 0 timeout and
		 * call handle_io() until the fd is drained, removed from the hash
		 * (e->type==0) or an error occurs */
		pf.fd=fd;
		pf.events=events;
check_io_again:
		n=0;
		while(e->type && ((n=poll(&pf, 1, 0))>0) &&
				(handle_io(e, pf.revents, idx)>0) &&
				(pf.revents & (e->events|POLLERR|POLLHUP)));
		if (unlikely(e->type && (n==-1))){
			if (errno==EINTR) goto check_io_again;
			LOG(L_ERR, "ERROR: io_watch_add: check_io poll: %s [%d]\n",
						strerror(errno), errno);
		}
	}
#endif
	return 0;
error:
	/* e is non-0 only when this call hashed the entry itself => safe to
	 * unhash it here */
	if (e) unhash_fd_map(e);
	return -1;
#undef fd_array_setup
#undef set_fd_flags
}
565
566
567
568 #define IO_FD_CLOSING 16
569 /* parameters:    h - handler
570  *               fd - file descriptor
571  *            index - index in the fd_array if known, -1 if not
572  *                    (if index==-1 fd_array will be searched for the
573  *                     corresponding fd* entry -- slower but unavoidable in
574  *                     some cases). index is not used (no fd_array) for epoll,
575  *                     /dev/poll and kqueue
576  *            flags - optimization flags, e.g. IO_FD_CLOSING, the fd was
577  *                    or will shortly be closed, in some cases we can avoid
578  *                    extra remove operations (e.g.: epoll, kqueue, sigio)
579  * returns 0 if ok, -1 on error */
580 inline static int io_watch_del(io_wait_h* h, int fd, int idx, int flags)
581 {
582         
583 #define fix_fd_array \
584         do{\
585                         if (unlikely(idx==-1)){ \
586                                 /* fix idx if -1 and needed */ \
587                                 for (idx=0; (idx<h->fd_no) && \
588                                                         (h->fd_array[idx].fd!=fd); idx++); \
589                         } \
590                         if (likely(idx<h->fd_no)){ \
591                                 memmove(&h->fd_array[idx], &h->fd_array[idx+1], \
592                                         (h->fd_no-(idx+1))*sizeof(*(h->fd_array))); \
593                                 if ((idx<=h->crt_fd_array_idx) && (h->crt_fd_array_idx>=0)) \
594                                         h->crt_fd_array_idx--; \
595                         } \
596         }while(0)
597         
598         struct fd_map* e;
599         int events;
600 #ifdef HAVE_EPOLL
601         int n;
602         struct epoll_event ep_event;
603 #endif
604 #ifdef HAVE_DEVPOLL
605         struct pollfd pfd;
606 #endif
607 #ifdef HAVE_SIGIO_RT
608         int fd_flags;
609 #endif
610         
611         if (unlikely((fd<0) || (fd>=h->max_fd_no))){
612                 LOG(L_CRIT, "BUG: io_watch_del: invalid fd %d, not in [0, %d) \n",
613                                                 fd, h->fd_no);
614                 goto error;
615         }
616         DBG("DBG: io_watch_del (%p, %d, %d, 0x%x) fd_no=%d called\n",
617                         h, fd, idx, flags, h->fd_no);
618         e=get_fd_map(h, fd);
619         /* more sanity checks */
620         if (unlikely(e==0)){
621                 LOG(L_CRIT, "BUG: io_watch_del: no corresponding hash entry for %d\n",
622                                         fd);
623                 goto error;
624         }
625         if (unlikely(e->type==0 /*F_NONE*/)){
626                 LOG(L_ERR, "ERROR: io_watch_del: trying to delete already erased"
627                                 " entry %d in the hash(%d, %d, %p) flags %x)\n",
628                                 fd, e->fd, e->type, e->data, flags);
629                 goto error;
630         }
631         events=e->events;
632         
633         switch(h->poll_method){
634                 case POLL_POLL:
635                         fix_fd_array;
636                         break;
637 #ifdef HAVE_SELECT
638                 case POLL_SELECT:
639                         if (likely(events & POLLIN))
640                                 FD_CLR(fd, &h->master_rset);
641                         if (unlikely(events & POLLOUT))
642                                 FD_CLR(fd, &h->master_wset);
643                         if (unlikely(h->max_fd_select && (h->max_fd_select==fd)))
644                                 /* we don't know the prev. max, so we just decrement it */
645                                 h->max_fd_select--;
646                         fix_fd_array;
647                         break;
648 #endif
649 #ifdef HAVE_SIGIO_RT
650                 case POLL_SIGIO_RT:
651                         /* the O_ASYNC flag must be reset all the time, the fd
652                          *  can be changed only if  O_ASYNC is reset (if not and
653                          *  the fd is a duplicate, you will get signals from the dup. fd
654                          *  and not from the original, even if the dup. fd was closed
655                          *  and the signals re-set on the original) -- andrei
656                          */
657                         /*if (!(flags & IO_FD_CLOSING)){*/
658                                 /* reset ASYNC */
659                                 fd_flags=fcntl(fd, F_GETFL);
660                                 if (unlikely(fd_flags==-1)){
661                                         LOG(L_ERR, "ERROR: io_watch_del: fnctl: GETFL failed:"
662                                                         " %s [%d]\n", strerror(errno), errno);
663                                         goto error;
664                                 }
665                                 if (unlikely(fcntl(fd, F_SETFL, fd_flags&(~O_ASYNC))==-1)){
666                                         LOG(L_ERR, "ERROR: io_watch_del: fnctl: SETFL"
667                                                                 " failed: %s [%d]\n", strerror(errno), errno);
668                                         goto error;
669                                 }
670                         fix_fd_array; /* only on success */
671                         break;
672 #endif
673 #ifdef HAVE_EPOLL
674                 case POLL_EPOLL_LT:
675                 case POLL_EPOLL_ET:
676                         /* epoll doesn't seem to automatically remove sockets,
677                          * if the socket is a duplicate/moved and the original
678                          * is still open. The fd is removed from the epoll set
679                          * only when the original (and all the  copies?) is/are
680                          * closed. This is probably a bug in epoll. --andrei */
681 #ifdef EPOLL_NO_CLOSE_BUG
682                         if (!(flags & IO_FD_CLOSING)){
683 #endif
684 again_epoll:
685                                 n=epoll_ctl(h->epfd, EPOLL_CTL_DEL, fd, &ep_event);
686                                 if (unlikely(n==-1)){
687                                         if (errno==EAGAIN) goto again_epoll;
688                                         LOG(L_ERR, "ERROR: io_watch_del: removing fd from epoll "
689                                                         "list failed: %s [%d]\n", strerror(errno), errno);
690                                         goto error;
691                                 }
692 #ifdef EPOLL_NO_CLOSE_BUG
693                         }
694 #endif
695                         break;
696 #endif
697 #ifdef HAVE_KQUEUE
698                 case POLL_KQUEUE:
699                         if (!(flags & IO_FD_CLOSING)){
700                                 if (likely(events & POLLIN)){
701                                         if (unlikely(kq_ev_change(h, fd, EVFILT_READ,
702                                                                                                         EV_DELETE, 0) ==-1)){
703                                                 /* try to delete the write filter anyway */
704                                                 if (events & POLLOUT){
705                                                         kq_ev_change(h, fd, EVFILT_WRITE, EV_DELETE, 0);
706                                                 }
707                                                 goto error;
708                                         }
709                                 }
710                                 if (unlikely(events & POLLOUT)){
711                                         if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE,
712                                                                                                         EV_DELETE, 0) ==-1))
713                                                 goto error;
714                                 }
715                         }
716                         break;
717 #endif
718 #ifdef HAVE_DEVPOLL
719                 case POLL_DEVPOLL:
720                                 /* for /dev/poll the closed fds _must_ be removed
721                                    (they are not removed automatically on close()) */
722                                 pfd.fd=fd;
723                                 pfd.events=POLLREMOVE;
724                                 pfd.revents=0;
725 again_devpoll:
726                                 if (write(h->dpoll_fd, &pfd, sizeof(pfd))==-1){
727                                         if (errno==EINTR) goto again_devpoll;
728                                         LOG(L_ERR, "ERROR: io_watch_del: removing fd from "
729                                                                 "/dev/poll failed: %s [%d]\n",
730                                                                 strerror(errno), errno);
731                                         goto error;
732                                 }
733                                 break;
734 #endif
735                 default:
736                         LOG(L_CRIT, "BUG: io_watch_del: no support for poll method "
737                                         " %s (%d)\n", poll_method_str[h->poll_method],
738                                         h->poll_method);
739                         goto error;
740         }
741         unhash_fd_map(e); /* only on success */
742         h->fd_no--;
743         return 0;
744 error:
745         return -1;
746 #undef fix_fd_array
747 }
748
749
750
751 /* parameters:    h - handler
752  *               fd - file descriptor
753  *           events - new events to watch for
754  *              idx - index in the fd_array if known, -1 if not
755  *                    (if index==-1 fd_array will be searched for the
756  *                     corresponding fd* entry -- slower but unavoidable in
757  *                     some cases). index is not used (no fd_array) for epoll,
758  *                     /dev/poll and kqueue
759  * returns 0 if ok, -1 on error */
760 inline static int io_watch_chg(io_wait_h* h, int fd, short events, int idx )
761 {
762         
763 #define fd_array_chg(ev) \
764         do{\
765                         if (unlikely(idx==-1)){ \
766                                 /* fix idx if -1 and needed */ \
767                                 for (idx=0; (idx<h->fd_no) && \
768                                                         (h->fd_array[idx].fd!=fd); idx++); \
769                         } \
770                         if (likely(idx<h->fd_no)){ \
771                                 h->fd_array[idx].events=(ev); \
772                         } \
773         }while(0)
774         
775         struct fd_map* e;
776         int add_events;
777         int del_events;
778 #ifdef HAVE_DEVPOLL
779         struct pollfd pfd;
780 #endif
781 #ifdef HAVE_EPOLL
782         int n;
783         struct epoll_event ep_event;
784 #endif
785         
786         if (unlikely((fd<0) || (fd>=h->max_fd_no))){
787                 LOG(L_CRIT, "BUG: io_watch_chg: invalid fd %d, not in [0, %d) \n",
788                                                 fd, h->fd_no);
789                 goto error;
790         }
791         if (unlikely((events&(POLLIN|POLLOUT))==0)){
792                 LOG(L_CRIT, "BUG: io_watch_chg: invalid events: 0x%0x\n", events);
793                 goto error;
794         }
795         DBG("DBG: io_watch_chg (%p, %d, 0x%x, 0x%x) fd_no=%d called\n",
796                         h, fd, events, idx, h->fd_no);
797         e=get_fd_map(h, fd);
798         /* more sanity checks */
799         if (unlikely(e==0)){
800                 LOG(L_CRIT, "BUG: io_watch_chg: no corresponding hash entry for %d\n",
801                                         fd);
802                 goto error;
803         }
804         if (unlikely(e->type==0 /*F_NONE*/)){
805                 LOG(L_ERR, "ERROR: io_watch_chg: trying to change an already erased"
806                                 " entry %d in the hash(%d, %d, %p) )\n",
807                                 fd, e->fd, e->type, e->data);
808                 goto error;
809         }
810         
811         add_events=events & ~e->events;
812         del_events=e->events & ~events;
813         switch(h->poll_method){
814                 case POLL_POLL:
815                         fd_array_chg(events
816 #ifdef POLLRDHUP
817                                                         /* listen to POLLRDHUP by default (if POLLIN) */
818                                                         | (((int)!(events & POLLIN) - 1) & POLLRDHUP)
819 #endif /* POLLRDHUP */
820                                                 );
821                         break;
822 #ifdef HAVE_SELECT
823                 case POLL_SELECT:
824                         fd_array_chg(events);
825                         if (unlikely(del_events & POLLIN))
826                                 FD_CLR(fd, &h->master_rset);
827                         else if (unlikely(add_events & POLLIN))
828                                 FD_SET(fd, &h->master_rset);
829                         if (likely(del_events & POLLOUT))
830                                 FD_CLR(fd, &h->master_wset);
831                         else if (likely(add_events & POLLOUT))
832                                 FD_SET(fd, &h->master_wset);
833                         break;
834 #endif
835 #ifdef HAVE_SIGIO_RT
836                 case POLL_SIGIO_RT:
837                         fd_array_chg(events);
838                         /* no need for check_io, since SIGIO_RT listens by default for all
839                          * the events */
840                         break;
841 #endif
842 #ifdef HAVE_EPOLL
843                 case POLL_EPOLL_LT:
844                                 ep_event.events=
845 #ifdef POLLRDHUP
846                                                 /* listen for EPOLLRDHUP too */
847                                                 ((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
848 #else /* POLLRDHUP */
849                                                 (EPOLLIN & ((int)!(events & POLLIN)-1) ) |
850 #endif /* POLLRDHUP */
851                                                 (EPOLLOUT & ((int)!(events & POLLOUT)-1) );
852                                 ep_event.data.ptr=e;
853 again_epoll_lt:
854                                 n=epoll_ctl(h->epfd, EPOLL_CTL_MOD, fd, &ep_event);
855                                 if (unlikely(n==-1)){
856                                         if (errno==EAGAIN) goto again_epoll_lt;
857                                         LOG(L_ERR, "ERROR: io_watch_chg: modifying epoll events"
858                                                         " failed: %s [%d]\n", strerror(errno), errno);
859                                         goto error;
860                                 }
861                         break;
862                 case POLL_EPOLL_ET:
863                                 ep_event.events=
864 #ifdef POLLRDHUP
865                                                 /* listen for EPOLLRDHUP too */
866                                                 ((EPOLLIN|EPOLLRDHUP) & ((int)!(events & POLLIN)-1) ) |
867 #else /* POLLRDHUP */
868                                                 (EPOLLIN & ((int)!(events & POLLIN)-1) ) |
869 #endif /* POLLRDHUP */
870                                                 (EPOLLOUT & ((int)!(events & POLLOUT)-1) ) |
871                                                 EPOLLET;
872                                 ep_event.data.ptr=e;
873 again_epoll_et:
874                                 n=epoll_ctl(h->epfd, EPOLL_CTL_MOD, fd, &ep_event);
875                                 if (unlikely(n==-1)){
876                                         if (errno==EAGAIN) goto again_epoll_et;
877                                         LOG(L_ERR, "ERROR: io_watch_chg: modifying epoll events"
878                                                         " failed: %s [%d]\n", strerror(errno), errno);
879                                         goto error;
880                                 }
881                         break;
882 #endif
883 #ifdef HAVE_KQUEUE
884                 case POLL_KQUEUE:
885                         if (unlikely(del_events & POLLIN)){
886                                 if (unlikely(kq_ev_change(h, fd, EVFILT_READ,
887                                                                                                                 EV_DELETE, 0) ==-1))
888                                                 goto error;
889                         }else if (unlikely(add_events & POLLIN)){
890                                 if (unlikely(kq_ev_change(h, fd, EVFILT_READ, EV_ADD, e) ==-1))
891                                         goto error;
892                         }
893                         if (likely(del_events & POLLOUT)){
894                                 if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE,
895                                                                                                                 EV_DELETE, 0) ==-1))
896                                                 goto error;
897                         }else if (likely(add_events & POLLOUT)){
898                                 if (unlikely(kq_ev_change(h, fd, EVFILT_WRITE, EV_ADD, e)==-1))
899                                         goto error;
900                         }
901                         break;
902 #endif
903 #ifdef HAVE_DEVPOLL
904                 case POLL_DEVPOLL:
905                                 /* for /dev/poll the closed fds _must_ be removed
906                                    (they are not removed automatically on close()) */
907                                 pfd.fd=fd;
908                                 pfd.events=POLLREMOVE;
909                                 pfd.revents=0;
910 again_devpoll1:
911                                 if (unlikely(write(h->dpoll_fd, &pfd, sizeof(pfd))==-1)){
912                                         if (errno==EINTR) goto again_devpoll1;
913                                         LOG(L_ERR, "ERROR: io_watch_chg: removing fd from "
914                                                                 "/dev/poll failed: %s [%d]\n",
915                                                                 strerror(errno), errno);
916                                         goto error;
917                                 }
918 again_devpoll2:
919                                 pfd.events=events;
920                                 pfd.revents=0;
921                                 if (unlikely(write(h->dpoll_fd, &pfd, sizeof(pfd))==-1)){
922                                         if (errno==EINTR) goto again_devpoll2;
923                                         LOG(L_ERR, "ERROR: io_watch_chg: re-adding fd to "
924                                                                 "/dev/poll failed: %s [%d]\n",
925                                                                 strerror(errno), errno);
926                                         /* error re-adding the fd => mark it as removed/unhash */
927                                         unhash_fd_map(e);
928                                         goto error;
929                                 }
930                                 break;
931 #endif
932                 default:
933                         LOG(L_CRIT, "BUG: io_watch_chg: no support for poll method "
934                                         " %s (%d)\n", poll_method_str[h->poll_method],
935                                         h->poll_method);
936                         goto error;
937         }
938         e->events=events; /* only on success */
939         return 0;
940 error:
941         return -1;
942 #undef fix_fd_array
943 }
944
945
946
/* io_wait_loop_x style function.
 * wait for io using poll()
 * params: h      - io_wait handle
 *         t      - timeout in s
 *         repeat - if !=0 handle_io will be called until it returns <=0
 * returns: number of IO events handled on success (can be 0), -1 on error
 */
inline static int io_wait_loop_poll(io_wait_h* h, int t, int repeat)
{
	int n, r;
	int ret;
	struct fd_map* fm;
	
again:
		ret=n=poll(h->fd_array, h->fd_no, t*1000);
		if (n==-1){
			if (errno==EINTR) goto again; /* signal, ignore it */
			else{
				LOG(L_ERR, "ERROR:io_wait_loop_poll: poll: %s [%d]\n",
						strerror(errno), errno);
				goto error;
			}
		}
		/* scan the fd array; n counts down the ready fds still to be
		 * dispatched so the loop can stop early once all were handled */
		for (r=0; (r<h->fd_no) && n; r++){
			fm=get_fd_map(h, h->fd_array[r].fd);
			/* POLLERR/POLLHUP are reported by poll() even when not
			 * requested, so they are always considered along with the
			 * events the fd is watched for */
			if (h->fd_array[r].revents & (fm->events|POLLERR|POLLHUP)){
				n--;
				/* sanity checks */
				if (unlikely((h->fd_array[r].fd >= h->max_fd_no)||
								(h->fd_array[r].fd < 0))){
					LOG(L_CRIT, "BUG: io_wait_loop_poll: bad fd %d "
							"(no in the 0 - %d range)\n",
							h->fd_array[r].fd, h->max_fd_no);
					/* try to continue anyway */
					h->fd_array[r].events=0; /* clear the events */
					continue;
				}
				/* publish the current index so io_watch_del() (possibly
				 * called from inside handle_io) can adjust it when it
				 * shifts the array */
				h->crt_fd_array_idx=r;
				/* repeat handle_io if repeat, fd still watched (not deleted
				 *  inside handle_io), handle_io returns that there's still
				 *  IO and the fd is still watched for the triggering event */
				while(fm->type &&
						(handle_io(fm, h->fd_array[r].revents, r) > 0) &&
						repeat && ((fm->events|POLLERR|POLLHUP) &
													h->fd_array[r].revents));
				r=h->crt_fd_array_idx; /* can change due to io_watch_del(fd)
										  array shifting */
			}
		}
error:
	return ret;
}
999
1000
1001
1002 #ifdef HAVE_SELECT
/* wait for io using select; same parameters/return convention as
 * io_wait_loop_poll (h - handle, t - timeout in s, repeat - keep calling
 * handle_io while it reports more io) */
inline static int io_wait_loop_select(io_wait_h* h, int t, int repeat)
{
	fd_set sel_rset;
	fd_set sel_wset;
	int n, ret;
	struct timeval timeout;
	int r;
	struct fd_map* fm;
	int revents;
	
again:
		/* select() modifies its fd sets => work on copies of the masters */
		sel_rset=h->master_rset;
		sel_wset=h->master_wset;
		timeout.tv_sec=t;
		timeout.tv_usec=0;
		ret=n=select(h->max_fd_select+1, &sel_rset, &sel_wset, 0, &timeout);
		if (n<0){
			if (errno==EINTR) goto again; /* just a signal */
			LOG(L_ERR, "ERROR: io_wait_loop_select: select: %s [%d]\n",
					strerror(errno), errno);
			/* n=0 => the dispatch loop below is skipped, but we still fall
			 * through and return ret (-1) instead of aborting */
			n=0;
			/* continue */
		}
		/* use poll fd array */
		for(r=0; (r<h->fd_no) && n; r++){
			/* translate the select() result into poll-style revents */
			revents=0;
			if (likely(FD_ISSET(h->fd_array[r].fd, &sel_rset)))
				revents|=POLLIN;
			if (unlikely(FD_ISSET(h->fd_array[r].fd, &sel_wset)))
				revents|=POLLOUT;
			if (unlikely(revents)){
				/* publish the current index so io_watch_del() (possibly
				 * called from inside handle_io) can adjust it when it
				 * shifts the array */
				h->crt_fd_array_idx=r;
				fm=get_fd_map(h, h->fd_array[r].fd);
				while(fm->type && (fm->events & revents) &&
						(handle_io(fm, revents, r)>0) && repeat);
				r=h->crt_fd_array_idx; /* can change due to io_watch_del(fd)
										  array shifting */
				n--;
			}
		};
	return ret;
}
1046 #endif
1047
1048
1049
1050 #ifdef HAVE_EPOLL
/* wait for io using epoll; same parameters/return convention as
 * io_wait_loop_poll (h - handle, t - timeout in s, repeat - keep calling
 * handle_io while it reports more io) */
inline static int io_wait_loop_epoll(io_wait_h* h, int t, int repeat)
{
	int n, r;
	struct fd_map* fm;
	int revents;
	
again:
		n=epoll_wait(h->epfd, h->ep_array, h->fd_no, t*1000);
		if (unlikely(n==-1)){
			if (errno==EINTR) goto again; /* signal, ignore it */
			else{
				LOG(L_ERR, "ERROR:io_wait_loop_epoll: "
						"epoll_wait(%d, %p, %d, %d): %s [%d]\n",
						h->epfd, h->ep_array, h->fd_no, t*1000,
						strerror(errno), errno);
				goto error;
			}
		}
#if 0
		if (n>1){
			for(r=0; r<n; r++){
				LOG(L_ERR, "WARNING: ep_array[%d]= %x, %p\n",
						r, h->ep_array[r].events, h->ep_array[r].data.ptr);
			}
		}
#endif
		for (r=0; r<n; r++){
			/* translate epoll events into poll-style revents;
			 * (x & (!cond -1)) is a branchless "keep x only when cond is
			 * true" mask (EPOLLPRI is folded into POLLIN) */
			revents= (POLLIN & (!(h->ep_array[r].events & (EPOLLIN|EPOLLPRI))
						-1)) |
					 (POLLOUT & (!(h->ep_array[r].events & EPOLLOUT)-1)) |
					 (POLLERR & (!(h->ep_array[r].events & EPOLLERR)-1)) |
					 (POLLHUP & (!(h->ep_array[r].events & EPOLLHUP)-1))
#ifdef POLLRDHUP
					| (POLLRDHUP & (!(h->ep_array[r].events & EPOLLRDHUP)-1))
#endif
					;
			if (likely(revents)){
				/* data.ptr was set to the fd_map entry in io_watch_add/chg */
				fm=(struct fd_map*)h->ep_array[r].data.ptr;
				/* re-run handle_io while the fd is still watched (fm->type
				 * !=0, i.e. not deleted inside handle_io), the event is
				 * still relevant and handle_io reports more io */
				while(fm->type && ((fm->events|POLLERR|POLLHUP) & revents) &&
						(handle_io(fm, revents, -1)>0) && repeat);
			}else{
				LOG(L_ERR, "ERROR:io_wait_loop_epoll: unexpected event %x"
							" on %d/%d, data=%p\n", h->ep_array[r].events,
							r+1, n, h->ep_array[r].data.ptr);
			}
		}
error:
	return n;
}
1100 #endif
1101
1102
1103
1104 #ifdef HAVE_KQUEUE
/* wait for io using kqueue; same parameters/return convention as
 * io_wait_loop_poll (h - handle, t - timeout in s, repeat - keep calling
 * handle_io while it reports more io).
 * Pending filter changes queued in h->kq_changes are applied in possibly
 * several kevent() calls: if some queued fds turn out to be bad (already
 * closed) kevent() can fail when there is no room in kq_array to report
 * all the errors, so the change list is applied in kq_array_size-sized
 * windows (orig_changes/apply_changes bookkeeping below). */
inline static int io_wait_loop_kqueue(io_wait_h* h, int t, int repeat)
{
	int n, r;
	struct timespec tspec;
	struct fd_map* fm;
	int orig_changes;
	int apply_changes;
	int revents;
	
	tspec.tv_sec=t;
	tspec.tv_nsec=0;
	orig_changes=h->kq_nchanges;
	apply_changes=orig_changes;
	do {
again:
		n=kevent(h->kq_fd, h->kq_changes, apply_changes,  h->kq_array,
					h->kq_array_size, &tspec);
		if (unlikely(n==-1)){
			if (unlikely(errno==EINTR)) goto again; /* signal, ignore it */
			else {
				/* for a detailed explanation of what follows see below
				   the EV_ERROR case */
				if (unlikely(!(errno==EBADF || errno==ENOENT)))
					BUG("io_wait_loop_kqueue: kevent: unexpected error"
						" %s [%d]\n", strerror(errno), errno);
				/* some of the FDs in kq_changes are bad (already closed)
				   and there is not enough space in kq_array to return all
				   of them back */
				apply_changes = h->kq_array_size;
				goto again;
			}
		}
		/* remove applied changes */
		h->kq_nchanges -= apply_changes;
		if (unlikely(apply_changes < orig_changes)) {
			/* not everything was applied yet: shift the remaining changes
			 * to the front and set up the next window */
			orig_changes -= apply_changes;
			memmove(&h->kq_changes[0], &h->kq_changes[apply_changes],
											sizeof(h->kq_changes[0])*h->kq_nchanges);
			apply_changes = (orig_changes < h->kq_array_size) ? orig_changes :
								h->kq_array_size;
		} else {
			orig_changes = 0;
			apply_changes = 0;
		}
		for (r=0; r<n; r++){
#ifdef EXTRA_DEBUG
			DBG("DBG: kqueue: event %d/%d: fd=%d, udata=%lx, flags=0x%x\n",
					r, n, h->kq_array[r].ident, (long)h->kq_array[r].udata,
					h->kq_array[r].flags);
#endif
			if (unlikely((h->kq_array[r].flags & EV_ERROR) ||
							 h->kq_array[r].udata == 0)){
				/* error in changes: we ignore it if it has to do with a
				   bad fd or update==0. It can be caused by trying to remove an
				   already closed fd: race between adding something to the
				   changes array, close() and applying the changes (EBADF).
				   E.g. for ser tcp: tcp_main sends a fd to child for reading
				    => deletes it from the watched fds => the changes array
					will contain an EV_DELETE for it. Before the changes
					are applied (they are at the end of the main io_wait loop,
					after all the fd events were processed), a CON_ERR sent
					to tcp_main by a sender (send fail) is processed and causes
					the fd to be closed. When the changes are applied =>
					error for the EV_DELETE attempt of a closed fd.
					Something similar can happen when a fd is scheduled
					for removal, is close()'ed before being removed and
					re-opened(a new sock. get the same fd). When the
					watched fd changes will be applied the fd will be valid
					(so no EBADF), but it's not already watch => ENOENT.
					We report a BUG for the other errors (there's nothing
					constructive we can do if we get an error we don't know
					how to handle), but apart from that we ignore it in the
					idea that it is better apply the rest of the changes,
					rather then dropping all of them.
				*/
				/*
					example EV_ERROR for trying to delete a read watched fd,
					that was already closed:
					{
						ident = 63,  [fd]
						filter = -1, [EVFILT_READ]
						flags = 16384, [EV_ERROR]
						fflags = 0,
						data = 9, [errno = EBADF]
						udata = 0x0
					}
				*/
				if (h->kq_array[r].data != EBADF &&
						h->kq_array[r].data != ENOENT)
					BUG("io_wait_loop_kqueue: kevent unexpected error on "
							"fd %ld udata %lx: %s [%ld]\n",
							(long)h->kq_array[r].ident,
							(long)h->kq_array[r].udata,
							strerror(h->kq_array[r].data),
							(long)h->kq_array[r].data);
			}else{
				/* udata was set to the fd_map entry in io_watch_add/chg */
				fm=(struct fd_map*)h->kq_array[r].udata;
				if (likely(h->kq_array[r].filter==EVFILT_READ)){
					/* build poll-style revents: EV_EOF => POLLHUP; EV_EOF
					 * with a non-0 fflags (pending socket error) => POLLERR;
					 * ((int)!cond -1) is a branchless conditional mask */
					revents=POLLIN |
						(((int)!(h->kq_array[r].flags & EV_EOF)-1)&POLLHUP) |
						(((int)!((h->kq_array[r].flags & EV_EOF) &&
									h->kq_array[r].fflags != 0) - 1)&POLLERR);
					while(fm->type && (fm->events & revents) &&
							(handle_io(fm, revents, -1)>0) && repeat);
				}else if (h->kq_array[r].filter==EVFILT_WRITE){
					revents=POLLOUT |
						(((int)!(h->kq_array[r].flags & EV_EOF)-1)&POLLHUP) |
						(((int)!((h->kq_array[r].flags & EV_EOF) &&
									h->kq_array[r].fflags != 0) - 1)&POLLERR);
					while(fm->type && (fm->events & revents) &&
							(handle_io(fm, revents, -1)>0) && repeat);
				}else{
					BUG("io_wait_loop_kqueue: unknown filter: kqueue: event "
							"%d/%d: fd=%d, filter=%d, flags=0x%x, fflags=0x%x,"
							" data=%lx, udata=%lx\n",
					r, n, h->kq_array[r].ident, h->kq_array[r].filter,
					h->kq_array[r].flags, h->kq_array[r].fflags,
					(long)h->kq_array[r].data, (long)h->kq_array[r].udata);
				}
			}
		}
	} while(unlikely(orig_changes));
	return n;
}
1229 #endif
1230
1231
1232
1233 #ifdef HAVE_SIGIO_RT
1234 /* sigio rt version has no repeat (it doesn't make sense)*/
/* sigio real-time signal based io_wait loop.
 * Waits up to t seconds for one queued h->signo rt signal carrying the
 * fd/event info in its siginfo, or a plain SIGIO which the kernel sends
 * when the rt signal queue overflowed (in that case it falls back to a
 * single poll() pass over all the watched fds).
 * params: h - initialized io_wait handler (signals must already be
 *             blocked in h->sset so sigtimedwait() can dequeue them)
 *         t - timeout in s
 * returns: number of events handled this call (normally 0 on timeout or
 *          1 per dequeued signal; on queue overflow, the poll fallback
 *          result), or -1 on error
 */
inline static int io_wait_loop_sigio_rt(io_wait_h* h, int t)
{
	int n;
	int ret;
	struct timespec ts;
	siginfo_t siginfo;
	int sigio_band;
	int sigio_fd;
	struct fd_map* fm;
	int revents;
#ifdef SIGINFO64_WORKARROUND
	int* pi; /* used to pick si_band/si_fd apart as two ints, see below */
#endif
	
	
	ret=1; /* 1 event per call normally */
	ts.tv_sec=t;
	ts.tv_nsec=0;
	/* sanity check: both the rt signal and SIGIO (the overflow
	 * notification) must be members of the wait set */
	if (unlikely(!sigismember(&h->sset, h->signo) ||
					!sigismember(&h->sset, SIGIO))) {
		LOG(L_CRIT, "BUG: io_wait_loop_sigio_rt: the signal mask"
				" is not properly set!\n");
		goto error;
	}
again:
	n=sigtimedwait(&h->sset, &siginfo, &ts);
	if (unlikely(n==-1)){
		if (errno==EINTR) goto again; /* some other signal, ignore it */
		else if (errno==EAGAIN){ /* timeout */
			ret=0;
			goto end;
		}else{
			LOG(L_ERR, "ERROR: io_wait_loop_sigio_rt: sigtimed_wait"
					" %s [%d]\n", strerror(errno), errno);
			goto error;
		}
	}
	/* n is the dequeued signal number; anything other than plain SIGIO
	 * is a normal per-fd rt signal event */
	if (likely(n!=SIGIO)){
#ifdef SIGINFO64_WORKARROUND
		/* on linux siginfo.si_band is defined as long in userspace
		 * and as int in kernel (< 2.6.5) => on 64 bits things will break!
		 * (si_band will include si_fd, and si_fd will contain
		 *  garbage).
		 *  see /usr/src/linux/include/asm-generic/siginfo.h and
		 *      /usr/include/bits/siginfo.h
		 *  On newer kernels this is fixed (si_band is long in the kernel too).
		 * -- andrei */
		if  ((_os_ver<0x020605) && (sizeof(siginfo.si_band)>sizeof(int))){
			/* reinterpret the long si_band as two packed ints:
			 * first the real band, then the fd the kernel stored after it */
			pi=(int*)(void*)&siginfo.si_band; /* avoid type punning warnings */
			sigio_band=*pi;
			sigio_fd=*(pi+1);
		}else
#endif
		{
			sigio_band=siginfo.si_band;
			sigio_fd=siginfo.si_fd;
		}
		if (unlikely(siginfo.si_code==SI_SIGIO)){
			/* old style, we don't know the event (linux 2.2.?) */
			LOG(L_WARN, "WARNING: io_wait_loop_sigio_rt: old style sigio"
					" interface\n");
			fm=get_fd_map(h, sigio_fd);
			/* we can have queued signals generated by fds not watched
			 * any more, or by fds in transition, to a child => ignore them*/
			if (fm->type)
				handle_io(fm, POLLIN|POLLOUT, -1);
		}else{
			/* si_code contains the SIGPOLL reason: POLL_IN, POLL_OUT,
			 *  POLL_MSG, POLL_ERR, POLL_PRI or POLL_HUP
			 * and si_band the translated poll event bitmap:
			 *  POLLIN|POLLRDNORM  (=POLL_IN),
			 *  POLLOUT|POLLWRNORM|POLLWRBAND (=POLL_OUT),
			 *  POLLIN|POLLRDNORM|POLLMSG (=POLL_MSG),
			 *  POLLERR (=POLL_ERR),
			 *  POLLPRI|POLLRDBAND (=POLL_PRI),
			 *  POLLHUP|POLLERR (=POLL_HUP)
			 *  [linux 2.6.22 fs/fcntl.c:447]
			 */
#ifdef EXTRA_DEBUG
			DBG("io_wait_loop_sigio_rt: siginfo: signal=%d (%d),"
					" si_code=%d, si_band=0x%x,"
					" si_fd=%d\n",
					siginfo.si_signo, n, siginfo.si_code,
					(unsigned)sigio_band,
					sigio_fd);
#endif
			/* on some errors (e.g. when receving TCP RST), sigio_band will
			 * be set to 0x08 (POLLERR) or 0x18 (POLLERR|POLLHUP - on stream
			 *  unix socket close) , so better catch all events --andrei */
			if (likely(sigio_band)){
				fm=get_fd_map(h, sigio_fd);
				revents=sigio_band;
				/* fix revents==POLLPRI case: branchless "if POLLPRI is
				 * set, also set POLLIN" (POLLPRI set => the mask below is
				 * all-ones, otherwise it is 0) */
				revents |= (!(revents & POLLPRI)-1) & POLLIN;
				/* we can have queued signals generated by fds not watched
				 * any more, or by fds in transition, to a child
				 * => ignore them; errors/hangups are always dispatched */
				if (fm->type && ((fm->events|POLLERR|POLLHUP) & revents))
					handle_io(fm, revents, -1);
				else
					DBG("WARNING: io_wait_loop_sigio_rt: ignoring event"
							" %x on fd %d, watching for %x, si_code=%x "
							"(fm->type=%d, fm->fd=%d, fm->data=%p)\n",
							sigio_band, sigio_fd, fm->events, siginfo.si_code,
							fm->type, fm->fd, fm->data);
			}else{
				LOG(L_ERR, "ERROR: io_wait_loop_sigio_rt: unexpected event"
							" on fd %d: %x\n", sigio_fd, sigio_band);
			}
		}
	}else{
		/* signal queue overflow
		 * TODO: increase signal queue size: 2.4x /proc/.., 2.6x -rlimits */
		LOG(L_WARN, "WARNING: io_wait_loop_sigio_rt: signal queue overflowed"
					"- falling back to poll\n");
		/* clear real-time signal queue
		 * both SIG_IGN and SIG_DFL are needed , it doesn't work
		 * only with SIG_DFL  */
		if (signal(h->signo, SIG_IGN)==SIG_ERR){
			LOG(L_CRIT, "BUG: do_poll: couldn't reset signal to IGN\n");
		}
		
		if (signal(h->signo, SIG_DFL)==SIG_ERR){
			LOG(L_CRIT, "BUG: do_poll: couldn't reset signal to DFL\n");
		}
		/* falling back to normal poll */
		ret=io_wait_loop_poll(h, -1, 1);
	}
end:
	return ret;
error:
	return -1;
}
1368 #endif
1369
1370
1371
1372 #ifdef HAVE_DEVPOLL
1373 inline static int io_wait_loop_devpoll(io_wait_h* h, int t, int repeat)
1374 {
1375         int n, r;
1376         int ret;
1377         struct dvpoll dpoll;
1378         struct fd_map* fm;
1379
1380                 dpoll.dp_timeout=t*1000;
1381                 dpoll.dp_nfds=h->fd_no;
1382                 dpoll.dp_fds=h->fd_array;
1383 again:
1384                 ret=n=ioctl(h->dpoll_fd, DP_POLL, &dpoll);
1385                 if (unlikely(n==-1)){
1386                         if (errno==EINTR) goto again; /* signal, ignore it */
1387                         else{
1388                                 LOG(L_ERR, "ERROR:io_wait_loop_devpoll: ioctl: %s [%d]\n",
1389                                                 strerror(errno), errno);
1390                                 goto error;
1391                         }
1392                 }
1393                 for (r=0; r< n; r++){
1394                         if (h->fd_array[r].revents & (POLLNVAL|POLLERR)){
1395                                 LOG(L_ERR, "ERROR: io_wait_loop_devpoll: pollinval returned"
1396                                                         " for fd %d, revents=%x\n",
1397                                                         h->fd_array[r].fd, h->fd_array[r].revents);
1398                         }
1399                         /* POLLIN|POLLHUP just go through */
1400                         fm=get_fd_map(h, h->fd_array[r].fd);
1401                         while(fm->type && (fm->events & h->fd_array[r].revents) &&
1402                                         (handle_io(fm, h->fd_array[r].revents, r) > 0) && repeat);
1403                 }
1404 error:
1405         return ret;
1406 }
1407 #endif
1408
1409
1410
1411 /* init */
1412
1413
/* initializes the static vars/arrays
 * params:           h - pointer to the io_wait_h that will be initialized
 *              max_fd - maximum allowed fd number
 *         poll_method - poll method (0 for automatic best fit)
 * returns: 0 on success, -1 on error
 */
1419 int init_io_wait(io_wait_h* h, int max_fd, enum poll_types poll_method);
1420
1421 /* destroys everything init_io_wait allocated */
1422 void destroy_io_wait(io_wait_h* h);
1423
1424
1425 #endif