- tcp major changes part 1: support for >1024 connection, better io poll model
authorAndrei Pelinescu-Onciul <andrei@iptel.org>
Thu, 16 Jun 2005 14:05:24 +0000 (14:05 +0000)
committerAndrei Pelinescu-Onciul <andrei@iptel.org>
Thu, 16 Jun 2005 14:05:24 +0000 (14:05 +0000)
 (best poll method selected automatically, there is also an option to enforce
  it). So far support for epoll (linux >= 2.5.66), sigio + real time signals
   (linux), poll , select. kqueue (*bsd) and /dev/poll (solaris) comming soon.
WARNING: this is still work in progress, the tcp reader part is still not
 converted to he new model (this means that while the tcp_main process supports
  > 1024 fds, the tcp childs don't), the main reason for leaving this out for
 now is debugging.
 Still to do: config options for poll_method (for now use -W method if you
  want to force one), config options for tcp timeouts a.s.o.

14 files changed:
Makefile.defs
fifo_server.c
globals.h
io_wait.c [new file with mode: 0644]
io_wait.h [new file with mode: 0644]
main.c
parser/msg_parser.h
pass_fd.c
pass_fd.h
poll_types.h [new file with mode: 0644]
tcp_conn.h
tcp_main.c
tcp_read.c
unixsock_server.c

index d66ab20..2ec8590 100644 (file)
@@ -55,7 +55,7 @@ MAIN_NAME=ser
 VERSION = 0
 PATCHLEVEL = 10
 SUBLEVEL =   99
-EXTRAVERSION = -dev7
+EXTRAVERSION = -dev8-new_tcp
 
 RELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 OS = $(shell uname -s | sed -e s/SunOS/solaris/ | tr "[A-Z]" "[a-z]")
@@ -913,6 +913,18 @@ ifeq ($(OS), linux)
                DEFS+= -DUSE_SYSV_SEM  # try posix sems
                found_lock_method=yes
        endif
+       # check for 2.6
+       ifneq ($(shell echo "$(OSREL)"|grep "^2\.6\."),)
+               ifeq ($(NO_EPOLL),)
+                       DEFS+=-DHAVE_EPOLL
+               endif
+       endif
+       ifeq ($(NO_SIGIO),)
+               DEFS+=-DHAVE_SIGIO_RT
+       endif
+       ifeq ($(NO_SELECT),)
+               DEFS+=-DHAVE_SELECT
+       endif
 endif
 
 ifeq  ($(OS), solaris)
@@ -922,6 +934,9 @@ ifeq  ($(OS), solaris)
                DEFS+= -DUSE_PTHREAD_MUTEX  # try pthread sems
                found_lock_method=yes
        endif
+       ifeq ($(NO_SELECT),)
+               DEFS+=-DHAVE_SELECT
+       endif
        ifeq ($(mode), release)
                #use these only if you're using gcc with Solaris ld
                #LDFLAGS=-O2 $(PROFILE)
@@ -956,6 +971,9 @@ ifeq ($(OS), freebsd)
        else
                LIBS= -lfl  #dlopen is in libc
        endif
+       ifeq ($(NO_SELECT),)
+               DEFS+=-DHAVE_SELECT
+       endif
        YACC=yacc
 endif
 
@@ -966,6 +984,9 @@ ifeq ($(OS), openbsd)
                DEFS+= -DUSE_PTHREAD_MUTEX  # try pthread sems
                found_lock_method=yes
        endif
+       ifeq ($(NO_SELECT),)
+               DEFS+=-DHAVE_SELECT
+       endif
        # (symbols on openbsd are prefixed by "_")
        YACC=yacc
        # no sched_yield on openbsd unless linking with c_r (not recommended)
@@ -995,6 +1016,9 @@ ifeq ($(OS), netbsd)
                DEFS+= -DUSE_SYSV_SEM  # try pthread sems
                found_lock_method=yes
        endif
+       ifeq ($(NO_SELECT),)
+               DEFS+=-DHAVE_SELECT
+       endif
        YACC=yacc
        LIBS= -lfl 
 endif
@@ -1014,6 +1038,9 @@ ifeq ($(OS), darwin)
        else
                LIBS= -lfl -lresolv  #dlopen is in libc
        endif
+       ifeq ($(NO_SELECT),)
+               DEFS+=-DHAVE_SELECT
+       endif
        LDFLAGS=        # darwin doesn't like -O2 or -E
        MOD_LDFLAGS= -bundle -bundle_loader ../../$(MAIN_NAME)
        YACC=yacc
index ebef493..3117435 100644 (file)
@@ -672,6 +672,9 @@ int start_fifo_server()
                return -1;
        }
        if (fifo_pid==0) { /* child == FIFO server */
+               /* record pid twice to avoid the child using it, before
+                * parent gets a chance to set it*/
+               pt[process_no].pid=getpid();
                LOG(L_INFO, "INFO: fifo process starting: %d\n", getpid());
                /* call per-child module initialization too -- some
                   FIFO commands may need it
index bb18f7d..035b99d 100644 (file)
--- a/globals.h
+++ b/globals.h
@@ -36,6 +36,7 @@
 #include "types.h"
 #include "ip_addr.h"
 #include "str.h"
+#include "poll_types.h"
 
 #define NO_DNS     0
 #define DO_DNS     1
@@ -80,6 +81,8 @@ extern int tcp_disable;
 extern int tcp_accept_aliases;
 extern int tcp_connect_timeout;
 extern int tcp_send_timeout;
+extern enum poll_types tcp_poll_method;
+extern int tcp_max_fd_no;
 #endif
 #ifdef USE_TLS
 extern int tls_disable;
@@ -119,9 +122,9 @@ extern unsigned int msg_no;
 extern unsigned long shm_mem_size;
 
 /* FIFO server config */
-char extern *fifo; /* FIFO name */
+extern char *fifo; /* FIFO name */
 extern int fifo_mode;
-char extern *fifo_dir; /* dir. where  reply fifos are allowed */
+extern char *fifo_dir; /* dir. where  reply fifos are allowed */
 extern char *fifo_db_url;  /* db url used by db_fifo interface */
 
 /* UNIX domain socket configuration */
diff --git a/io_wait.c b/io_wait.c
new file mode 100644 (file)
index 0000000..c14c991
--- /dev/null
+++ b/io_wait.c
@@ -0,0 +1,492 @@
+/* 
+ * $Id$
+ * 
+ * Copyright (C) 2005 iptelorg GmbH
+ *
+ * This file is part of ser, a free SIP server.
+ *
+ * ser is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version
+ *
+ * For a license to use the ser software under conditions
+ * other than those described here, or to purchase support for this
+ * software, please contact iptel.org by e-mail at the following addresses:
+ *    info@iptel.org
+ *
+ * ser is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/* 
+ * tcp io wait common stuff used by tcp_main.c & tcp_read.c
+ * (see io_wait.h)
+ */
+/* 
+ * History:
+ * --------
+ *  2005-06-15  created by andrei
+ */
+
+
+
+#ifdef USE_TCP /* for now it make sense only with tcp */
+
+#ifdef HAVE_EPOLL
+#include <unistd.h> /* close() */
+#endif
+
+#include <sys/utsname.h> /* uname() */
+#include <stdlib.h> /* strtol() */
+#include "io_wait.h"
+
+
+#include "mem/mem.h"
+
+#ifndef local_malloc
+#define local_malloc pkg_malloc
+#endif
+#ifndef local_free
+#define local_free pkg_free
+#endif
+
+char* poll_support="poll"
+#ifdef HAVE_EPOLL
+", epoll_lt, epoll_et"
+#endif
+#ifdef HAVE_SIGIO_RT
+", sigio_rt"
+#endif
+#ifdef HAVE_SELECT
+", select"
+#endif
+#ifdef HAVE_KQUEUE
+", kqueue"
+#endif
+#ifdef HAVE_DEVPOLL
+", /dev/poll"
+#endif
+;
+
+
+char* poll_method_str[POLL_END]={ "none", "poll", "epoll_lt", "epoll_et", 
+                                                                 "sigio_rt", "select", "kqueue",  "/dev/poll"
+                                                               };
+
+#ifdef HAVE_SIGIO_RT
+static int _sigio_init=0;
+static int _sigio_crt_rtsig;
+static sigset_t _sigio_rtsig_used;
+#endif
+
+
+
+#ifdef HAVE_SIGIO_RT
+/* sigio specific init
+ * returns -1 on error, 0 on success */
+static int init_sigio(io_wait_h* h, int rsig)
+{
+       int r;
+       int n;
+       int signo;
+       int start_sig;
+       sigset_t oldset;
+       
+       if (!_sigio_init){
+               _sigio_init=1;
+               _sigio_crt_rtsig=SIGRTMIN;
+               sigemptyset(&_sigio_rtsig_used);
+       }
+       h->signo=0;
+       
+       if (rsig==0){
+               start_sig=_sigio_crt_rtsig;
+               n=SIGRTMAX-SIGRTMIN;
+       }else{
+               if ((rsig < SIGRTMIN) || (rsig >SIGRTMAX)){
+                       LOG(L_CRIT, "ERROR: init_sigio: real time signal %d out of"
+                                                 " range  [%d, %d]\n", rsig, SIGRTMIN, SIGRTMAX);
+                       goto error;
+               }
+               start_sig=rsig;
+               n=0;
+       }
+       
+       sigemptyset(&h->sset);
+       sigemptyset(&oldset);
+retry1:
+       /* get current block mask */
+       if (sigprocmask(SIG_BLOCK, &h->sset, &oldset )==-1){
+               if (errno==EINTR) goto retry1;
+               LOG(L_ERR, "ERROR: init_sigio: 1st sigprocmask failed: %s [%d]\n",
+                               strerror(errno), errno);
+               /* try to continue */
+       }
+       
+       for (r=start_sig; r<=(n+start_sig); r++){
+               signo=(r>SIGRTMAX)?r-SIGRTMAX+SIGRTMIN:r;
+               if (! sigismember(&_sigio_rtsig_used, signo) &&
+                       ! sigismember(&oldset, signo)){
+                       sigaddset(&_sigio_rtsig_used, signo);
+                       h->signo=signo;
+                       _sigio_crt_rtsig=(signo<SIGRTMAX)?signo+1:SIGRTMIN;
+                       break;
+               }
+       }
+       
+       if (h->signo==0){
+                       LOG(L_CRIT, "ERROR: init_sigio: %s\n",
+                                       rsig?"could not assign requested real-time signal":
+                                                "out of real-time signals");
+                       goto error;
+       }
+
+       DBG("init_sigio: trying signal %d... \n", h->signo);
+       
+       if (sigaddset(&h->sset, h->signo)==-1){
+               LOG(L_ERR, "ERROR: init_sigio: sigaddset failed for %d: %s [%d]\n",
+                               h->signo, strerror(errno), errno);
+               goto error;
+       }
+       if (sigaddset(&h->sset, SIGIO)==-1){
+               LOG(L_ERR, "ERROR: init_sigio: sigaddset failed for %d: %s [%d]\n",
+                               SIGIO, strerror(errno), errno);
+               goto error;
+       }
+retry:
+       if (sigprocmask(SIG_BLOCK, &h->sset, 0)==-1){
+               if (errno==EINTR) goto retry;
+               LOG(L_ERR, "ERROR: init_sigio: sigprocmask failed: %s [%d]\n",
+                               strerror(errno), errno);
+               goto error;
+       }
+       return 0;
+error:
+       h->signo=0;
+       sigemptyset(&h->sset);
+       return -1;
+}
+
+
+
+/* sigio specific destroy */
+static void destroy_sigio(io_wait_h* h)
+{
+       if (h->signo){
+               sigprocmask(SIG_UNBLOCK, &h->sset, 0);
+               sigemptyset(&h->sset);
+               sigdelset(&_sigio_rtsig_used, h->signo);
+               h->signo=0;
+       }
+}
+#endif
+
+
+
+#ifdef HAVE_EPOLL
+/* epoll specific init
+ * returns -1 on error, 0 on success */
+static int init_epoll(io_wait_h* h)
+{
+       h->epfd=epoll_create(h->max_fd_no);
+       if (h->epfd==-1){
+               LOG(L_ERR, "ERROR: init_epoll: epoll_create: %s [%d]\n",
+                               strerror(errno), errno);
+               return -1;
+       }
+       return 0;
+}
+
+
+
+static void destroy_epoll(io_wait_h* h)
+{
+       if (h->epfd!=-1){
+               close(h->epfd);
+               h->epfd=-1;
+       }
+}
+#endif
+
+
+
+#ifdef HAVE_SELECT
+static int init_select(io_wait_h* h)
+{
+       FD_ZERO(&h->master_set);
+       return 0;
+}
+#endif
+
+
+
+/* return system version (major.minor.minor2) as
+ *  (major<<16)|(minor)<<8|(minor2)
+ * (if some of them are missing, they are set to 0)
+ * if the parameters are not null they are set to the coresp. part 
+ */
+static unsigned int get_sys_version(int* major, int* minor, int* minor2)
+{
+       struct utsname un;
+       int m1;
+       int m2;
+       int m3;
+       char* p;
+       
+       memset (&un, 0, sizeof(un));
+       m1=m2=m3=0;
+       /* get sys version */
+       uname(&un);
+       m1=strtol(un.release, &p, 10);
+       if (*p=='.'){
+               p++;
+               m2=strtol(p, &p, 10);
+               if (*p=='.'){
+                       p++;
+                       m3=strtol(p, &p, 10);
+               }
+       }
+       if (major) *major=m1;
+       if (minor) *minor=m2;
+       if (minor2) *minor2=m3;
+       return ((m1<<16)|(m2<<8)|(m3));
+}
+
+
+
+/*
+ * returns 0 on success, and an error message on error
+ */
+char* check_poll_method(enum poll_types poll_method)
+{
+       char* ret;
+       ret=0;
+       
+       switch(poll_method){
+               case POLL_NONE:
+                       break;
+               case POLL_POLL:
+                       /* always supported */
+                       break;
+               case POLL_SELECT:
+                       /* should be always supported */
+#ifndef HAVE_SELECT
+                       ret="select not supported, try re-compiling with -DHAVE_SELECT";
+#endif
+                       break;
+               case POLL_EPOLL_LT:
+               case POLL_EPOLL_ET:
+                       /* only on 2.6 + */
+#ifndef HAVE_EPOLL
+                       ret="epoll not supported, try re-compiling with -DHAVE_EPOLL";
+#else
+                       if (get_sys_version(0,0,0)<0x020542) /* if ver < 2.5.66 */
+                               ret="epoll not supported on kernels < 2.6";
+#endif
+                       break;
+               case POLL_SIGIO_RT:
+#ifndef HAVE_SIGIO_RT
+                       ret="sigio_rt not supported, try re-compiling with"
+                               " -DHAVE_SIGIO_RT";
+#endif
+                       break;
+               default:
+                       ret="unknown not supported method";
+       }
+       return ret;
+}
+
+
+
+enum poll_types choose_poll_method()
+{
+       enum poll_types poll_method;
+       
+       poll_method=0;
+#ifdef HAVE_EPOLL
+       if (get_sys_version(0,0,0)>=0x020542) /* if ver >= 2.5.66 */
+               poll_method=POLL_EPOLL_LT; /* or POLL_EPOLL_ET */
+               
+#endif
+#ifdef  HAVE_SIGIO_RT
+               if (poll_method==0) poll_method=POLL_SIGIO_RT;
+#endif
+               if (poll_method==0) poll_method=POLL_POLL;
+       return poll_method;
+}
+
+
+
+char* poll_method_name(enum poll_types poll_method)
+{
+       if ((poll_method>=POLL_NONE) && (poll_method<POLL_END))
+               return poll_method_str[poll_method];
+       else
+               return "invalid poll method";
+}
+
+
+
+
+/* converts a string into a poll_method
+ * returns POLL_NONE (0) on error, else the corresponding poll type */
+enum poll_types get_poll_type(char* s)
+{
+       int r;
+       int l;
+       
+       l=strlen(s);
+       for (r=POLL_END-1; r>POLL_NONE; r--)
+               if ((strlen(poll_method_str[r])==l) &&
+                       (strncasecmp(poll_method_str[r], s, l)==0))
+                       break;
+       return r; 
+}
+
+
+
+/* initializes the static vars/arrays
+ * params:      h - pointer to the io_wait_h that will be initialized
+ *         max_fd - maximum allowed fd number
+ *         poll_m - poll method (0 for automatic best fit)
+ */
+int init_io_wait(io_wait_h* h, int max_fd, enum poll_types poll_method)
+{
+       char * poll_err;
+       
+       memset(h, 0, sizeof(*h));
+       h->max_fd_no=max_fd;
+#ifdef HAVE_EPOLL
+       h->epfd=-1;
+#endif
+       
+       poll_err=check_poll_method(poll_method);
+       
+       /* set an appropiate poll method */
+       if (poll_err || (poll_method==0)){
+               poll_method=choose_poll_method();
+               if (poll_err){
+                       LOG(L_ERR, "ERROR: init_io_wait: %s, using %s instead\n",
+                                       poll_err, poll_method_str[poll_method]);
+               }else{
+                       LOG(L_INFO, "init_io_wait: using %s as the io watch method"
+                                       " (auto detected)\n", poll_method_str[poll_method]);
+               }
+       }else{
+                       LOG(L_INFO, "init_io_wait: using %s io watch method (forced)\n",
+                                       poll_method_str[poll_method]);
+       }
+
+       
+       h->poll_method=poll_method;
+       
+       /* common stuff, evrybody has fd_hash */
+       h->fd_hash=local_malloc(sizeof(*(h->fd_hash))*h->max_fd_no);
+       if (h->fd_hash==0){
+               LOG(L_CRIT, "ERROR: init_io_wait: could not alloc"
+                                       " fd hashtable (%d bytes)\n",
+                                       sizeof(*(h->fd_hash))*h->max_fd_no );
+               goto error;
+       }
+       memset((void*)h->fd_hash, 0, sizeof(*(h->fd_hash))*h->max_fd_no);
+       
+       switch(poll_method){
+               case POLL_POLL:
+#ifdef HAVE_SELECT
+               case POLL_SELECT:
+#endif
+#ifdef HAVE_SIGIO_RT
+               case POLL_SIGIO_RT:
+#endif
+                       h->fd_array=local_malloc(sizeof(*(h->fd_array))*h->max_fd_no);
+                       if (h->fd_array==0){
+                               LOG(L_CRIT, "ERROR: init_io_wait: could not"
+                                                       " alloc fd array (%d bytes)\n",
+                                                       sizeof(*(h->fd_hash))*h->max_fd_no);
+                               goto error;
+                       }
+                       memset((void*)h->fd_array, 0, sizeof(*(h->fd_array))*h->max_fd_no);
+#ifdef HAVE_SIGIO_RT
+                       if ((poll_method==POLL_SIGIO_RT) && (init_sigio(h, 0)<0)){
+                               LOG(L_CRIT, "ERROR: init_io_wait: sigio init failed\n");
+                               goto error;
+                       }
+#endif
+#ifdef HAVE_SELECT
+                       if ((poll_method==POLL_SELECT) && (init_select(h)<0)){
+                               LOG(L_CRIT, "ERROR: init_io_wait: select init failed\n");
+                               goto error;
+                       }
+#endif
+                       
+                       break;
+#ifdef HAVE_EPOLL
+               case POLL_EPOLL_LT:
+               case POLL_EPOLL_ET:
+                       h->ep_array=local_malloc(sizeof(*(h->ep_array))*h->max_fd_no);
+                       if (h->ep_array==0){
+                               LOG(L_CRIT, "ERROR: init_io_wait: could not alloc"
+                                                       " epoll array\n");
+                               goto error;
+                       }
+                       memset((void*)h->ep_array, 0, sizeof(*(h->ep_array))*h->max_fd_no);
+                       if (init_epoll(h)<0){
+                               LOG(L_CRIT, "ERROR: init_io_wait: epoll init failed\n");
+                               goto error;
+                       }
+                       break;
+#endif
+               default:
+                       LOG(L_CRIT, "BUG: init_io_wait: unknown/unsupported poll"
+                                               " method %s (%d)\n",
+                                               poll_method_str[poll_method], poll_method);
+                       goto error;
+       }
+       return 0;
+error:
+       return -1;
+}
+
+
+
+/* destroys everything init_io_wait allocated */
+void destroy_io_wait(io_wait_h* h)
+{
+       switch(h->poll_method){
+#ifdef HAVE_EPOLL
+               case POLL_EPOLL_LT:
+               case POLL_EPOLL_ET:
+                       destroy_epoll(h);
+                       if (h->ep_array){
+                               local_free(h->ep_array);
+                               h->ep_array=0;
+                       }
+               break;
+#endif
+#ifdef HAVE_SIGIO_RT
+               case POLL_SIGIO_RT:
+                       destroy_sigio(h);
+                       break;
+#endif
+               default: /*do  nothing*/
+                       ;
+       }
+               if (h->fd_array){
+                       local_free(h->fd_array);
+                       h->fd_array=0;
+               }
+               if (h->fd_hash){
+                       local_free(h->fd_hash);
+                       h->fd_hash=0;
+               }
+}
+
+
+
+#endif
diff --git a/io_wait.h b/io_wait.h
new file mode 100644 (file)
index 0000000..6e5e586
--- /dev/null
+++ b/io_wait.h
@@ -0,0 +1,663 @@
+/* 
+ * $Id$
+ * 
+ * Copyright (C) 2005 iptelorg GmbH
+ *
+ * This file is part of ser, a free SIP server.
+ *
+ * ser is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version
+ *
+ * For a license to use the ser software under conditions
+ * other than those described here, or to purchase support for this
+ * software, please contact iptel.org by e-mail at the following addresses:
+ *    info@iptel.org
+ *
+ * ser is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * tcp io wait common stuff used by tcp_main.c & tcp_read.c
+ * All the functions are inline because of speed reasons and because they are
+ * used only from 2 places.
+ * You also have to define:
+ *     int handle_io(struct fd_map* fm, int idx) (see below)
+ *     (this could be trivially replaced by a callback pointer entry attached
+ *      to the io_wait handler if more flexibility rather then performance
+ *      is needed)
+ *      fd_type - define to some enum of you choice and define also
+ *                FD_TYPE_DEFINED (if you don't do it fd_type will be defined
+ *                to int). 0 has a special not set/not init. meaning
+ *                (a lot of sanity checks and the sigio_rt code are based on
+ *                 this assumption)
+ *     local_malloc (defaults to pkg_malloc)
+ *     local_free   (defaults to pkg_free)
+ *  
+ */
+/* 
+ * History:
+ * --------
+ *  2005-06-13  created by andrei
+ */
+
+
+
+#ifndef _io_wait_h
+#define _io_wait_h
+
+#include <errno.h>
+#include <string.h>
+#ifdef HAVE_SIGIO_RT
+#define __USE_GNU /* or else F_SETSIG won't be included */
+#include <sys/types.h> /* recv */
+#include <sys/socket.h> /* recv */
+#include <signal.h> /* sigprocmask, sigwait a.s.o */
+#endif
+#ifdef HAVE_EPOLL
+#include <sys/epoll.h>
+#endif
+#ifdef HAVE_SELECT
+#include <sys/select.h>
+#endif
+#include <sys/poll.h>
+#include <fcntl.h>
+
+#include "dprint.h"
+
+#include "poll_types.h" /* poll_types*/
+#ifdef HAVE_SIGIO_RT
+#include "pt.h" /* mypid() */
+#endif
+
+
+#if 0
+enum fd_types; /* this should be defined from the including file,
+                                 see tcp_main.c for an example, 
+                                 0 has a special meaning: not used/empty*/
+#endif
+
+#ifndef FD_TYPE_DEFINED
+typedef int fd_type;
+#define FD_TYPE_DEFINED
+#endif
+
+/* maps a fd to some other structure; used in almost all cases
+ * except epoll and maybe kqueue or /dev/poll */
+struct fd_map{
+       int fd;               /* fd no */
+       fd_type type;         /* "data" type */
+       void* data;           /* pointer to the corresponding structure */
+};
+
+
+
+/* handler structure */
+struct io_wait_handler{
+#ifdef HAVE_EPOLL
+       struct epoll_event* ep_array;
+       int epfd; /* epoll ctrl fd */
+#endif
+#ifdef HAVE_SIGIO_RT
+       sigset_t sset; /* signal mask for sigio & sigrtmin */
+       int signo;     /* real time signal used */
+#endif
+#ifdef HAVE_SELECT
+       fd_set master_set;
+       int max_fd_select; /* maximum select used fd */
+#endif
+       /* common stuff for POLL, SIGIO_RT and SELECT
+        * since poll support is always compiled => this will always be compiled */
+       struct fd_map* fd_hash;
+       struct pollfd* fd_array;
+       int fd_no; /*  current index used in fd_array */
+       int max_fd_no; /* maximum fd no, is also the size of fd_array,
+                                                      fd_hash  and ep_array*/
+       enum poll_types poll_method;
+       int flags;
+};
+
+typedef struct io_wait_handler io_wait_h;
+
+
+/* get the corresponding fd_map structure pointer */
+#define get_fd_map(h, fd)              (&(h)->fd_hash[(fd)])
+/* remove a fd_map structure from the hash; the pointer must be returned
+ * by get_fd_map or hash_fd_map*/
+#define unhash_fd_map(pfm)     \
+       do{ \
+               (pfm)->type=0 /*F_NONE */; \
+               (pfm)->fd=-1; \
+       }while(0)
+
+/* add a fd_map structure to the fd hash */
+static inline struct fd_map* hash_fd_map(      io_wait_h* h,
+                                                                                       int fd,
+                                                                                       fd_type type,
+                                                                                       void* data)
+{
+       h->fd_hash[fd].fd=fd;
+       h->fd_hash[fd].type=type;
+       h->fd_hash[fd].data=data;
+       return &h->fd_hash[fd];
+}
+
+
+#ifdef HAVE_SIGIO_RT
+typedef unsigned int sigio_rtsig_mask_t;
+extern sigset_t _sigio_rtsig_used;
+extern int _sigio_crt_rtsig;
+extern int _sigio_init;
+#endif
+
+
+
+#ifdef HANDLE_IO_INLINE
+/* generic handle io routine, this must be defined in the including file
+ * (faster then registering a callback pointer)
+ *
+ * params:  fm  - pointer to a fd hash entry
+ *          idx - index in the fd_array (or -1 if not known)
+ * return: -1 on error
+ *          0 on EAGAIN or when by some other way it is known that no more 
+ *            io events are queued on the fd (the receive buffer is empty).
+ *            Usefull to detect when there are no more io events queued for
+ *            sigio_rt, epoll_et, kqueue.
+ *         >0 on successfull read from the fd (when there might be more io
+ *            queued -- the receive buffer might still be non-empty)
+ */
+inline static int handle_io(struct fd_map* fm, int idx);
+#else
+int handle_io(struct fd_map* fm, int idx);
+#endif
+
+
+
+/* generic io_watch_add function
+ * returns 0 on success, -1 on error
+ *
+ * this version should be faster than pointers to poll_method specific
+ * functions (it avoids functions calls, the overhead being only an extra
+ *  switch())*/
+inline static int io_watch_add(        io_wait_h* h,
+                                                               int fd,
+                                                               fd_type type,
+                                                               void* data)
+{
+
+       /* helper macros */
+#define fd_array_setup \
+       do{ \
+               h->fd_array[h->fd_no].fd=fd; \
+               h->fd_array[h->fd_no].events=POLLIN; /* useless for select */ \
+               h->fd_array[h->fd_no].revents=0;     /* useless for select */ \
+       }while(0)
+       
+#define set_fd_flags(f) \
+       do{ \
+                       flags=fcntl(fd, F_GETFL); \
+                       if (flags==-1){ \
+                               LOG(L_ERR, "ERROR: io_watch_add: fnctl: GETFL failed:" \
+                                               " %s [%d]\n", strerror(errno), errno); \
+                               goto error; \
+                       } \
+                       if (fcntl(fd, F_SETFL, flags|(f))==-1){ \
+                               LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETFL" \
+                                                       " failed: %s [%d]\n", strerror(errno), errno); \
+                               goto error; \
+                       } \
+       }while(0)
+       
+       
+       struct fd_map* e;
+       int flags;
+#ifdef HAVE_EPOLL
+       int n;
+       struct epoll_event ep_event;
+#endif
+#ifdef HAVE_SIGIO_RT
+       static char buf[65536];
+#endif
+       
+       if (fd==-1){
+               LOG(L_CRIT, "BUG: io_watch_add: fd is -1!\n");
+               goto error;
+       }
+       /* add it to the poll fd array */
+       if (h->fd_no>=h->max_fd_no){
+               LOG(L_CRIT, "ERROR: io_watch_add: maximum fd number exceeded:"
+                               " %d/%d\n", h->fd_no, h->max_fd_no);
+               goto error;
+       }
+       DBG("DBG: io_watch_add(%p, %d, %d, %p), fd_no=%d\n",
+                       h, fd, type, data, h->fd_no);
+       /*  hash sanity check */
+       e=get_fd_map(h, fd);
+       if (e && (e->type!=0 /*F_NONE*/)){
+               LOG(L_ERR, "ERROR: io_watch_add: trying to overwrite entry %d"
+                               " in the hash(%d, %d, %p) with (%d, %d, %p)\n",
+                               fd, e->fd, e->type, e->data, fd, type, data);
+               goto error;
+       }
+       
+       if ((e=hash_fd_map(h, fd, type, data))==0){
+               LOG(L_ERR, "ERROR: io_watch_add: failed to hash the fd %d\n", fd);
+               goto error;
+       }
+       switch(h->poll_method){ /* faster then pointer to functions */
+               case POLL_POLL:
+                       fd_array_setup;
+                       set_fd_flags(O_NONBLOCK);
+                       break;
+#ifdef HAVE_SELECT
+               case POLL_SELECT:
+                       fd_array_setup;
+                       FD_SET(fd, &h->master_set);
+                       if (h->max_fd_select<fd) h->max_fd_select=fd;
+                       break;
+#endif
+#ifdef HAVE_SIGIO_RT
+               case POLL_SIGIO_RT:
+                       fd_array_setup;
+                       /* set async & signal */
+                       if (fcntl(fd, F_SETOWN, my_pid())==-1){
+                               LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETOWN"
+                               " failed: %s [%d]\n", strerror(errno), errno);
+                               goto error;
+                       }
+                       if (fcntl(fd, F_SETSIG, h->signo)==-1){
+                               LOG(L_ERR, "ERROR: io_watch_add: fnctl: SETSIG"
+                                       " failed: %s [%d]\n", strerror(errno), errno);
+                               goto error;
+                       }
+                       /* set both non-blocking and async */
+                       set_fd_flags(O_ASYNC| O_NONBLOCK);
+#ifdef EXTRA_DEBUG
+                       DBG("io_watch_add: sigio_rt on f %d, signal %d to pid %d\n",
+                                       fd,  h->signo, pid);
+#endif
+                       /* empty socket receive buffer, if buffer is already full
+                        * (e.g. early media), no more space to put packets
+                        * => no more signals are ever generated -- andrei */
+                       while(recv(fd, buf, sizeof(buf), 0)>=0);
+                       break;
+#endif
+#ifdef HAVE_EPOLL
+               case POLL_EPOLL_LT:
+                       ep_event.events=EPOLLIN;
+                       ep_event.data.ptr=e;
+again1:
+                       n=epoll_ctl(h->epfd, EPOLL_CTL_ADD, fd, &ep_event);
+                       if (n==-1){
+                               if (errno==EAGAIN) goto again1;
+                               LOG(L_ERR, "ERROR: io_watch_add: epoll_ctl failed: %s [%d]\n",
+                                       strerror(errno), errno);
+                               goto error;
+                       }
+                       break;
+               case POLL_EPOLL_ET:
+                       set_fd_flags(O_NONBLOCK);
+                       ep_event.events=EPOLLIN|EPOLLET;
+                       ep_event.data.ptr=e;
+again2:
+                       n=epoll_ctl(h->epfd, EPOLL_CTL_ADD, fd, &ep_event);
+                       if (n==-1){
+                               if (errno==EAGAIN) goto again2;
+                               LOG(L_ERR, "ERROR: io_watch_add: epoll_ctl failed: %s [%d]\n",
+                                       strerror(errno), errno);
+                               goto error;
+                       }
+                       break;
+#endif
+               default:
+                       LOG(L_CRIT, "BUG: io_watch_add: no support for poll method "
+                                       " %s (%d)\n", poll_method_str[h->poll_method],
+                                       h->poll_method);
+                       goto error;
+       }
+       
+       h->fd_no++; /* "activate" changes, for epoll it
+                                  has only informative value */
+       return 0;
+error:
+       return -1;
+#undef fd_array_setup
+#undef set_fd_flags 
+}
+
+
+
+/* parameters: fd and index in the fd_array
+ * if index==-1, it fd_array will be searched for the corresponding fd
+ * entry (slower but unavoidable in some cases)
+ * index is not used (no fd_arry) for epoll, /dev/poll and kqueue
+ * returns 0 if ok, -1 on error */
+inline static int io_watch_del(io_wait_h* h, int fd, int idx)
+{
+       
+#define fix_fd_array \
+       do{\
+                       if (idx==-1){ \
+                               /* fix idx if -1 and needed */ \
+                               for (idx=0; (idx<h->fd_no) && \
+                                                       (h->fd_array[idx].fd!=fd); idx++); \
+                       } \
+                       if (idx<h->fd_no){ \
+                               memmove(&h->fd_array[idx], &h->fd_array[idx+1], \
+                                       (h->fd_no-(idx+1))*sizeof(*(h->fd_array))); \
+                       } \
+       }while(0)
+       
+       struct fd_map* e;
+#ifdef HAVE_EPOLL
+       int n;
+       struct epoll_event ep_event;
+#endif
+       
+       if ((fd<0) || (fd>=h->max_fd_no)){
+               LOG(L_CRIT, "BUG: io_watch_del: invalid fd %d, not in [0, %d) \n",
+                                               fd, h->fd_no);
+               goto error;
+       }
+       DBG("DBG: io_watch_del (%p, %d, %d) fd_no=%d called\n",
+                       h, fd, idx, h->fd_no);
+       e=get_fd_map(h, fd);
+       /* more sanity checks */
+       if (e==0){
+               LOG(L_CRIT, "BUG: io_watch_del: no corresponding hash entry for %d\n",
+                                       fd);
+               goto error;
+       }
+       if (e->type==0 /*F_NONE*/){
+               LOG(L_ERR, "ERROR: io_watch_del: trying to delete already erased"
+                               " entry %d in the hash(%d, %d, %p) )\n",
+                               fd, e->fd, e->type, e->data);
+               goto error;
+       }
+       
+       unhash_fd_map(e);
+       
+       switch(h->poll_method){
+               case POLL_POLL:
+                       fix_fd_array;
+                       break;
+#ifdef HAVE_SELECT
+               case POLL_SELECT:
+                       fix_fd_array;
+                       FD_CLR(fd, &h->master_set);
+                       if (h->max_fd_select && (h->max_fd_select==fd))
+                               /* we don't know the prev. max, so we just decrement it */
+                               h->max_fd_select--; 
+                       break;
+#endif
+#ifdef HAVE_SIGIO_RT
+               case POLL_SIGIO_RT:
+                       fix_fd_array;
+                       /* FIXME: re-set ASYNC? (not needed if the fd is/will be closed
+                        *        but might cause problems if the fd is "moved")
+                        *        update: probably not needed, the fd_map type!=0
+                        *        check should catch old queued signals or in-transit fd
+                        *        (so making another syscall to reset ASYNC is not 
+                        *         necessary)*/
+                       break;
+#endif
+#ifdef HAVE_EPOLL
+               case POLL_EPOLL_LT:
+               case POLL_EPOLL_ET:
+                       n=epoll_ctl(h->epfd, EPOLL_CTL_DEL, fd, &ep_event);
+                       if (n==-1){
+                               LOG(L_ERR, "ERROR: io_watch_del: removing fd from"
+                                       " epoll list failed: %s [%d]\n", strerror(errno), errno);
+                               goto error;
+                       }
+                       break;
+#endif
+               default:
+                       LOG(L_CRIT, "BUG: io_watch_del: no support for poll method "
+                                       " %s (%d)\n", poll_method_str[h->poll_method], 
+                                       h->poll_method);
+                       goto error;
+       }
+       h->fd_no--;
+       return 0;
+error:
+       return -1;
+#undef fix_fd_array
+}
+
+
+
+/* io_wait_loop_x style function 
+ * wait for io using poll()
+ * params: h      - io_wait handle
+ *         t      - timeout in s
+ *         repeat - if !=0 handle_io will be called until it returns <=0
+ * returns: 0 on success, -1 on err
+ */
+inline static int io_wait_loop_poll(io_wait_h* h, int t, int repeat)
+{
+       int n, r;
+       int ret;
+again:
+               ret=n=poll(h->fd_array, h->fd_no, t*1000);
+               if (n==-1){
+                       if (errno==EINTR) goto again; /* signal, ignore it */
+                       else{
+                               LOG(L_ERR, "ERROR:io_wait_loop_poll: poll: %s [%d]\n",
+                                               strerror(errno), errno);
+                               goto error;
+                       }
+               }
+               for (r=0; (r<h->fd_no) && n; r++){
+                       if (h->fd_array[r].revents & (POLLIN|POLLERR|POLLHUP)){
+                               n--;
+                               /* sanity checks */
+                               if ((h->fd_array[r].fd >= h->max_fd_no)||
+                                               (h->fd_array[r].fd < 0)){
+                                       LOG(L_CRIT, "BUG: io_wait_loop_poll: bad fd %d "
+                                                       "(no in the 0 - %d range)\n",
+                                                       h->fd_array[r].fd, h->max_fd_no);
+                                       /* try to continue anyway */
+                                       h->fd_array[r].events=0; /* clear the events */
+                                       continue;
+                               }
+                               while((handle_io(get_fd_map(h, h->fd_array[r].fd), r) > 0)
+                                                && repeat);
+                       }
+               }
+error:
+       return ret;
+}
+
+
+
+#ifdef HAVE_SELECT
+/* wait for io using select */
+inline static int io_wait_loop_select(io_wait_h* h, int t, int repeat)
+{
+       fd_set sel_set;
+       int n, ret;
+       struct timeval timeout;
+       int r;
+       
+again:
+               sel_set=h->master_set;
+               timeout.tv_sec=t;
+               timeout.tv_usec=0;
+               ret=n=select(h->max_fd_select+1, &sel_set, 0, 0, &timeout);
+               if (n<0){
+                       if (errno==EINTR) goto again; /* just a signal */
+                       LOG(L_ERR, "ERROR: io_wait_loop_select: select: %s [%d]\n",
+                                       strerror(errno), errno);
+                       n=0;
+                       /* continue */
+               }
+               /* use poll fd array */
+               for(r=0; (r<h->max_fd_no) && n; r++){
+                       if (FD_ISSET(h->fd_array[r].fd, &sel_set)){
+                               while((handle_io(get_fd_map(h, h->fd_array[r].fd), r)>0)
+                                               && repeat);
+                               n--;
+                       }
+               };
+       return ret;
+}
+#endif
+
+
+
+#ifdef HAVE_EPOLL
+inline static int io_wait_loop_epoll(io_wait_h* h, int t, int repeat)
+{
+       int n, r;
+       
+again:
+               n=epoll_wait(h->epfd, h->ep_array, h->fd_no, t*1000);
+               if (n==-1){
+                       if (errno==EINTR) goto again; /* signal, ignore it */
+                       else{
+                               LOG(L_ERR, "ERROR:io_wait_loop_epoll_et: epoll_wait:"
+                                               " %s [%d]\n", strerror(errno), errno);
+                               goto error;
+                       }
+               }
+               for (r=0; r<n; r++){
+                       while((handle_io((struct fd_map*)h->ep_array[r].data.ptr, -1)>0)
+                                       && repeat);
+               }
+error:
+       return n;
+}
+#endif
+
+
+
+#ifdef HAVE_SIGIO_RT
+/* sigio rt version has no repeat (it doesn't make sense)*/
+inline static int io_wait_loop_sigio_rt(io_wait_h* h, int t)
+{
+       int n;
+       int ret;
+       struct timespec ts;
+       siginfo_t siginfo;
+       int sigio_band;
+       int sigio_fd;
+       struct fd_map* fm;
+       
+       
+       ret=1; /* 1 event per call normally */
+       ts.tv_sec=t;
+       ts.tv_nsec=0;
+       if (!sigismember(&h->sset, h->signo) || !sigismember(&h->sset, SIGIO)){
+               LOG(L_CRIT, "BUG: io_wait_loop_sigio_rt: the signal mask"
+                               " is not properly set!\n");
+               goto error;
+       }
+
+again:
+       n=sigtimedwait(&h->sset, &siginfo, &ts);
+       if (n==-1){
+               if (errno==EINTR) goto again; /* some other signal, ignore it */
+               else if (errno==EAGAIN){ /* timeout */
+                       ret=0;
+                       goto end;
+               }else{
+                       LOG(L_ERR, "ERROR: io_wait_loop_sigio_rt: sigtimed_wait"
+                                       " %s [%d]\n", strerror(errno), errno);
+                       goto error;
+               }
+       }
+       if (n!=SIGIO){
+#ifdef SIGINFO64_WORKARROUND
+               /* on linux siginfo.si_band is defined as long in userspace
+                * and as int kernel => on 64 bits things will break!
+                * (si_band will include si_fd, and si_fd will contain
+                *  garbage)
+                *  see /usr/src/linux/include/asm-generic/siginfo.h and
+                *      /usr/include/bits/siginfo.h
+                * -- andrei */
+               if (sizeof(siginfo.si_band)>sizeof(int)){
+                       sigio_band=*((int*)&siginfo.si_band);
+                       sigio_fd=*(((int*)&siginfo.si_band)+1);
+               }else
+#endif
+               {
+                       sigio_band=siginfo.si_band;
+                       sigio_fd=siginfo.si_fd;
+               }
+               if (siginfo.si_code==SI_SIGIO){
+                       /* old style, we don't know the event (linux 2.2.?) */
+                       LOG(L_WARN, "WARNING: io_wait_loop_sigio_rt: old style sigio"
+                                       " interface\n");
+                       fm=get_fd_map(h, sigio_fd);
+                       /* we can have queued signals generated by fds not watched
+                        * any more, or by fds in transition, to a child => ignore them*/
+                       if (fm->type)
+                               handle_io(fm, -1);
+               }else{
+#ifdef EXTRA_DEBUG
+                       DBG("io_wait_loop_sigio_rt: siginfo: signal=%d (%d),"
+                                       " si_code=%d, si_band=0x%x,"
+                                       " si_fd=%d\n",
+                                       siginfo.si_signo, n, siginfo.si_code, 
+                                       (unsigned)sigio_band,
+                                       sigio_fd);
+#endif
+                       if (sigio_band&(POLL_IN|POLL_ERR)){
+                               fm=get_fd_map(h, sigio_fd);
+                               /* we can have queued signals generated by fds not watched
+                                * any more, or by fds in transition, to a child 
+                                * => ignore them */
+                               if (fm->type)
+                                       handle_io(fm, -1);
+                       }
+               }
+       }else{
+               /* signal queue overflow 
+                * TODO: increase signal queue size: 2.4x /proc/.., 2.6x -rlimits */
+               LOG(L_WARN, "WARNING: io_wait_loop_sigio_rt: signal queue overflowed"
+                                       "- falling back to poll\n");
+               /* clear real-time signal queue
+                * both SIG_IGN and SIG_DFL are needed , it doesn't work
+                * only with SIG_DFL  */
+               if (signal(h->signo, SIG_IGN)==SIG_ERR){
+                       LOG(L_CRIT, "BUG: do_poll: couldn't reset signal to IGN\n");
+               }
+               
+               if (signal(h->signo, SIG_DFL)==SIG_ERR){
+                       LOG(L_CRIT, "BUG: do_poll: couldn't reset signal to DFL\n");
+               }
+               /* falling back to normal poll */
+               ret=io_wait_loop_poll(h, -1, 1);
+       }
+end:
+       return ret;
+error:
+       return -1;
+}
+#endif
+
+
+
+/* init */
+
+
+/* initializes the static vars/arrays
+ * params:      h - pointer to the io_wait_h that will be initialized
+ *         max_fd - maximum allowed fd number
+ *         poll_m - poll method (0 for automatic best fit)
+ */
+int init_io_wait(io_wait_h* h, int max_fd, enum poll_types poll_method);
+
+/* destroys everything init_io_wait allocated */
+void destroy_io_wait(io_wait_h* h);
+
+
+#endif
diff --git a/main.c b/main.c
index 8e46ac9..14ac58c 100644 (file)
--- a/main.c
+++ b/main.c
@@ -56,6 +56,9 @@
  *               crashed childvwhich still holds the lock  (andrei)
  *  2004-12-02  removed -p, extended -l to support [proto:]address[:port],
  *               added parse_phostport, parse_proto (andrei)
+ *  2005-06-16  always record the pid in pt[process_no].pid twice: once in the
+ *               parent & once in the child to avoid a short window when one
+ *               of them might use it "unset" (andrei)
  */
 
 
 #include "script_cb.h"
 #include "ut.h"
 #ifdef USE_TCP
+#include "poll_types.h"
 #include "tcp_init.h"
 #ifdef USE_TLS
 #include "tls/tls_init.h"
@@ -155,7 +159,8 @@ Options:\n\
     -E           Log to stderr\n"
 #ifdef USE_TCP
 "    -T           Disable tcp\n\
-    -N           Number of tcp child processes (default: equal to `-n`)\n"
+    -N           Number of tcp child processes (default: equal to `-n`)\n\
+    -W           poll method\n"
 #endif
 "    -V           Version number\n\
     -h           This help message\n\
@@ -190,6 +195,9 @@ void print_ct_constants()
                        " MAX_URI_SIZE %d, BUF_SIZE %d\n",
                MAX_RECV_BUFFER_SIZE, MAX_LISTEN, MAX_URI_SIZE, 
                BUF_SIZE );
+#ifdef USE_TCP
+       printf("poll method support: %s.\n", poll_support);
+#endif
 }
 
 /* debugging function */
@@ -862,6 +870,9 @@ int main_loop()
                                
                                if (pid==0){
                                        /* child */
+                                       /* record pid twice to avoid the child using it, before
+                                        * parent gets a chance to set it*/
+                                       pt[process_no].pid=getpid();
                                        /* timer!*/
                                        /* process_bit = 0; */
                                        if (init_child(PROC_TIMER) < 0) {
@@ -1021,6 +1032,9 @@ int main_loop()
                                                unix_tcp_sock=sockfd[1];
                                        }
 #endif
+                                       /* record pid twice to avoid the child using it, before
+                                        * parent gets a chance to set it*/
+                                       pt[process_no].pid=getpid();
                                        bind_address=si; /* shortcut */
                                        if (init_child(i + 1) < 0) {
                                                LOG(L_ERR, "init_child failed\n");
@@ -1084,6 +1098,9 @@ int main_loop()
                                unix_tcp_sock=sockfd[1];
                        }
 #endif
+                       /* record pid twice to avoid the child using it, before
+                        * parent gets a chance to set it*/
+                       pt[process_no].pid=getpid();
                        if (init_child(PROC_TIMER) < 0) {
                                LOG(L_ERR, "timer: init_child failed\n");
                                goto error;
@@ -1120,6 +1137,9 @@ int main_loop()
                        }else if (pid==0){
                                /* child */
                                /* is_main=0; */
+                               /* record pid twice to avoid the child using it, before
+                                * parent gets a chance to set it*/
+                               pt[process_no].pid=getpid();
                                if (init_child(PROC_TCP_MAIN) < 0) {
                                        LOG(L_ERR, "tcp_main: error in init_child\n");
                                        goto error;
@@ -1215,7 +1235,7 @@ int main(int argc, char** argv)
 #ifdef STATS
        "s:"
 #endif
-       "f:cm:b:l:n:N:rRvdDETVhw:t:u:g:P:G:i:x:";
+       "f:cm:b:l:n:N:rRvdDETVhw:t:u:g:P:G:i:x:W:";
        
        while((c=getopt(argc,argv,options))!=-1){
                switch(c){
@@ -1308,6 +1328,18 @@ int main(int argc, char** argv)
                                        fprintf(stderr,"WARNING: tcp support not compiled in\n");
 #endif
                                        break;
+                       case 'W':
+#ifdef USE_TCP
+                                       tcp_poll_method=get_poll_type(optarg);
+                                       if (tcp_poll_method==POLL_NONE){
+                                               fprintf(stderr, "bad poll method name: -W %s\ntry "
+                                                                               "one of %s.\n", optarg, poll_support);
+                                               goto error;
+                                       }
+#else
+                                       fprintf(stderr,"WARNING: tcp support not compiled in\n");
+#endif
+                                       break;
                        case 'V':
                                        printf("version: %s\n", version);
                                        printf("flags: %s\n", flags );
index 6bf74a6..b71b7e0 100644 (file)
@@ -309,13 +309,13 @@ inline static int char_msg_val( struct sip_msg *msg, char *cv )
 inline static char* get_body(struct sip_msg *msg)
 {
        int offset;
-       int len;
+       unsigned int len;
 
        if ( parse_headers(msg, HDR_EOH_F, 0)==-1 )
                return 0;
 
        if (msg->unparsed){
-               len=(int)(msg->unparsed-msg->buf);
+               len=(unsigned int)(msg->unparsed-msg->buf);
        }else return 0;
        if ((len+2<=msg->len) && (strncmp(CRLF,msg->unparsed,CRLF_LEN)==0) )
                offset = CRLF_LEN;
index 02c0bef..6ec15a7 100644 (file)
--- a/pass_fd.c
+++ b/pass_fd.c
@@ -31,6 +31,8 @@
   *  2003-02-20  added solaris support (! HAVE_MSGHDR_MSG_CONTROL) (andrei)
   *  2003-11-03  added send_all, recv_all  and updated send/get_fd
   *               to handle signals  (andrei)
+  *  2005-06-13  added flags to recv_all & receive_fd, to allow full blocking
+  *              or semi-nonblocking mode (andrei)
   */
 
 #ifdef USE_TCP
 
 
 /* receive all the data or returns error (handles EINTR etc.)
+ * params: socket
+ *         data     - buffer for the results
+ *         data_len - 
+ *         flags    - recv flags for the first recv (see recv(2)), only
+ *                    0, MSG_WAITALL and MSG_DONTWAIT make sense
+ * if flags is set to MSG_DONWAIT (or to 0 and the socket fd is non-blocking),
+ * and if no data is queued on the fd, recv_all will not wait (it will 
+ * return error and set errno to EAGAIN/EWOULDBLOCK). However if even 1 byte
+ *  is queued, the call will block until the whole data_len was read or an
+ *  error or eof occured ("semi-nonblocking" behaviour,  some tcp code
+ *   counts on it).
+ * if flags is set to MSG_WAITALL it will block even if no byte is available.
+ *  
  * returns: bytes read or error (<0)
  * can return < data_len if EOF */
-int recv_all(int socket, void* data, int data_len)
+int recv_all(int socket, void* data, int data_len, int flags)
 {
        int b_read;
        int n;
        
        b_read=0;
-       do{
+again:
+       n=recv(socket, (char*)data, data_len, flags);
+       if (n<0){
+               /* error */
+               if (errno==EINTR) goto again; /* signal, try again */
+               /* on EAGAIN just return (let the caller know) */
+               if ((errno==EAGAIN)||(errno==EWOULDBLOCK)) return n;
+                       LOG(L_CRIT, "ERROR: recv_all: 1st recv on %d failed: %s\n",
+                                       socket, strerror(errno));
+                       return n;
+       }
+       b_read+=n;
+       while( (b_read!=data_len) && (n)){
                n=recv(socket, (char*)data+b_read, data_len-b_read, MSG_WAITALL);
                if (n<0){
                        /* error */
                        if (errno==EINTR) continue; /* signal, try again */
-                       LOG(L_CRIT, "ERROR: recv_all: recv on %d failed: %s\n",
+                       LOG(L_CRIT, "ERROR: recv_all: 2nd recv on %d failed: %s\n",
                                        socket, strerror(errno));
                        return n;
                }
                b_read+=n;
-       }while( (b_read!=data_len) && (n));
+       }
        return b_read;
 }
 
 
+
 /* sends all data (takes care of signals) (assumes blocking fd)
  * returns number of bytes sent or < 0 for an error */
 int send_all(int socket, void* data, int data_len)
@@ -136,7 +164,15 @@ again:
 
 
 
-int receive_fd(int unix_socket, void* data, int data_len, int* fd)
+/* receives a fd and data_len data
+ * params: unix_socket 
+ *         data
+ *         data_len
+ *         fd         - will be set to the passed fd value or -1 if no fd
+ *                      was passed
+ *         flags      - 0, MSG_DONTWAIT, MSG_WAITALL; same as recv_all flags
+ * returns: bytes read on success, -1 on error (and sets errno) */
+int receive_fd(int unix_socket, void* data, int data_len, int* fd, int flags)
 {
        struct msghdr msg;
        struct iovec iov[1];
@@ -166,9 +202,10 @@ int receive_fd(int unix_socket, void* data, int data_len, int* fd)
        msg.msg_iovlen=1;
        
 again:
-       ret=recvmsg(unix_socket, &msg, MSG_WAITALL);
+       ret=recvmsg(unix_socket, &msg, flags);
        if (ret<0){
                if (errno==EINTR) goto again;
+               if ((errno==EAGAIN)||(errno==EWOULDBLOCK)) goto error;
                LOG(L_CRIT, "ERROR: receive_fd: recvmsg on %d failed: %s\n",
                                unix_socket, strerror(errno));
                goto error;
@@ -181,7 +218,8 @@ again:
        if (ret<data_len){
                LOG(L_WARN, "WARNING: receive_fd: too few bytes read (%d from %d)"
                                    "trying to fix...\n", ret, data_len);
-               n=recv_all(unix_socket, (char*)data+ret, data_len-ret);
+               /* blocking recv_all */
+               n=recv_all(unix_socket, (char*)data+ret, data_len-ret, MSG_WAITALL);
                if (n>=0) ret+=n;
                else{
                        ret=n;
@@ -204,8 +242,9 @@ again:
                }
                *fd=*((int*) CMSG_DATA(cmsg));
        }else{
+               /*
                LOG(L_ERR, "ERROR: receive_fd: no descriptor passed, cmsg=%p,"
-                               "len=%d\n", cmsg, (unsigned)cmsg->cmsg_len);
+                               "len=%d\n", cmsg, (unsigned)cmsg->cmsg_len); */
                *fd=-1;
                /* it's not really an error */
        }
@@ -213,8 +252,8 @@ again:
        if (msg.msg_accrightslen==sizeof(int)){
                *fd=new_fd;
        }else{
-               LOG(L_ERR, "ERROR: receive_fd: no descriptor passed,"
-                               " accrightslen=%d\n", msg.msg_accrightslen);
+               /*LOG(L_ERR, "ERROR: receive_fd: no descriptor passed,"
+                               " accrightslen=%d\n", msg.msg_accrightslen); */
                *fd=-1;
        }
 #endif
index 1e11f86..c0cb6f4 100644 (file)
--- a/pass_fd.h
+++ b/pass_fd.h
@@ -30,9 +30,9 @@
 
 
 int send_fd(int unix_socket, void* data, int data_len, int fd);
-int receive_fd(int unix_socket, void* data, int data_len, int* fd);
+int receive_fd(int unix_socket, void* data, int data_len, int* fd, int flags);
 
-int recv_all(int socket, void* data, int data_len);
+int recv_all(int socket, void* data, int data_len, int flags);
 int send_all(int socket, void* data, int data_len);
 
 
diff --git a/poll_types.h b/poll_types.h
new file mode 100644 (file)
index 0000000..90c05ba
--- /dev/null
@@ -0,0 +1,61 @@
+/* 
+ * $Id$
+ * 
+ * Copyright (C) 2005 iptelorg GmbH
+ *
+ * This file is part of ser, a free SIP server.
+ *
+ * ser is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version
+ *
+ * For a license to use the ser software under conditions
+ * other than those described here, or to purchase support for this
+ * software, please contact iptel.org by e-mail at the following addresses:
+ *    info@iptel.org
+ *
+ * ser is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/* 
+ * io wait poll methods (enum, strings, related function)
+ * see io_wait.h for more details
+ * 
+ *  
+ */
+/* 
+ * History:
+ * --------
+ *  2005-06-15  created by andrei
+ */
+
+
+#ifndef _poll_types_h
+#define _poll_types_h
+
+enum poll_types { POLL_NONE, POLL_POLL, POLL_EPOLL_LT, POLL_EPOLL_ET,
+                                       POLL_SIGIO_RT, POLL_SELECT, POLL_KQUEUE, POLL_DEVPOLL,
+                                       POLL_END};
+
+/* all the function and vars are defined in io_wait.c */
+
+extern char* poll_method_str[POLL_END];
+extern char* poll_support; 
+
+
+enum poll_types choose_poll_method();
+
+/* returns 0 on success, and an error message on error */
+char* check_poll_method(enum poll_types poll_method);
+
+char* poll_method_name(enum poll_types poll_method);
+enum poll_types get_poll_type(char* s);
+
+#endif
index cf61003..eddf541 100644 (file)
@@ -50,6 +50,7 @@
                                                                           timeout */
 #define DEFAULT_TCP_CONNECT_TIMEOUT 10 /* if a connect doesn't complete in this
                                                                                  time, timeout */
+#define DEFAULT_TCP_MAX_FD_NO 2048 /* maximum fd number */
 #define TCP_CHILD_TIMEOUT 5 /* after 5 seconds, the child "returns" 
                                                         the connection to the tcp master process */
 #define TCP_MAIN_SELECT_TIMEOUT 5 /* how often "tcp main" checks for timeout*/
index d228f10..546d9cf 100644 (file)
@@ -52,6 +52,8 @@
  *              new socket if tcpconn_new return 0 (e.g. out of mem) (andrei)
  *  2003-11-28  tcp_blocking_write & tcp_blocking_connect added (andrei)
  *  2004-11-08  dropped find_tcp_si and replaced with find_si (andrei)
+ *  2005-06-07  new tcp optimized code, supports epoll (LT), sigio + real time
+ *               signals, poll & select (andrei)
  */
 
 
@@ -62,7 +64,6 @@
 #error "shared memory support needed (add -DSHM_MEM to Makefile.defs)"
 #endif
 
-
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/select.h>
 #include <netinet/tcp.h>
 #include <sys/uio.h>  /* writev*/
 #include <netdb.h>
+#include <stdlib.h> /*exit() */
 
 #include <unistd.h>
-#include <fcntl.h>
 
 #include <errno.h>
 #include <string.h>
 
+#ifdef HAVE_SELECT
+#include <sys/select.h>
+#endif
+#include <sys/poll.h>
 
 
 #include "ip_addr.h"
 #include "tsend.h"
 #ifdef USE_TLS
 #include "tls/tls_server.h"
-#endif
-
-
-
+#endif 
 
 #define local_malloc pkg_malloc
 #define local_free   pkg_free
 
+#define HANDLE_IO_INLINE
+#include "io_wait.h"
+#include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
+
 #define MAX_TCP_CHILDREN 100
 
+
+
+enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
+                               F_TCPCONN, F_TCPCHILD, F_PROC };
+
 struct tcp_child{
        pid_t pid;
        int proc_no; /* ser proc_no, for debugging */
@@ -116,9 +127,12 @@ struct tcp_child{
 };
 
 
+
 int tcp_accept_aliases=0; /* by default don't accept aliases */
 int tcp_connect_timeout=DEFAULT_TCP_CONNECT_TIMEOUT;
 int tcp_send_timeout=DEFAULT_TCP_SEND_TIMEOUT;
+enum poll_types tcp_poll_method=0; /* by default choose the best method */
+int tcp_max_fd_no=DEFAULT_TCP_MAX_FD_NO;
 
 /* connection hash table (after ip&port) , includes also aliases */
 struct tcp_conn_alias** tcpconn_aliases_hash=0;
@@ -132,7 +146,10 @@ static int* connection_id=0; /*  unique for each connection, used for
                                                                for a reply */
 int unix_tcp_sock;
 
-int tcp_proto_no=-1; /* tcp protocol number as returned by getprotobyname */
+static int tcp_proto_no=-1; /* tcp protocol number as returned by
+                                                          getprotobyname */
+
+static io_wait_h io_h;
 
 
 
@@ -177,20 +194,38 @@ error:
 
 
 
+/* blocking connect on a non-blocking fd; it will timeout after
+ * tcp_connect_timeout 
+ * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
+ * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
+ */
 static int tcp_blocking_connect(int fd, const struct sockaddr *servaddr,
                                                                socklen_t addrlen)
 {
        int n;
+#if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
        fd_set sel_set;
+       fd_set orig_set;
        struct timeval timeout;
+#else
+       struct pollfd pf;
+#endif
+       int elapsed;
+       int to;
        int ticks;
        int err;
        unsigned int err_len;
        
+       to=tcp_connect_timeout;
+       ticks=get_ticks();
 again:
        n=connect(fd, servaddr, addrlen);
        if (n==-1){
-               if (errno==EINTR) goto again;
+               if (errno==EINTR){
+                       elapsed=(get_ticks()-ticks)*TIMER_TICK;
+                       if (elapsed<to)         goto again;
+                       else goto error_timeout;
+               }
                if (errno!=EINPROGRESS && errno!=EALREADY){
                        LOG(L_ERR, "ERROR: tcp_blocking_connect: (%d) %s\n",
                                        errno, strerror(errno));
@@ -198,28 +233,43 @@ again:
                }
        }else goto end;
        
+       /* poll/select loop */
+#if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
+               FD_ZERO(&orig_set);
+               FD_SET(fd, &orig_set);
+#else
+               pf.fd=fd;
+               pf.events=POLLOUT;
+#endif
        while(1){
-               FD_ZERO(&sel_set);
-               FD_SET(fd, &sel_set);
-               timeout.tv_sec=tcp_connect_timeout;
+               elapsed=(get_ticks()-ticks)*TIMER_TICK;
+               if (elapsed<to)
+                       to-=elapsed;
+               else 
+                       goto error_timeout;
+#if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
+               sel_set=orig_set;
+               timeout.tv_sec=to;
                timeout.tv_usec=0;
-               ticks=get_ticks();
                n=select(fd+1, 0, &sel_set, 0, &timeout);
+#else
+               n=poll(&pf, 1, to*1000);
+#endif
                if (n<0){
                        if (errno==EINTR) continue;
-                       LOG(L_ERR, "ERROR: tcp_blocking_connect: select failed: (%d) %s\n",
-                                       errno, strerror(errno));
+                       LOG(L_ERR, "ERROR: tcp_blocking_connect: poll/select failed:"
+                                       " (%d) %s\n", errno, strerror(errno));
                        goto error;
-               }else if (n==0){
-                       /* timeout */
-                       if (get_ticks()-ticks>=tcp_connect_timeout){
-                               LOG(L_ERR, "ERROR: tcp_blocking_connect: timeout (%d)\n",
-                                               tcp_connect_timeout);
-                               goto error;
-                       }
-                       continue;
-               }
+               }else if (n==0) /* timeout */ continue;
+#if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
                if (FD_ISSET(fd, &sel_set)){
+#else
+               if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
+                       LOG(L_ERR, "ERROR: tcp_blocking_connect: bad poll flags %x\n",
+                                       pf.revents);
+                       goto error;
+               }else{
+#endif
                        err_len=sizeof(err);
                        getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
                        if (err==0) goto end;
@@ -230,6 +280,10 @@ again:
                        }
                }
        }
+error_timeout:
+       /* timeout */
+       LOG(L_ERR, "ERROR: tcp_blocking_connect: timeout %d s elapsed from %d s\n",
+                       elapsed, tcp_connect_timeout);
 error:
        return -1;
 end:
@@ -683,20 +737,13 @@ no_id:
                        /* send the new tcpconn to "tcp main" */
                        response[0]=(long)c;
                        response[1]=CONN_NEW;
-                       n=send_all(unix_tcp_sock, response, sizeof(response));
-                       if (n<=0){
-                               LOG(L_ERR, "BUG: tcp_send: failed write: %s (%d)\n",
-                                               strerror(errno), errno);
-                               n=-1;
-                               goto end;
-                       }       
-                       n=send_fd(unix_tcp_sock, &c, sizeof(c), c->s);
+                       n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
                        if (n<=0){
                                LOG(L_ERR, "BUG: tcp_send: failed send_fd: %s (%d)\n",
                                                strerror(errno), errno);
                                n=-1;
                                goto end;
-                       }
+                       }       
                        goto send_it;
                }
 get_fd:
@@ -715,7 +762,7 @@ get_fd:
                        }
                        DBG("tcp_send, c= %p, n=%d\n", c, n);
                        tmp=c;
-                       n=receive_fd(unix_tcp_sock, &c, sizeof(c), &fd);
+                       n=receive_fd(unix_tcp_sock, &c, sizeof(c), &fd, MSG_WAITALL);
                        if (n<=0){
                                LOG(L_ERR, "BUG: tcp_send: failed to get fd(receive_fd):"
                                                        " %s (%d)\n", strerror(errno), errno);
@@ -776,43 +823,6 @@ release_c:
 
 
 
-/* very inefficient for now - FIXME*/
-void tcpconn_timeout(fd_set* set)
-{
-       struct tcp_connection *c, *next;
-       int ticks;
-       unsigned h;
-       int fd;
-       
-       
-       ticks=get_ticks();
-       TCPCONN_LOCK; /* fixme: we can lock only on delete IMO */
-       for(h=0; h<TCP_ID_HASH_SIZE; h++){
-               c=tcpconn_id_hash[h];
-               while(c){
-                       next=c->id_next;
-                       if ((c->refcnt==0) && (ticks>c->timeout)) {
-                               DBG("tcpconn_timeout: timeout for hash=%d - %p (%d > %d)\n",
-                                               h, c, ticks, c->timeout);
-                               fd=c->s;
-#ifdef USE_TLS
-                               if (c->type==PROTO_TLS)
-                                       tls_close(c, fd);
-#endif
-                               _tcpconn_rm(c);
-                               if (fd>0) {
-                                       FD_CLR(fd, set);
-                                       close(fd);
-                               }
-                       }
-                       c=next;
-               }
-       }
-       TCPCONN_UNLOCK;
-}
-
-
-
 int tcp_init(struct socket_info* sock_info)
 {
        union sockaddr_union* addr;
@@ -880,10 +890,11 @@ int tcp_init(struct socket_info* sock_info)
                /* continue since this is not critical */
        }
        if (bind(sock_info->socket, &addr->s, sockaddru_len(*addr))==-1){
-               LOG(L_ERR, "ERROR: tcp_init: bind(%x, %p, %d) on %s: %s\n",
-                               sock_info->socket, &addr->s, 
+               LOG(L_ERR, "ERROR: tcp_init: bind(%x, %p, %d) on %s:%d : %s\n",
+                               sock_info->socket,  &addr->s, 
                                (unsigned)sockaddru_len(*addr),
                                sock_info->address_str.s,
+                               sock_info->port_no,
                                strerror(errno));
                goto error;
        }
@@ -946,60 +957,67 @@ static int send2child(struct tcp_connection* tcpconn)
 }
 
 
-/* handle a new connection, called internally by tcp_main_loop */
-static inline void handle_new_connect(struct socket_info* si,
-                                                                               fd_set* sel_set, int* n)
+/* handles a new connection, called internally by tcp_main_loop/handle_io.
+ * params: si - pointer to one of the tcp socket_info structures on which
+ *              an io event was detected (connection attempt)
+ * returns:  handle_* return convention: -1 on error, 0 on EAGAIN (no more
+ *           io events queued), >0 on success. success/error refer only to
+ *           the accept.
+ */
+static inline int handle_new_connect(struct socket_info* si)
 {
        union sockaddr_union su;
        struct tcp_connection* tcpconn;
        socklen_t su_len;
        int new_sock;
        
-       if ((FD_ISSET(si->socket, sel_set))){
-               /* got a connection on r */
-               su_len=sizeof(su);
-               new_sock=accept(si->socket, &(su.s), &su_len);
-               (*n)--;
-               if (new_sock==-1){
-                       LOG(L_ERR,  "WARNING: tcp_main_loop: error while accepting"
-                                       " connection(%d): %s\n", errno, strerror(errno));
-                       return;
-               }
-               if (init_sock_opt(new_sock)<0){
-                       LOG(L_ERR, "ERROR: tcp_main_loop: init_sock_opt failed\n");
-                       close(new_sock);
-                       return;
+       /* got a connection on r */
+       su_len=sizeof(su);
+       new_sock=accept(si->socket, &(su.s), &su_len);
+       if (new_sock==-1){
+               if ((errno==EAGAIN)||(errno==EWOULDBLOCK))
+                       return 0;
+               LOG(L_ERR,  "WARNING: handle_new_connect: error while accepting"
+                               " connection(%d): %s\n", errno, strerror(errno));
+               return -1;
+       }
+       if (init_sock_opt(new_sock)<0){
+               LOG(L_ERR, "ERROR: handle_new_connect: init_sock_opt failed\n");
+               close(new_sock);
+               return 1; /* success, because the accept was succesfull */
+       }
+       
+       /* add socket to list */
+       tcpconn=tcpconn_new(new_sock, &su, si, si->proto, S_CONN_ACCEPT);
+       if (tcpconn){
+               tcpconn->refcnt++; /* safe, not yet available to the
+                                                         outside world */
+               tcpconn_add(tcpconn);
+               DBG("handle_new_connect: new connection: %p %d\n",
+                       tcpconn, tcpconn->s);
+               /* pass it to a child */
+               if(send2child(tcpconn)<0){
+                       LOG(L_ERR,"ERROR: handle_new_connect: no children "
+                                       "available\n");
+                       TCPCONN_LOCK;
+                       tcpconn->refcnt--;
+                       if (tcpconn->refcnt==0){
+                               close(tcpconn->s);
+                               _tcpconn_rm(tcpconn);
+                       }else tcpconn->timeout=0; /* force expire */
+                       TCPCONN_UNLOCK;
                }
+       }else{ /*tcpconn==0 */
+               LOG(L_ERR, "ERROR: handle_new_connect: tcpconn_new failed, "
+                               "closing socket\n");
+               close(new_sock);
                
-               /* add socket to list */
-               tcpconn=tcpconn_new(new_sock, &su, si, si->proto, S_CONN_ACCEPT);
-               if (tcpconn){
-                       tcpconn->refcnt++; /* safe, not yet available to the
-                                                                 outside world */
-                       tcpconn_add(tcpconn);
-                       DBG("tcp_main_loop: new connection: %p %d\n",
-                               tcpconn, tcpconn->s);
-                       /* pass it to a child */
-                       if(send2child(tcpconn)<0){
-                               LOG(L_ERR,"ERROR: tcp_main_loop: no children "
-                                               "available\n");
-                               TCPCONN_LOCK;
-                               tcpconn->refcnt--;
-                               if (tcpconn->refcnt==0){
-                                       close(tcpconn->s);
-                                       _tcpconn_rm(tcpconn);
-                               }else tcpconn->timeout=0; /* force expire */
-                               TCPCONN_UNLOCK;
-                       }
-               }else{ /*tcpconn==0 */
-                       LOG(L_ERR, "ERROR: tcp_main_loop: tcpconn_new failed, "
-                                       "closing socket\n");
-                       close(new_sock);
-               }
        }
+       return 1; /* accept() was succesfull */
 }
 
 
+
 /* used internally by tcp_main_loop() */
 static void tcpconn_destroy(struct tcp_connection* tcpconn)
 {
@@ -1008,7 +1026,7 @@ static void tcpconn_destroy(struct tcp_connection* tcpconn)
        TCPCONN_LOCK; /*avoid races w/ tcp_send*/
        tcpconn->refcnt--;
        if (tcpconn->refcnt==0){ 
-               DBG("tcp_main_loop: destroying connection\n");
+               DBG("tcpconn_destroy: destroying connection\n");
                fd=tcpconn->s;
 #ifdef USE_TLS
                /*FIXME: lock ->writelock ? */
@@ -1021,37 +1039,393 @@ static void tcpconn_destroy(struct tcp_connection* tcpconn)
                /* force timeout */
                tcpconn->timeout=0;
                tcpconn->state=S_CONN_BAD;
-               DBG("tcp_main_loop: delaying ...\n");
+               DBG("tcpconn_destroy: delaying ...\n");
                
        }
        TCPCONN_UNLOCK;
 }
 
 
-void tcp_main_loop()
+
+/* handles an io event on one of the watched tcp connections
+ * 
+ * params: tcpconn - pointer to the tcp_connection for which we have an io ev.
+ *         fd_i    - index in the fd_array table (needed for delete)
+ * returns:  handle_* return convention, but on success it always returns 0
+ *           (because it's one-shot, after a succesfull execution the fd is
+ *            removed from tcp_main's watch fd list and passed to a child =>
+ *            tcp_main is not interested in further io events that might be
+ *            queued for this fd)
+ */
+inline static int handle_tcpconn_ev(struct tcp_connection* tcpconn, int fd_i)
+{
+       int fd;
+       
+       /* FIXME: is refcnt!=0 really necessary? */
+       if ((tcpconn->refcnt!=0)){
+               /* FIXME: might be valid for sigio_rt iff fd flags are not cleared
+                *        (there is a short window in which it could generate a sig
+                *         that would be catched by tcp_main) */
+               LOG(L_CRIT, "BUG: handle_tcpconn_ev: io event on referenced"
+                                       " tcpconn (%p), refcnt=%d, fd=%d\n",
+                                       tcpconn, tcpconn->refcnt, tcpconn->s);
+               return -1;
+       }
+       /* pass it to child, so remove it from the io watch list */
+       DBG("handle_tcpconn_ev: data available on %p %d\n", tcpconn, tcpconn->s);
+       if (io_watch_del(&io_h, tcpconn->s, fd_i)==-1) goto error;
+       tcpconn_ref(tcpconn); /* refcnt ++ */
+       if (send2child(tcpconn)<0){
+               LOG(L_ERR,"ERROR: handle_tcpconn_ev: no children available\n");
+               TCPCONN_LOCK;
+               tcpconn->refcnt--;
+               if (tcpconn->refcnt==0){
+                       fd=tcpconn->s;
+                       _tcpconn_rm(tcpconn);
+                       close(fd);
+               }else tcpconn->timeout=0; /* force expire*/
+               TCPCONN_UNLOCK;
+       }
+       return 0; /* we are not interested in possibly queued io events, 
+                                the fd was either passed to a child, or closed */
+error:
+       return -1;
+}
+
+
+
+/* handles io from a tcp child process
+ * params: tcp_c - pointer in the tcp_children array, to the entry for
+ *                 which an io event was detected 
+ *         fd_i  - fd index in the fd_array (usefull for optimizing
+ *                 io_watch_deletes)
+ * returns:  handle_* return convention: -1 on error, 0 on EAGAIN (no more
+ *           io events queued), >0 on success. success/error refer only to
+ *           the reads from the fd.
+ */
+inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
 {
-       int r;
-       int n;
-       fd_set master_set;
-       fd_set sel_set;
-       int maxfd;
        struct tcp_connection* tcpconn;
-       unsigned h;
        long response[2];
        int cmd;
        int bytes;
-       struct timeval timeout;
+       
+       if (tcp_c->unix_sock<=0){
+               /* (we can't have a fd==0, 0 is never closed )*/
+               LOG(L_CRIT, "BUG: handle_tcp_child: fd %d for %d "
+                               "(pid %d, ser no %d)\n", tcp_c->unix_sock,
+                               (int)(tcp_c-&tcp_children[0]), tcp_c->pid, tcp_c->proc_no);
+               goto error;
+       }
+       /* read until sizeof(response)
+        * (this is a SOCK_STREAM so read is not atomic) */
+       bytes=recv_all(tcp_c->unix_sock, response, sizeof(response), MSG_DONTWAIT);
+       if (bytes<(int)sizeof(response)){
+               if (bytes==0){
+                       /* EOF -> bad, child has died */
+                       DBG("DBG: handle_tcp_child: dead tcp child %d (pid %d, no %d)"
+                                       " (shutting down?)\n", (int)(tcp_c-&tcp_children[0]), 
+                                       tcp_c->pid, tcp_c->proc_no );
+                                               /* don't listen on it any more */
+                       io_watch_del(&io_h, tcp_c->unix_sock, fd_i); 
+                       goto error; /* eof. so no more io here, it's ok to return error */
+               }else if (bytes<0){
+                       /* EAGAIN is ok if we try to empty the buffer
+                        * e.g.: SIGIO_RT overflow mode or EPOLL ET */
+                       if ((errno!=EAGAIN) && (errno!=EWOULDBLOCK)){
+                               LOG(L_CRIT, "ERROR: handle_tcp_child: read from tcp child %d "
+                                               " (pid %d, no %d) %s [%d]\n",
+                                               tcp_c-&tcp_children[0], tcp_c->pid, tcp_c->proc_no,
+                                               strerror(errno), errno );
+                       }else{
+                               bytes=0;
+                       }
+                       /* try to ignore ? */
+                       goto end;
+               }else{
+                       /* should never happen */
+                       LOG(L_CRIT, "BUG: handle_tcp_child: too few bytes received (%d)\n",
+                                       bytes );
+                       bytes=0; /* something was read so there is no error; otoh if
+                                         receive_fd returned less then requested => the receive
+                                         buffer is empty => no more io queued on this fd */
+                       goto end;
+               }
+       }
+       
+       DBG("handle_tcp_child: reader response= %lx, %ld from %d \n",
+                                       response[0], response[1], (int)(tcp_c-&tcp_children[0]));
+       cmd=response[1];
+       tcpconn=(struct tcp_connection*)response[0];
+       if (tcpconn==0){
+               /* should never happen */
+               LOG(L_CRIT, "BUG: handle_tcp_child: null tcpconn pointer received"
+                                " from tcp child %d (pid %d): %lx, %lx\n",
+                                       (int)(tcp_c-&tcp_children[0]), tcp_c->pid,
+                                       response[0], response[1]) ;
+               goto end;
+       }
+       switch(cmd){
+               case CONN_RELEASE:
+                       tcp_c->busy--;
+                       if (tcpconn->state==S_CONN_BAD){ 
+                               tcpconn_destroy(tcpconn);
+                               break;
+                       }
+                       io_watch_add(&io_h, tcpconn->s, F_TCPCONN, tcpconn);
+                       /* update the timeout*/
+                       tcpconn->timeout=get_ticks()+TCP_CON_TIMEOUT;
+                       tcpconn_put(tcpconn);
+                       DBG("handle_tcp_child: CONN_RELEASE  %p refcnt= %d\n", 
+                                                                                       tcpconn, tcpconn->refcnt);
+                       break;
+               case CONN_ERROR:
+               case CONN_DESTROY:
+               case CONN_EOF:
+                       /* WARNING: this will auto-dec. refcnt! */
+                               tcp_c->busy--;
+                               /* main doesn't listen on it => we don't have to delete it
+                                if (tcpconn->s!=-1)
+                                       io_watch_del(&io_h, tcpconn->s, -1);
+                               */
+                               tcpconn_destroy(tcpconn);
+                               break;
+               default:
+                               LOG(L_CRIT, "BUG: handle_tcp_child:  unknown cmd %d"
+                                                                       " from tcp reader %d\n",
+                                                                       cmd, (int)(tcp_c-&tcp_children[0]));
+       }
+end:
+       return bytes;
+error:
+       return -1;
+}
+
+
+
+/* handles io from a "generic" ser process (get fd or new_fd from a tcp_send)
+ * 
+ * params: p     - pointer in the ser processes array (pt[]), to the entry for
+ *                 which an io event was detected
+ *         fd_i  - fd index in the fd_array (usefull for optimizing
+ *                 io_watch_deletes)
+ * returns:  handle_* return convention:
+ *          -1 on error reading from the fd,
+ *           0 on EAGAIN  or when no  more io events are queued 
+ *             (receive buffer empty),
+ *           >0 on successfull reads from the fd (the receive buffer might
+ *             be non-empty).
+ */
+inline static int handle_ser_child(struct process_table* p, int fd_i)
+{
+       struct tcp_connection* tcpconn;
+       long response[2];
+       int cmd;
+       int bytes;
+       int ret;
        int fd;
-       struct socket_info* si;
+       
+       ret=-1;
+       if (p->unix_sock<=0){
+               /* (we can't have a fd==0, 0 is never closed )*/
+               LOG(L_CRIT, "BUG: handle_ser_child: fd %d for %d "
+                               "(pid %d)\n", p->unix_sock, (int)(p-&pt[0]), p->pid);
+               goto error;
+       }
+                       
+       /* get all bytes and the fd (if transmitted)
+        * (this is a SOCK_STREAM so read is not atomic) */
+       bytes=receive_fd(p->unix_sock, response, sizeof(response), &fd,
+                                               MSG_DONTWAIT);
+       if (bytes<(int)sizeof(response)){
+               /* too few bytes read */
+               if (bytes==0){
+                       /* EOF -> bad, child has died */
+                       DBG("DBG: handle_ser_child: dead child %d, pid %d"
+                                       " (shutting down?)\n", (int)(p-&pt[0]), p->pid);
+                       /* don't listen on it any more */
+                       io_watch_del(&io_h, p->unix_sock, fd_i);
+                       goto error; /* child dead => no further io events from it */
+               }else if (bytes<0){
+                       /* EAGAIN is ok if we try to empty the buffer
+                        * e.g: SIGIO_RT overflow mode or EPOLL ET */
+                       if ((errno!=EAGAIN) && (errno!=EWOULDBLOCK)){
+                               LOG(L_CRIT, "ERROR: handle_ser_child: read from child %d  "
+                                               "(pid %d):  %s [%d]\n", (int)(p-&pt[0]), p->pid,
+                                               strerror(errno), errno);
+                               ret=-1;
+                       }else{
+                               ret=0;
+                       }
+                       /* try to ignore ? */
+                       goto end;
+               }else{
+                       /* should never happen */
+                       LOG(L_CRIT, "BUG: handle_ser_child: too few bytes received (%d)\n",
+                                       bytes );
+                       ret=0; /* something was read so there is no error; otoh if
+                                         receive_fd returned less then requested => the receive
+                                         buffer is empty => no more io queued on this fd */
+                       goto end;
+               }
+       }
+       ret=1; /* something was received, there might be more queued */
+       DBG("handle_ser_child: read response= %lx, %ld, fd %d from %d (%d)\n",
+                                       response[0], response[1], fd, (int)(p-&pt[0]), p->pid);
+       cmd=response[1];
+       tcpconn=(struct tcp_connection*)response[0];
+       if (tcpconn==0){
+               LOG(L_CRIT, "BUG: handle_ser_child: null tcpconn pointer received"
+                                " from child %d (pid %d): %lx, %lx\n",
+                                       (int)(p-&pt[0]), p->pid, response[0], response[1]) ;
+               goto end;
+       }
+       switch(cmd){
+               case CONN_ERROR:
+                       if (tcpconn->s!=-1)
+                               io_watch_del(&io_h, tcpconn->s, -1);
+                       tcpconn_destroy(tcpconn);
+                       break;
+               case CONN_GET_FD:
+                       /* send the requested FD  */
+                       /* WARNING: take care of setting refcnt properly to
+                        * avoid race condition */
+                       if (send_fd(p->unix_sock, &tcpconn, sizeof(tcpconn),
+                                                       tcpconn->s)<=0){
+                               LOG(L_ERR, "ERROR: handle_ser_child: send_fd failed\n");
+                       }
+                       break;
+               case CONN_NEW:
+                       /* update the fd in the requested tcpconn*/
+                       /* WARNING: take care of setting refcnt properly to
+                        * avoid race condition */
+                       if (fd==-1){
+                               LOG(L_CRIT, "BUG: handle_ser_child: CONN_NEW:"
+                                                       " no fd received\n");
+                               break;
+                       }
+                       tcpconn->s=fd;
+                       /* add tcpconn to the list*/
+                       tcpconn_add(tcpconn);
+                       io_watch_add(&io_h, tcpconn->s, F_TCPCONN, tcpconn);
+                       /* update the timeout*/
+                       tcpconn->timeout=get_ticks()+TCP_CON_TIMEOUT;
+                       break;
+               default:
+                       LOG(L_CRIT, "BUG: handle_ser_child: unknown cmd %d\n", cmd);
+       }
+end:
+       return ret;
+error:
+       return -1;
+}
+
+
+
+/* generic handle io routine, it will call the appropiate
+ *  handle_xxx() based on the fd_map type
+ *
+ * params:  fm  - pointer to a fd hash entry
+ *          idx - index in the fd_array (or -1 if not known)
+ * return: -1 on error
+ *          0 on EAGAIN or when by some other way it is known that no more 
+ *            io events are queued on the fd (the receive buffer is empty).
+ *            Usefull to detect when there are no more io events queued for
+ *            sigio_rt, epoll_et, kqueue.
+ *         >0 on successfull read from the fd (when there might be more io
+ *            queued -- the receive buffer might still be non-empty)
+ */
+inline static int handle_io(struct fd_map* fm, int idx)
+{      
+       int ret;
+       
+       switch(fm->type){
+               case F_SOCKINFO:
+                       ret=handle_new_connect((struct socket_info*)fm->data);
+                       break;
+               case F_TCPCONN:
+                       ret=handle_tcpconn_ev((struct tcp_connection*)fm->data, idx);
+                       break;
+               case F_TCPCHILD:
+                       ret=handle_tcp_child((struct tcp_child*)fm->data, idx);
+                       break;
+               case F_PROC:
+                       ret=handle_ser_child((struct process_table*)fm->data, idx);
+                       break;
+               case F_NONE:
+                       LOG(L_CRIT, "BUG: handle_io: empty fd map\n");
+                       goto error;
+               default:
+                       LOG(L_CRIT, "BUG: handle_io: uknown fd type %d\n", fm->type); 
+                       goto error;
+       }
+       return ret;
+error:
+       return -1;
+}
+
+
+
+/* very inefficient for now - FIXME*/
+static void tcpconn_timeout()
+{
+       struct tcp_connection *c, *next;
+       int ticks;
+       unsigned h;
+       int fd;
+       
+       
+       ticks=get_ticks();
+       TCPCONN_LOCK; /* fixme: we can lock only on delete IMO */
+       for(h=0; h<TCP_ID_HASH_SIZE; h++){
+               c=tcpconn_id_hash[h];
+               while(c){
+                       next=c->id_next;
+                       if ((c->refcnt==0) && (ticks>c->timeout)) {
+                               DBG("tcpconn_timeout: timeout for hash=%d - %p (%d > %d)\n",
+                                               h, c, ticks, c->timeout);
+                               fd=c->s;
+#ifdef USE_TLS
+                               if (c->type==PROTO_TLS)
+                                       tls_close(c, fd);
+#endif
+                               _tcpconn_rm(c);
+                               if (fd>0) {
+                                       io_watch_del(&io_h, fd, -1);
+                                       close(fd);
+                               }
+                       }
+                       c=next;
+               }
+       }
+       TCPCONN_UNLOCK;
+}
+
 
-       /*init */
-       maxfd=0;
-       FD_ZERO(&master_set);
-       /* set all the listen addresses */
+
+/* tcp main loop */
+void tcp_main_loop()
+{
+
+       struct socket_info* si;
+       int r;
+       
+       /* init io_wait (here because we want the memory allocated only in
+        * the tcp_main process) */
+       
+       /* FIXME: TODO: make tcp_max_fd_no a config param */
+       if  (init_io_wait(&io_h, tcp_max_fd_no, tcp_poll_method)<0)
+               goto error;
+       /* init: start watching all the fds*/
+       
+       /* add all the sockets we listens on for connections */
        for (si=tcp_listen; si; si=si->next){
                if ((si->proto==PROTO_TCP) &&(si->socket!=-1)){
-                       FD_SET(si->socket, &master_set);
-                       if (si->socket>maxfd) maxfd=si->socket;
+                       if (io_watch_add(&io_h, si->socket, F_SOCKINFO, si)<0){
+                               LOG(L_CRIT, "ERROR: tcp_main_loop: init: failed to add "
+                                                       "listen socket to the fd list\n");
+                               goto error;
+                       }
                }else{
                        LOG(L_CRIT, "BUG: tcp_main_loop: non tcp address in tcp_listen\n");
                }
@@ -1060,239 +1434,113 @@ void tcp_main_loop()
        if (!tls_disable){
                for (si=tls_listen; si; si=si->next){
                        if ((si->proto==PROTO_TLS) && (si->socket!=-1)){
-                               FD_SET(si->socket, &master_set);
-                               if (si->socket>maxfd) maxfd=si->socket;
+                               if (io_watch_add(&io_h, si->socket, F_SOCKINFO, si)<0){
+                                       LOG(L_CRIT, "ERROR: tcp_main_loop: init: failed to add "
+                                                       "tls listen socket to the fd list\n");
+                                       goto error;
+                               }
                        }else{
                                LOG(L_CRIT, "BUG: tcp_main_loop: non tls address"
                                                " in tls_listen\n");
                        }
                }
-       }
 #endif
-       /* set all the unix sockets used for child comm */
+       /* add all the unix sockets used for communcation with other ser processes
+        *  (get fd, new connection a.s.o) */
        for (r=1; r<process_no; r++){
-               if (pt[r].unix_sock>0){ /* we can't have 0, we never close it!*/
-                       FD_SET(pt[r].unix_sock, &master_set);
-                       if (pt[r].unix_sock>maxfd) maxfd=pt[r].unix_sock;
-               }
+               if (pt[r].unix_sock>0) /* we can't have 0, we never close it!*/
+                       if (io_watch_add(&io_h, pt[r].unix_sock, F_PROC, &pt[r])<0){
+                                       LOG(L_CRIT, "ERROR: tcp_main_loop: init: failed to add "
+                                                       "process %d unix socket to the fd list\n", r);
+                                       goto error;
+                       }
        }
+       /* add all the unix sokets used for communication with the tcp childs */
        for (r=0; r<tcp_children_no; r++){
-               if (tcp_children[r].unix_sock>0){ /* we can't have 0, 
-                                                                                        we never close it!*/
-                       FD_SET(tcp_children[r].unix_sock, &master_set);
-                       if (tcp_children[r].unix_sock>maxfd)
-                               maxfd=tcp_children[r].unix_sock;
-               }
+               if (tcp_children[r].unix_sock>0)/*we can't have 0, we never close it!*/
+                       if (io_watch_add(&io_h, tcp_children[r].unix_sock, F_TCPCHILD,
+                                                       &tcp_children[r]) <0){
+                               LOG(L_CRIT, "ERROR: tcp_main_loop: init: failed to add "
+                                               "tcp child %d unix socket to the fd list\n", r);
+                               goto error;
+                       }
        }
        
-       
-       /* main loop*/
-       
-       while(1){
-               sel_set=master_set;
-               timeout.tv_sec=TCP_MAIN_SELECT_TIMEOUT;
-               timeout.tv_usec=0;
-               n=select(maxfd+1, &sel_set, 0 ,0 , &timeout);
-               if (n<0){
-                       if (errno==EINTR) continue; /* just a signal */
-                       /* errors */
-                       LOG(L_ERR, "ERROR: tcp_main_loop: select:(%d) %s\n", errno,
-                                       strerror(errno));
-                       n=0;
-               }
-               
-               for (si=tcp_listen; si && n; si=si->next)
-                       handle_new_connect(si, &sel_set, &n);
-#ifdef USE_TLS
-                       if (!tls_disable)
-                               for (si=tls_listen; si && n; si=si->next)
-                                       handle_new_connect(si, &sel_set, &n);
+       /* main loop */
+       switch(io_h.poll_method){
+               case POLL_POLL:
+                       while(1){
+                               /* wait and process IO */
+                               io_wait_loop_poll(&io_h, TCP_MAIN_SELECT_TIMEOUT, 0); 
+                               /* remove old connections */
+                               tcpconn_timeout();
+                       }
+                       break;
+#ifdef HAVE_SELECT
+               case POLL_SELECT:
+                       while(1){
+                               io_wait_loop_select(&io_h, TCP_MAIN_SELECT_TIMEOUT, 0);
+                               tcpconn_timeout();
+                       }
+                       break;
 #endif
-               
-               /* check all the read fds (from the tcpconn_addr_hash ) */
-               for (h=0; h<TCP_ID_HASH_SIZE; h++){
-                       for(tcpconn=tcpconn_id_hash[h]; tcpconn && n; 
-                                       tcpconn=tcpconn->id_next){
-                               /* FIXME: is refcnt==0 really necessary? */
-                               if ((tcpconn->refcnt==0)&&(FD_ISSET(tcpconn->s, &sel_set))){
-                                       /* new data available */
-                                       n--;
-                                       /* pass it to child, so remove it from select list */
-                                       DBG("tcp_main_loop: data available on %p [h:%d] %d\n",
-                                                       tcpconn, h, tcpconn->s);
-                                       FD_CLR(tcpconn->s, &master_set);
-                                       tcpconn_ref(tcpconn); /* refcnt ++ */
-                                       if (send2child(tcpconn)<0){
-                                               LOG(L_ERR,"ERROR: tcp_main_loop: no "
-                                                                       "children available\n");
-                                               TCPCONN_LOCK;
-                                               tcpconn->refcnt--;
-                                               if (tcpconn->refcnt==0){
-                                                       fd=tcpconn->s;
-                                                       _tcpconn_rm(tcpconn);
-                                                       close(fd);
-                                               }else tcpconn->timeout=0; /* force expire*/
-                                               TCPCONN_UNLOCK;
-                                       }
-                               }
+#ifdef HAVE_SIGIO_RT
+               case POLL_SIGIO_RT:
+                       while(1){
+                               io_wait_loop_sigio_rt(&io_h, TCP_MAIN_SELECT_TIMEOUT);
+                               tcpconn_timeout();
                        }
-               }
-               /* check unix sockets & listen | destroy connections */
-               /* tcp_children readers first */
-               for (r=0; r<tcp_children_no && n; r++){
-                       if ( (tcp_children[r].unix_sock>0) && 
-                                       FD_ISSET(tcp_children[r].unix_sock, &sel_set)){
-                               /* (we can't have a fd==0, 0 is never closed )*/
-                               n--;
-                               /* read until sizeof(response)
-                                * (this is a SOCK_STREAM so read is not atomic */
-                               bytes=recv_all(tcp_children[r].unix_sock, response,
-                                                               sizeof(response));
-                               if (bytes==0){
-                                       /* EOF -> bad, child has died */
-                                       DBG("DBG: tcp_main_loop: dead tcp child %d"
-                                                       " (shutting down?)\n", r);
-                                       /* don't listen on it any more */
-                                       FD_CLR(tcp_children[r].unix_sock, &master_set);
-                                       /*exit(-1);*/
-                                       continue; /* skip this and try the next one */
-                               }else if (bytes<0){
-                                       LOG(L_CRIT, "ERROR: tcp_main_loop: read from tcp child %d "
-                                                       "%s\n", r, strerror(errno));
-                                       /* try to ignore ? */
-                                       continue; /* skip this and try the next one */
-                               }
-                                       
-                               DBG("tcp_main_loop: reader response= %lx, %ld from %d \n",
-                                               response[0], response[1], r);
-                               cmd=response[1];
-                               tcpconn=(struct tcp_connection*)response[0];
-                               switch(cmd){
-                                       case CONN_RELEASE:
-                                               tcp_children[r].busy--;
-                                               if (tcpconn){
-                                                               if (tcpconn->state==S_CONN_BAD){ 
-                                                                       tcpconn_destroy(tcpconn);
-                                                                       break;
-                                                               }
-                                                               FD_SET(tcpconn->s, &master_set);
-                                                               if (maxfd<tcpconn->s) maxfd=tcpconn->s;
-                                                               /* update the timeout*/
-                                                               tcpconn->timeout=get_ticks()+TCP_CON_TIMEOUT;
-                                                               tcpconn_put(tcpconn);
-                                                               DBG("tcp_main_loop: CONN_RELEASE  %p"
-                                                                               " refcnt= %d\n", 
-                                                                               tcpconn, tcpconn->refcnt);
-                                               }
-                                               break;
-                                       case CONN_ERROR:
-                                       case CONN_DESTROY:
-                                       case CONN_EOF:
-                                               /* WARNING: this will auto-dec. refcnt! */
-                                               tcp_children[r].busy--;
-                                               if (tcpconn){
-                                                       if (tcpconn->s!=-1)
-                                                               FD_CLR(tcpconn->s, &master_set);
-                                                       tcpconn_destroy(tcpconn);
-                                               }
-                                               break;
-                                       default:
-                                                       LOG(L_CRIT, "BUG: tcp_main_loop:  unknown cmd %d"
-                                                                               " from tcp reader %d\n",
-                                                                       cmd, r);
-                               }
+                       break;
+#endif
+#ifdef HAVE_EPOLL
+               case POLL_EPOLL_LT:
+                       while(1){
+                               io_wait_loop_epoll(&io_h, TCP_MAIN_SELECT_TIMEOUT, 0);
+                               tcpconn_timeout();
                        }
-               }
-               /* check "send" unix sockets & listen | destroy connections */
-               /* start from 1, the "main" process does not transmit anything*/
-               for (r=1; r<process_no && n; r++){
-                       if ( (pt[r].unix_sock>0) && FD_ISSET(pt[r].unix_sock, &sel_set)){
-                               /* (we can't have a fd==0, 0 is never closed )*/
-                               n--;
-                               /* read until sizeof(response)
-                                * (this is a SOCK_STREAM so read is not atomic */
-                               bytes=recv_all(pt[r].unix_sock, response, sizeof(response));
-                               if (bytes==0){
-                                       /* EOF -> bad, child has died */
-                                       DBG("DBG: tcp_main_loop: dead child %d"
-                                                       " (shutting down?)\n", r);
-                                       /* don't listen on it any more */
-                                       FD_CLR(pt[r].unix_sock, &master_set);
-                                       /*exit(-1);*/
-                                       continue; /* skip this and try the next one */
-                               }else if (bytes<0){
-                                       LOG(L_CRIT, "ERROR: tcp_main_loop: read from child:  %s\n",
-                                                       strerror(errno));
-                                       /* try to ignore ? */
-                                       continue; /* skip this and try the next one */
-                               }
-                                       
-                               DBG("tcp_main_loop: read response= %lx, %ld from %d (%d)\n",
-                                               response[0], response[1], r, pt[r].pid);
-                               cmd=response[1];
-                               tcpconn=(struct tcp_connection*)response[0];
-                               switch(cmd){
-                                       case CONN_ERROR:
-                                               if (tcpconn){
-                                                       if (tcpconn->s!=-1)
-                                                               FD_CLR(tcpconn->s, &master_set);
-                                                       tcpconn_destroy(tcpconn);
-                                               }
-                                               break;
-                                       case CONN_GET_FD:
-                                               /* send the requested FD  */
-                                               /* WARNING: take care of setting refcnt properly to
-                                                * avoid race condition */
-                                               if (tcpconn){
-                                                       if (send_fd(pt[r].unix_sock, &tcpconn,
-                                                                               sizeof(tcpconn), tcpconn->s)<=0){
-                                                               LOG(L_ERR, "ERROR: tcp_main_loop:"
-                                                                               "send_fd failed\n");
-                                                       }
-                                               }else{
-                                                       LOG(L_CRIT, "BUG: tcp_main_loop: null pointer\n");
-                                               }
-                                               break;
-                                       case CONN_NEW:
-                                               /* update the fd in the requested tcpconn*/
-                                               /* WARNING: take care of setting refcnt properly to
-                                                * avoid race condition */
-                                               if (tcpconn){
-                                                       bytes=receive_fd(pt[r].unix_sock, &tcpconn,
-                                                                       sizeof(tcpconn), &tcpconn->s);
-                                                               if (bytes<sizeof(tcpconn)){
-                                                                       if (bytes<0){
-                                                                               LOG(L_CRIT, "BUG: tcp_main_loop:"
-                                                                                               " CONN_NEW: receive_fd "
-                                                                                               "failed\n");
-                                                                       }else{
-                                                                               LOG(L_CRIT, "BUG: tcp_main_loop:"
-                                                                                               " CONN_NEW: to few bytes "
-                                                                                               "received (%d)\n", bytes );
-                                                                       }
-                                                                       break; /* try to ignore */
-                                                               }
-                                                       /* add tcpconn to the list*/
-                                                       tcpconn_add(tcpconn);
-                                                       FD_SET(tcpconn->s, &master_set);
-                                                       if (maxfd<tcpconn->s) maxfd=tcpconn->s;
-                                                       /* update the timeout*/
-                                                       tcpconn->timeout=get_ticks()+TCP_CON_TIMEOUT;
-                                               }else{
-                                                       LOG(L_CRIT, "BUG: tcp_main_loop: null pointer\n");
-                                               }
-                                               break;
-                                       default:
-                                                       LOG(L_CRIT, "BUG: tcp_main_loop: unknown cmd %d\n",
-                                                                       cmd);
-                               }
+                       break;
+               case POLL_EPOLL_ET:
+                       while(1){
+                               io_wait_loop_epoll(&io_h, TCP_MAIN_SELECT_TIMEOUT, 1);
+                               tcpconn_timeout();
                        }
-               } /* for */
-               
-               /* remove old connections */
-               tcpconn_timeout(&master_set);
-       
+                       break;
+#endif
+               default:
+                       LOG(L_CRIT, "BUG: tcp_main_loop: no support for poll method "
+                                       " %s (%d)\n", 
+                                       poll_method_name(io_h.poll_method), io_h.poll_method);
+                       goto error;
        }
+error:
+       destroy_io_wait(&io_h);
+       LOG(L_CRIT, "ERROR: tcp_main_loop: exiting...");
+       exit(-1);
+}
+
+
+
+
+/* cleanup before exit */
+void destroy_tcp()
+{
+               if (tcpconn_id_hash){
+                       shm_free(tcpconn_id_hash);
+                       tcpconn_id_hash=0;
+               }
+               if (connection_id){
+                       shm_free(connection_id);
+                       connection_id=0;
+               }
+               if (tcpconn_aliases_hash){
+                       shm_free(tcpconn_aliases_hash);
+                       tcpconn_aliases_hash=0;
+               }
+               if (tcpconn_lock){
+                       lock_destroy(tcpconn_lock);
+                       lock_dealloc((void*)tcpconn_lock);
+                       tcpconn_lock=0;
+               }
 }
 
 
@@ -1315,9 +1563,6 @@ int init_tcp()
        connection_id=(int*)shm_malloc(sizeof(int));
        if (connection_id==0){
                LOG(L_CRIT, "ERROR: init_tcp: could not alloc globals\n");
-               lock_destroy(tcpconn_lock);
-               lock_dealloc((void*)tcpconn_lock);
-               tcpconn_lock=0;
                goto error;
        }
        *connection_id=1;
@@ -1326,25 +1571,12 @@ int init_tcp()
                        shm_malloc(TCP_ALIAS_HASH_SIZE* sizeof(struct tcp_conn_alias*));
        if (tcpconn_aliases_hash==0){
                LOG(L_CRIT, "ERROR: init_tcp: could not alloc address hashtable\n");
-               shm_free(connection_id);
-               connection_id=0;
-               lock_destroy(tcpconn_lock);
-               lock_dealloc((void*)tcpconn_lock);
-               tcpconn_lock=0;
                goto error;
        }
-       
        tcpconn_id_hash=(struct tcp_connection**)shm_malloc(TCP_ID_HASH_SIZE*
                                                                sizeof(struct tcp_connection*));
        if (tcpconn_id_hash==0){
                LOG(L_CRIT, "ERROR: init_tcp: could not alloc id hashtable\n");
-               shm_free(connection_id);
-               connection_id=0;
-               shm_free(tcpconn_aliases_hash);
-               tcpconn_aliases_hash=0;
-               lock_destroy(tcpconn_lock);
-               lock_dealloc((void*)tcpconn_lock);
-               tcpconn_lock=0;
                goto error;
        }
        /* init hashtables*/
@@ -1352,34 +1584,13 @@ int init_tcp()
                        TCP_ALIAS_HASH_SIZE * sizeof(struct tcp_conn_alias*));
        memset((void*)tcpconn_id_hash, 0, 
                        TCP_ID_HASH_SIZE * sizeof(struct tcp_connection*));
+       
+       
        return 0;
 error:
-               return -1;
-}
-
-
-
-/* cleanup before exit */
-void destroy_tcp()
-{
-       if (tcpconn_lock){
-               lock_destroy(tcpconn_lock);
-               lock_dealloc((void*)tcpconn_lock);
-               tcpconn_lock=0;
-       }
-       if(tcpconn_aliases_hash){
-               shm_free(tcpconn_aliases_hash);
-               tcpconn_aliases_hash=0;
-       }
-       if(tcpconn_id_hash){
-               shm_free(tcpconn_id_hash);
-               tcpconn_id_hash=0;
-       }
-       
-       if(connection_id){
-               shm_free(connection_id);
-               connection_id=0;
-       }
+       /* clean-up */
+       destroy_tcp();
+       return -1;
 }
 
 
@@ -1434,6 +1645,9 @@ int tcp_init_children()
                        unix_tcp_sock=sockfd[1];
                        bind_address=0; /* force a SEGFAULT if someone uses a non-init.
                                                           bind address on tcp */
+                       /* record pid twice to avoid the child using it, before
+                        * parent gets a chance to set it*/
+                       pt[process_no].pid=getpid();
                        if (init_child(r+children_no+1) < 0) {
                                LOG(L_ERR, "init_children failed\n");
                                goto error;
index f5d2ec0..68db461 100644 (file)
@@ -588,7 +588,7 @@ void tcp_receive_loop(int unix_sock)
                        if (FD_ISSET(unix_sock, &sel_set)){
                                nfds--;
                                /* a new conn from "main" */
-                               n=receive_fd(unix_sock, &con, sizeof(con), &s);
+                               n=receive_fd(unix_sock, &con, sizeof(con), &s, 0);
                                if (n<0){
                                        if (errno == EWOULDBLOCK || errno == EAGAIN ||
                                                        errno == EINTR){
index 10848e0..8519711 100644 (file)
@@ -605,6 +605,9 @@ int init_unixsock_children(void)
                                unix_tcp_sock=sockfd[1];
                        }
 #endif
+                       /* record pid twice to avoid the child using it, before
+                        * parent gets a chance to set it*/
+                       pt[process_no].pid=getpid();
                        if (init_child(PROC_UNIXSOCK) < 0) {
                                LOG(L_ERR, "init_unixsock_server: Error in "
                                    "init_child\n");