@cfg_get.<group_name>.<var_name> is documented
[sip-router] / tcp_main.c
1 /*
2  * $Id$
3  *
4  * Copyright (C) 2001-2003 FhG Fokus
5  *
6  * This file is part of ser, a free SIP server.
7  *
8  * ser is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version
12  *
13  * For a license to use the ser software under conditions
14  * other than those described here, or to purchase support for this
15  * software, please contact iptel.org by e-mail at the following addresses:
16  *    info@iptel.org
17  *
18  * ser is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program; if not, write to the Free Software
25  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
26  */
27 /*
28  * History:
29  * --------
30  *  2002-11-29  created by andrei
31  *  2002-12-11  added tcp_send (andrei)
32  *  2003-01-20  locking fixes, hashtables (andrei)
33  *  2003-02-20  s/lock_t/gen_lock_t/ to avoid a conflict on solaris (andrei)
34  *  2003-02-25  Nagle is disabled if -DDISABLE_NAGLE (andrei)
35  *  2003-03-29  SO_REUSEADDR before calling bind to allow
36  *              server restart, Nagle set on the (hopefully)
37  *              correct socket (jiri)
38  *  2003-03-31  always try to find the corresponding tcp listen socket for
39  *               a temp. socket and store it in *->bind_address: added
40  *               find_tcp_si, modified tcpconn_connect (andrei)
41  *  2003-04-14  set sockopts to TOS low delay (andrei)
42  *  2003-06-30  moved tcp new connect checking & handling to
43  *               handle_new_connect (andrei)
44  *  2003-07-09  tls_close called before closing the tcp connection (andrei)
45  *  2003-10-24  converted to the new socket_info lists (andrei)
46  *  2003-10-27  tcp port aliases support added (andrei)
47  *  2003-11-04  always lock before manipulating refcnt; sendchild
48  *              does not inc refcnt by itself anymore (andrei)
49  *  2003-11-07  different unix sockets are used for fd passing
50  *              to/from readers/writers (andrei)
51  *  2003-11-17  handle_new_connect & tcp_connect will close the 
52  *              new socket if tcpconn_new return 0 (e.g. out of mem) (andrei)
53  *  2003-11-28  tcp_blocking_write & tcp_blocking_connect added (andrei)
54  *  2004-11-08  dropped find_tcp_si and replaced with find_si (andrei)
55  *  2005-06-07  new tcp optimized code, supports epoll (LT), sigio + real time
56  *               signals, poll & select (andrei)
57  *  2005-06-26  *bsd kqueue support (andrei)
58  *  2005-07-04  solaris /dev/poll support (andrei)
59  *  2005-07-08  tcp_max_connections, tcp_connection_lifetime, don't accept
60  *               more connections if tcp_max_connections is exceeded (andrei)
61  *  2005-10-21  cleanup all the open connections on exit
62  *              decrement the no. of open connections on timeout too    (andrei)
 *  2006-01-30  queue send_fd request and execute them at the end of the
63  *              poll loop  (#ifdef) (andrei)
64  *              process all children requests, before attempting to send
65  *              them new stuff (fixes some deadlocks) (andrei)
66  *  2006-02-03  timers are run only once per s (andrei)
67  *              tcp children fds can be non-blocking; send fds are queued on
68  *              EAGAIN; lots of bug fixes (andrei)
69  *  2006-02-06  better tcp_max_connections checks, tcp_connections_no moved to
70  *              shm (andrei)
71  *  2006-04-12  tcp_send() changed to use struct dest_info (andrei)
72  *  2006-11-02  switched to atomic ops for refcnt, locking improvements 
73  *               (andrei)
74  *  2006-11-04  switched to raw ticks (to fix conversion errors which could
75  *               result in inf. lifetime) (andrei)
76  *  2007-07-25  tcpconn_connect can now bind the socket on a specified
77  *                source addr/port (andrei)
78  *  2007-07-26   tcp_send() and tcpconn_get() can now use a specified source
79  *                addr./port (andrei)
80  *  2007-08-23   getsockname() for INADDR_ANY(SI_IS_ANY) sockets (andrei)
81  *  2007-08-27   split init_sock_opt into a lightweight init_sock_opt_accept() 
82  *               used when accepting connections and init_sock_opt used for 
83  *               connect/ new sockets (andrei)
84  *  2007-11-22  always add the connection & clear the corresponding flags before
85  *               io_watch_add-ing its fd - it's safer this way (andrei)
86  *  2007-11-26  improved tcp timers: switched to local_timer (andrei)
87  *  2007-11-27  added send fd cache and reader fd reuse (andrei)
88  *  2007-11-28  added support for TCP_DEFER_ACCEPT, KEEPALIVE, KEEPINTVL,
89  *               KEEPCNT, QUICKACK, SYNCNT, LINGER2 (andrei)
90  *  2007-12-04  support for queueing write requests (andrei)
91  *  2007-12-12  destroy connection asap on wbuf. timeout (andrei)
92  *  2007-12-13  changed the refcnt and destroy scheme, now refcnt is 1 if
93  *                linked into the hash tables (was 0) (andrei)
94  *  2007-12-21  support for pending connects (connections are added to the
95  *               hash immediately and writes on them are buffered) (andrei)
96  */
97
98
99 #ifdef USE_TCP
100
101
102 #ifndef SHM_MEM
103 #error "shared memory support needed (add -DSHM_MEM to Makefile.defs)"
104 #endif
105
106 #include <sys/time.h>
107 #include <sys/types.h>
108 #include <sys/select.h>
109 #include <sys/socket.h>
110 #include <netinet/in.h>
111 #include <netinet/in_systm.h>
112 #include <netinet/ip.h>
113 #include <netinet/tcp.h>
114 #include <sys/uio.h>  /* writev*/
115 #include <netdb.h>
116 #include <stdlib.h> /*exit() */
117
118 #include <unistd.h>
119
120 #include <errno.h>
121 #include <string.h>
122
123 #ifdef HAVE_SELECT
124 #include <sys/select.h>
125 #endif
126 #include <sys/poll.h>
127
128
129 #include "ip_addr.h"
130 #include "pass_fd.h"
131 #include "tcp_conn.h"
132 #include "globals.h"
133 #include "pt.h"
134 #include "locking.h"
135 #include "mem/mem.h"
136 #include "mem/shm_mem.h"
137 #include "timer.h"
138 #include "sr_module.h"
139 #include "tcp_server.h"
140 #include "tcp_init.h"
141 #include "tsend.h"
142 #include "timer_ticks.h"
143 #include "local_timer.h"
144 #ifdef CORE_TLS
145 #include "tls/tls_server.h"
146 #define tls_loaded() 1
147 #else
148 #include "tls_hooks_init.h"
149 #include "tls_hooks.h"
150 #endif
151
152 #include "tcp_info.h"
153 #include "tcp_options.h"
154 #include "ut.h"
155 #include "cfg/cfg_struct.h"
156
157 #define local_malloc pkg_malloc
158 #define local_free   pkg_free
159
160 #define HANDLE_IO_INLINE
161 #include "io_wait.h"
162 #include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
163
164
/* don't pass a new connection immediately to a child, wait for some data
 * on it first */
#define TCP_PASS_NEW_CONNECTION_ON_DATA
#define TCP_LISTEN_BACKLOG 1024
/* queue send fd requests on EAGAIN, instead of sending them immediately */
#define SEND_FD_QUEUE
#define TCP_CHILD_NON_BLOCKING
#ifdef SEND_FD_QUEUE
/* the send fd queue requires TCP_CHILD_NON_BLOCKING */
#ifndef TCP_CHILD_NON_BLOCKING
#define TCP_CHILD_NON_BLOCKING
#endif
#define MAX_SEND_FD_QUEUE_SIZE	tcp_main_max_fd_no
/* initial size */
#define SEND_FD_QUEUE_SIZE		128
/* FIXME: not used for now */
#define MAX_SEND_FD_RETRIES		96
/* 2 s */
#define SEND_FD_QUEUE_TIMEOUT	MS_TO_TICKS(2000)
#endif

/* maximum accepted lifetime (maximum possible is  ~ MAXINT/2) */
#define MAX_TCP_CON_LIFETIME	((1U<<(sizeof(ticks_t)*8-1))-1)
/* minimum interval local_timer_run() is allowed to run, in ticks
 * (1 => once per tick) */
#define TCPCONN_TIMEOUT_MIN_RUN 1
/* 1 tick */
#define TCPCONN_WAIT_TIMEOUT 1

#ifdef TCP_BUF_WRITE
/* FIXME: after debugging switch to 16-32k */
#define TCP_WBUF_SIZE	1024
/* running total of bytes queued in the write buffers; the pointer is only
 * declared here, it starts out null */
static unsigned int* tcp_total_wq=0;
#endif
192
193
/* kinds of file descriptors handled by this file's io loop */
enum fd_types {
	F_NONE,
	F_SOCKINFO,	/* a tcp_listen fd */
	F_TCPCONN,
	F_TCPCHILD,
	F_PROC
};
196
197
#ifdef TCP_FD_CACHE

/* number of slots in the fd cache */
#define TCP_FD_CACHE_SIZE 8

/* one fd cache slot: a connection pointer, an id and an fd */
struct fd_cache_entry{
	struct tcp_connection* con;
	int id;
	int fd;
};


/* the fd cache itself (file-local, zero initialized) */
static struct fd_cache_entry fd_cache[TCP_FD_CACHE_SIZE];

#endif /* TCP_FD_CACHE */
211
212 static int is_tcp_main=0;
213
214 int tcp_accept_aliases=0; /* by default don't accept aliases */
215 /* flags used for adding new aliases */
216 int tcp_alias_flags=TCP_ALIAS_FORCE_ADD;
217 /* flags used for adding the default aliases of a new tcp connection */
218 int tcp_new_conn_alias_flags=TCP_ALIAS_REPLACE;
219 int tcp_connect_timeout=DEFAULT_TCP_CONNECT_TIMEOUT;
220 int tcp_send_timeout=DEFAULT_TCP_SEND_TIMEOUT;
221 int tcp_con_lifetime=DEFAULT_TCP_CONNECTION_LIFETIME;
222 enum poll_types tcp_poll_method=0; /* by default choose the best method */
223 int tcp_max_connections=DEFAULT_TCP_MAX_CONNECTIONS;
224 int tcp_main_max_fd_no=0;
225
226 static union sockaddr_union tcp_source_ipv4_addr; /* saved bind/srv v4 addr. */
227 static union sockaddr_union* tcp_source_ipv4=0;
228 #ifdef USE_IPV6
229 static union sockaddr_union tcp_source_ipv6_addr; /* saved bind/src v6 addr. */
230 static union sockaddr_union* tcp_source_ipv6=0;
231 #endif
232
233 static int* tcp_connections_no=0; /* current open connections */
234
235 /* connection hash table (after ip&port) , includes also aliases */
236 struct tcp_conn_alias** tcpconn_aliases_hash=0;
237 /* connection hash table (after connection id) */
238 struct tcp_connection** tcpconn_id_hash=0;
239 gen_lock_t* tcpconn_lock=0;
240
241 struct tcp_child* tcp_children;
242 static int* connection_id=0; /*  unique for each connection, used for 
243                                                                 quickly finding the corresponding connection
244                                                                 for a reply */
245 int unix_tcp_sock;
246
247 static int tcp_proto_no=-1; /* tcp protocol number as returned by
248                                                            getprotobyname */
249
250 static io_wait_h io_h;
251
252 static struct local_timer tcp_main_ltimer;
253 static ticks_t tcp_main_prev_ticks;
254
255
256 static ticks_t tcpconn_main_timeout(ticks_t , struct timer_ln* , void* );
257
258 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
259                                                                                 struct ip_addr* l_ip, int l_port,
260                                                                                 int flags);
261
262
263
264 /* sets source address used when opening new sockets and no source is specified
265  *  (by default the address is choosen by the kernel)
266  * Should be used only on init.
267  * returns -1 on error */
268 int tcp_set_src_addr(struct ip_addr* ip)
269 {
270         switch (ip->af){
271                 case AF_INET:
272                         ip_addr2su(&tcp_source_ipv4_addr, ip, 0);
273                         tcp_source_ipv4=&tcp_source_ipv4_addr;
274                         break;
275                 #ifdef USE_IPV6
276                 case AF_INET6:
277                         ip_addr2su(&tcp_source_ipv6_addr, ip, 0);
278                         tcp_source_ipv6=&tcp_source_ipv6_addr;
279                         break;
280                 #endif
281                 default:
282                         return -1;
283         }
284         return 0;
285 }
286
287
288
/* helper: sets an int-valued socket option; returns setsockopt()'s result
 * and leaves errno as setsockopt() set it */
static inline int ka_set_int_opt(int s, int level, int name, int value)
{
	return setsockopt(s, level, name, &value, sizeof(value));
}



/* Applies the keepalive related settings from tcp_options to socket s.
 * Each option is attempted only if its HAVE_* macro is defined and the
 * corresponding tcp_options field is non-zero.
 * Returns -1 if SO_KEEPALIVE could not be enabled; failures to set the
 * probe interval, idle interval or probe count are only logged.
 * Returns 0 in all the other cases. */
static inline int init_sock_keepalive(int s)
{
#ifdef HAVE_SO_KEEPALIVE
	if (tcp_options.keepalive &&
			ka_set_int_opt(s, SOL_SOCKET, SO_KEEPALIVE, 1)<0){
		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to enable"
					" SO_KEEPALIVE: %s\n", strerror(errno));
		return -1;
	}
#endif
#ifdef HAVE_TCP_KEEPINTVL
	if (tcp_options.keepintvl &&
			ka_set_int_opt(s, IPPROTO_TCP, TCP_KEEPINTVL,
							tcp_options.keepintvl)<0){
		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
					" keepalive probes interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPIDLE
	if (tcp_options.keepidle &&
			ka_set_int_opt(s, IPPROTO_TCP, TCP_KEEPIDLE,
							tcp_options.keepidle)<0){
		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
					" keepalive idle interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPCNT
	if (tcp_options.keepcnt &&
			ka_set_int_opt(s, IPPROTO_TCP, TCP_KEEPCNT,
							tcp_options.keepcnt)<0){
		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
					" maximum keepalive count: %s\n", strerror(errno));
	}
#endif
	return 0;
}
336
337
338
339 /* set all socket/fd options for new sockets (e.g. before connect): 
340  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
341  *
342  * return -1 on error */
343 static int init_sock_opt(int s)
344 {
345         int flags;
346         int optval;
347         
348 #ifdef DISABLE_NAGLE
349         flags=1;
350         if ( (tcp_proto_no!=-1) && (setsockopt(s, tcp_proto_no , TCP_NODELAY,
351                                         &flags, sizeof(flags))<0) ){
352                 LOG(L_WARN, "WARNING: init_sock_opt: could not disable Nagle: %s\n",
353                                 strerror(errno));
354         }
355 #endif
356         /* tos*/
357         optval = tos;
358         if (setsockopt(s, IPPROTO_IP, IP_TOS, (void*)&optval,sizeof(optval)) ==-1){
359                 LOG(L_WARN, "WARNING: init_sock_opt: setsockopt tos: %s\n",
360                                 strerror(errno));
361                 /* continue since this is not critical */
362         }
363 #if  !defined(TCP_DONT_REUSEADDR) 
364         optval=1;
365         if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
366                                                 (void*)&optval, sizeof(optval))==-1){
367                 LOG(L_ERR, "ERROR: setsockopt SO_REUSEADDR %s\n",
368                                 strerror(errno));
369                 /* continue, not critical */
370         }
371 #endif /* !TCP_DONT_REUSEADDR */
372 #ifdef HAVE_TCP_SYNCNT
373         if (tcp_options.syncnt){
374                 optval=tcp_options.syncnt;
375                 if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
376                                                 sizeof(optval))<0){
377                         LOG(L_WARN, "WARNING: init_sock_opt: failed to set"
378                                                 " maximum SYN retr. count: %s\n", strerror(errno));
379                 }
380         }
381 #endif
382 #ifdef HAVE_TCP_LINGER2
383         if (tcp_options.linger2){
384                 optval=tcp_options.linger2;
385                 if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
386                                                 sizeof(optval))<0){
387                         LOG(L_WARN, "WARNING: init_sock_opt: failed to set"
388                                                 " maximum LINGER2 timeout: %s\n", strerror(errno));
389                 }
390         }
391 #endif
392 #ifdef HAVE_TCP_QUICKACK
393         if (tcp_options.delayed_ack){
394                 optval=0; /* reset quick ack => delayed ack */
395                 if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
396                                                 sizeof(optval))<0){
397                         LOG(L_WARN, "WARNING: init_sock_opt: failed to reset"
398                                                 " TCP_QUICKACK: %s\n", strerror(errno));
399                 }
400         }
401 #endif /* HAVE_TCP_QUICKACK */
402         init_sock_keepalive(s);
403         
404         /* non-blocking */
405         flags=fcntl(s, F_GETFL);
406         if (flags==-1){
407                 LOG(L_ERR, "ERROR: init_sock_opt: fnctl failed: (%d) %s\n",
408                                 errno, strerror(errno));
409                 goto error;
410         }
411         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
412                 LOG(L_ERR, "ERROR: init_sock_opt: fcntl: set non-blocking failed:"
413                                 " (%d) %s\n", errno, strerror(errno));
414                 goto error;
415         }
416         return 0;
417 error:
418         return -1;
419 }
420
421
422
423 /* set all socket/fd options for "accepted" sockets 
424  *  only nonblocking is set since the rest is inherited from the
425  *  "parent" (listening) socket
426  *  Note: setting O_NONBLOCK is required on linux but it's not needed on
427  *        BSD and possibly solaris (where the flag is inherited from the 
428  *        parent socket). However since there is no standard document 
429  *        requiring a specific behaviour in this case it's safer to always set
430  *        it (at least for now)  --andrei
431  *  TODO: check on which OSes  O_NONBLOCK is inherited and make this 
432  *        function a nop.
433  *
434  * return -1 on error */
435 static int init_sock_opt_accept(int s)
436 {
437         int flags;
438         
439         /* non-blocking */
440         flags=fcntl(s, F_GETFL);
441         if (flags==-1){
442                 LOG(L_ERR, "ERROR: init_sock_opt_accept: fnctl failed: (%d) %s\n",
443                                 errno, strerror(errno));
444                 goto error;
445         }
446         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
447                 LOG(L_ERR, "ERROR: init_sock_opt_accept: "
448                                         "fcntl: set non-blocking failed: (%d) %s\n",
449                                         errno, strerror(errno));
450                 goto error;
451         }
452         return 0;
453 error:
454         return -1;
455 }
456
457
458
459 /* blocking connect on a non-blocking fd; it will timeout after
460  * tcp_connect_timeout 
461  * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
462  * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
463  */
464 static int tcp_blocking_connect(int fd, const struct sockaddr *servaddr,
465                                                                 socklen_t addrlen)
466 {
467         int n;
468 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
469         fd_set sel_set;
470         fd_set orig_set;
471         struct timeval timeout;
472 #else
473         struct pollfd pf;
474 #endif
475         int elapsed;
476         int to;
477         int ticks;
478         int err;
479         unsigned int err_len;
480         int poll_err;
481         
482         poll_err=0;
483         to=tcp_connect_timeout;
484         ticks=get_ticks();
485 again:
486         n=connect(fd, servaddr, addrlen);
487         if (n==-1){
488                 if (errno==EINTR){
489                         elapsed=(get_ticks()-ticks)*TIMER_TICK;
490                         if (elapsed<to)         goto again;
491                         else goto error_timeout;
492                 }
493                 if (errno!=EINPROGRESS && errno!=EALREADY){
494                         LOG(L_ERR, "ERROR: tcp_blocking_connect: (%d) %s\n",
495                                         errno, strerror(errno));
496                         goto error;
497                 }
498         }else goto end;
499         
500         /* poll/select loop */
501 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
502                 FD_ZERO(&orig_set);
503                 FD_SET(fd, &orig_set);
504 #else
505                 pf.fd=fd;
506                 pf.events=POLLOUT;
507 #endif
508         while(1){
509                 elapsed=(get_ticks()-ticks)*TIMER_TICK;
510                 if (elapsed<to)
511                         to-=elapsed;
512                 else 
513                         goto error_timeout;
514 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
515                 sel_set=orig_set;
516                 timeout.tv_sec=to;
517                 timeout.tv_usec=0;
518                 n=select(fd+1, 0, &sel_set, 0, &timeout);
519 #else
520                 n=poll(&pf, 1, to*1000);
521 #endif
522                 if (n<0){
523                         if (errno==EINTR) continue;
524                         LOG(L_ERR, "ERROR: tcp_blocking_connect: poll/select failed:"
525                                         " (%d) %s\n", errno, strerror(errno));
526                         goto error;
527                 }else if (n==0) /* timeout */ continue;
528 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
529                 if (FD_ISSET(fd, &sel_set))
530 #else
531                 if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
532                         LOG(L_ERR, "ERROR: tcp_blocking_connect: poll error: flags %x\n",
533                                         pf.revents);
534                         poll_err=1;
535                 }
536 #endif
537                 {
538                         err_len=sizeof(err);
539                         getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
540                         if ((err==0) && (poll_err==0)) goto end;
541                         if (err!=EINPROGRESS && err!=EALREADY){
542                                 LOG(L_ERR, "ERROR: tcp_blocking_connect: SO_ERROR (%d) %s\n",
543                                                 err, strerror(err));
544                                 goto error;
545                         }
546                 }
547         }
548 error_timeout:
549         /* timeout */
550         LOG(L_ERR, "ERROR: tcp_blocking_connect: timeout %d s elapsed from %d s\n",
551                         elapsed, tcp_connect_timeout);
552 error:
553         return -1;
554 end:
555         return 0;
556 }
557
558
559
560 inline static int _tcpconn_write_nb(int fd, struct tcp_connection* c,
561                                                                         char* buf, int len);
562
563
564 #ifdef TCP_BUF_WRITE
565
566
/* true if the write buffer queue of con has no blocks (unsafe version) */
#define _wbufq_empty(con) (!(con)->wbuf_q.first)
/* true if the write buffer queue of con has at least one block
 * (unsafe version) */
#define _wbufq_non_empty(con) (!_wbufq_empty(con))
571
572
573 /* unsafe version, call while holding the connection write lock */
574 inline static int _wbufq_add(struct  tcp_connection* c, char* data, 
575                                                         unsigned int size)
576 {
577         struct tcp_wbuffer_queue* q;
578         struct tcp_wbuffer* wb;
579         unsigned int last_free;
580         unsigned int wb_size;
581         unsigned int crt_size;
582         ticks_t t;
583         
584         q=&c->wbuf_q;
585         t=get_ticks_raw();
586         if (unlikely(   ((q->queued+size)>tcp_options.tcpconn_wq_max) ||
587                                         ((*tcp_total_wq+size)>tcp_options.tcp_wq_max) ||
588                                         (q->first &&
589                                         TICKS_LT(q->wr_timeout, t)) )){
590                 LOG(L_ERR, "ERROR: wbufq_add(%d bytes): write queue full or timeout "
591                                         " (%d, total %d, last write %d s ago)\n",
592                                         size, q->queued, *tcp_total_wq,
593                                         TICKS_TO_S(t-q->wr_timeout-tcp_options.tcp_wq_timeout));
594                 goto error;
595         }
596         
597         if (unlikely(q->last==0)){
598                 wb_size=MAX_unsigned(TCP_WBUF_SIZE, size);
599                 wb=shm_malloc(sizeof(*wb)+wb_size-1);
600                 if (unlikely(wb==0))
601                         goto error;
602                 wb->b_size=wb_size;
603                 wb->next=0;
604                 q->last=wb;
605                 q->first=wb;
606                 q->last_used=0;
607                 q->offset=0;
608                 q->wr_timeout=get_ticks_raw()+tcp_options.tcp_wq_timeout;
609         }else{
610                 wb=q->last;
611         }
612         
613         while(size){
614                 last_free=wb->b_size-q->last_used;
615                 if (last_free==0){
616                         wb_size=MAX_unsigned(TCP_WBUF_SIZE, size);
617                         wb=shm_malloc(sizeof(*wb)+wb_size-1);
618                         if (unlikely(wb==0))
619                                 goto error;
620                         wb->b_size=wb_size;
621                         wb->next=0;
622                         q->last->next=wb;
623                         q->last=wb;
624                         q->last_used=0;
625                         last_free=wb->b_size;
626                 }
627                 crt_size=MIN_unsigned(last_free, size);
628                 memcpy(wb->buf, data, crt_size);
629                 q->last_used+=crt_size;
630                 size-=crt_size;
631                 data+=crt_size;
632                 q->queued+=crt_size;
633                 atomic_add_int((int*)tcp_total_wq, crt_size);
634         }
635         return 0;
636 error:
637         return -1;
638 }
639
640
641
642 /* unsafe version, call while holding the connection write lock
643  * inserts data at the beginning, it ignores the max queue size checks and
644  * the timeout (use sparingly)
645  * Note: it should never be called on a write buffer after wbufq_run() */
646 inline static int _wbufq_insert(struct  tcp_connection* c, char* data, 
647                                                         unsigned int size)
648 {
649         struct tcp_wbuffer_queue* q;
650         struct tcp_wbuffer* wb;
651         
652         q=&c->wbuf_q;
653         if (likely(q->first==0)) /* if empty, use wbufq_add */
654                 return _wbufq_add(c, data, size);
655         
656         if (unlikely((*tcp_total_wq+size)>tcp_options.tcp_wq_max)){
657                 LOG(L_ERR, "ERROR: wbufq_insert(%d bytes): write queue full or timeout"
658                                         " (%d, total %d, last write %d s ago)\n",
659                                         size, q->queued, *tcp_total_wq,
660                                         TICKS_TO_S(get_ticks_raw()-q->wr_timeout-
661                                                                                 tcp_options.tcp_wq_timeout));
662                 goto error;
663         }
664         if (unlikely(q->offset)){
665                 LOG(L_CRIT, "BUG: wbufq_insert: non-null offset %d (bad call, should"
666                                 "never be called after the wbufq_run())\n", q->offset);
667                 goto error;
668         }
669         if ((q->first==q->last) && ((q->last->b_size-q->last_used)>=size)){
670                 /* one block with enough space in it for size bytes */
671                 memmove(q->first->buf+size, q->first->buf, size);
672                 memcpy(q->first->buf, data, size);
673                 q->last_used+=size;
674         }else{
675                 /* create a size bytes block directly */
676                 wb=shm_malloc(sizeof(*wb)+size-1);
677                 if (unlikely(wb==0))
678                         goto error;
679                 wb->b_size=size;
680                 /* insert it */
681                 wb->next=q->first;
682                 q->first=wb;
683                 memcpy(wb->buf, data, size);
684         }
685         
686         q->queued+=size;
687         atomic_add_int((int*)tcp_total_wq, size);
688         return 0;
689 error:
690         return -1;
691 }
692
693
694
695 /* unsafe version, call while holding the connection write lock */
696 inline static void _wbufq_destroy( struct  tcp_wbuffer_queue* q)
697 {
698         struct tcp_wbuffer* wb;
699         struct tcp_wbuffer* next_wb;
700         int unqueued;
701         
702         unqueued=0;
703         if (likely(q->first)){
704                 wb=q->first;
705                 do{
706                         next_wb=wb->next;
707                         unqueued+=(wb==q->last)?q->last_used:wb->b_size;
708                         if (wb==q->first)
709                                 unqueued-=q->offset;
710                         shm_free(wb);
711                         wb=next_wb;
712                 }while(wb);
713         }
714         memset(q, 0, sizeof(*q));
715         atomic_add_int((int*)tcp_total_wq, -unqueued);
716 }
717
718
719
720 /* tries to empty the queue  (safe version, c->write_lock must not be hold)
721  * returns -1 on error, bytes written on success (>=0) 
722  * if the whole queue is emptied => sets *empty*/
723 inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
724 {
725         struct tcp_wbuffer_queue* q;
726         struct tcp_wbuffer* wb;
727         int n;
728         int ret;
729         int block_size;
730         ticks_t t;
731         char* buf;
732         
733         *empty=0;
734         ret=0;
735         t=get_ticks_raw();
736         lock_get(&c->write_lock);
737         q=&c->wbuf_q;
738         while(q->first){
739                 block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
740                                                 q->offset;
741                 buf=q->first->buf+q->offset;
742                 n=_tcpconn_write_nb(fd, c, buf, block_size);
743                 if (likely(n>0)){
744                         ret+=n;
745                         if (likely(n==block_size)){
746                                 wb=q->first;
747                                 q->first=q->first->next; 
748                                 shm_free(wb);
749                                 q->offset=0;
750                                 q->queued-=block_size;
751                                 atomic_add_int((int*)tcp_total_wq, -block_size);
752                         }else{
753                                 q->offset+=n;
754                                 q->queued-=n;
755                                 atomic_add_int((int*)tcp_total_wq, -n);
756                                 break;
757                         }
758                         q->wr_timeout=t+tcp_options.tcp_wq_timeout;
759                         c->state=S_CONN_OK;
760                 }else{
761                         if (n<0){
762                                 /* EINTR is handled inside _tcpconn_write_nb */
763                                 if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
764                                         ret=-1;
765                                         LOG(L_ERR, "ERROR: wbuf_runq: %s [%d]\n",
766                                                 strerror(errno), errno);
767                                 }
768                         }
769                         break;
770                 }
771         }
772         if (likely(q->first==0)){
773                 q->last=0;
774                 q->last_used=0;
775                 q->offset=0;
776                 *empty=1;
777         }
778         if (unlikely(c->state==S_CONN_CONNECT && (ret>0)))
779                         c->state=S_CONN_OK;
780         lock_release(&c->write_lock);
781         return ret;
782 }
783
784 #endif /* TCP_BUF_WRITE */
785
786
787
788 #if 0
789 /* blocking write even on non-blocking sockets 
790  * if TCP_TIMEOUT will return with error */
791 static int tcp_blocking_write(struct tcp_connection* c, int fd, char* buf,
792                                                                 unsigned int len)
793 {
794         int n;
795         fd_set sel_set;
796         struct timeval timeout;
797         int ticks;
798         int initial_len;
799         
800         initial_len=len;
801 again:
802         
803         n=send(fd, buf, len,
804 #ifdef HAVE_MSG_NOSIGNAL
805                         MSG_NOSIGNAL
806 #else
807                         0
808 #endif
809                 );
810         if (n<0){
811                 if (errno==EINTR)       goto again;
812                 else if (errno!=EAGAIN && errno!=EWOULDBLOCK){
813                         LOG(L_ERR, "tcp_blocking_write: failed to send: (%d) %s\n",
814                                         errno, strerror(errno));
815                         goto error;
816                 }
817         }else if (n<len){
818                 /* partial write */
819                 buf+=n;
820                 len-=n;
821         }else{
822                 /* success: full write */
823                 goto end;
824         }
825         while(1){
826                 FD_ZERO(&sel_set);
827                 FD_SET(fd, &sel_set);
828                 timeout.tv_sec=tcp_send_timeout;
829                 timeout.tv_usec=0;
830                 ticks=get_ticks();
831                 n=select(fd+1, 0, &sel_set, 0, &timeout);
832                 if (n<0){
833                         if (errno==EINTR) continue; /* signal, ignore */
834                         LOG(L_ERR, "ERROR: tcp_blocking_write: select failed: "
835                                         " (%d) %s\n", errno, strerror(errno));
836                         goto error;
837                 }else if (n==0){
838                         /* timeout */
839                         if (get_ticks()-ticks>=tcp_send_timeout){
840                                 LOG(L_ERR, "ERROR: tcp_blocking_write: send timeout (%d)\n",
841                                                 tcp_send_timeout);
842                                 goto error;
843                         }
844                         continue;
845                 }
846                 if (FD_ISSET(fd, &sel_set)){
847                         /* we can write again */
848                         goto again;
849                 }
850         }
851 error:
852                 return -1;
853 end:
854                 return initial_len;
855 }
856 #endif
857
858
859
860 struct tcp_connection* tcpconn_new(int sock, union sockaddr_union* su,
861                                                                         union sockaddr_union* local_addr,
862                                                                         struct socket_info* ba, int type, 
863                                                                         int state)
864 {
865         struct tcp_connection *c;
866         
867         c=(struct tcp_connection*)shm_malloc(sizeof(struct tcp_connection));
868         if (c==0){
869                 LOG(L_ERR, "ERROR: tcpconn_new: mem. allocation failure\n");
870                 goto error;
871         }
872         memset(c, 0, sizeof(struct tcp_connection)); /* zero init */
873         c->s=sock;
874         c->fd=-1; /* not initialized */
875         if (lock_init(&c->write_lock)==0){
876                 LOG(L_ERR, "ERROR: tcpconn_new: init lock failed\n");
877                 goto error;
878         }
879         
880         c->rcv.src_su=*su;
881         
882         atomic_set(&c->refcnt, 0);
883         local_timer_init(&c->timer, tcpconn_main_timeout, c, 0);
884         su2ip_addr(&c->rcv.src_ip, su);
885         c->rcv.src_port=su_getport(su);
886         c->rcv.bind_address=ba;
887         if (likely(local_addr)){
888                 su2ip_addr(&c->rcv.dst_ip, local_addr);
889                 c->rcv.dst_port=su_getport(local_addr);
890         }else if (ba){
891                 c->rcv.dst_ip=ba->address;
892                 c->rcv.dst_port=ba->port_no;
893         }
894         print_ip("tcpconn_new: new tcp connection: ", &c->rcv.src_ip, "\n");
895         DBG(     "tcpconn_new: on port %d, type %d\n", c->rcv.src_port, type);
896         init_tcp_req(&c->req);
897         c->id=(*connection_id)++;
898         c->rcv.proto_reserved1=0; /* this will be filled before receive_message*/
899         c->rcv.proto_reserved2=0;
900         c->state=state;
901         c->extra_data=0;
902 #ifdef USE_TLS
903         if (type==PROTO_TLS){
904                 if (tls_tcpconn_init(c, sock)==-1) goto error;
905         }else
906 #endif /* USE_TLS*/
907         {
908                 c->type=PROTO_TCP;
909                 c->rcv.proto=PROTO_TCP;
910                 c->timeout=get_ticks_raw()+tcp_con_lifetime;
911         }
912         c->flags|=F_CONN_REMOVED;
913         
914         return c;
915         
916 error:
917         if (c) shm_free(c);
918         return 0;
919 }
920
921
922
923 /* do the actual connect, set sock. options a.s.o
924  * returns socket on success, -1 on error
925  * sets also *res_local_addr, res_si and state (S_CONN_CONNECT for an
926  * unfinished connect and S_CONN_OK for a finished one)*/
927 inline static int tcp_do_connect(       union sockaddr_union* server,
928                                                                         union sockaddr_union* from,
929                                                                         int type,
930                                                                         union sockaddr_union* res_local_addr,
931                                                                         struct socket_info** res_si,
932                                                                         enum tcp_conn_states *state
933                                                                         )
934 {
935         int s;
936         union sockaddr_union my_name;
937         socklen_t my_name_len;
938         struct ip_addr ip;
939 #ifdef TCP_BUF_WRITE
940         int n;
941 #endif /* TCP_BUF_WRITE */
942
943         s=socket(AF2PF(server->s.sa_family), SOCK_STREAM, 0);
944         if (unlikely(s==-1)){
945                 LOG(L_ERR, "ERROR: tcp_do_connect: socket: (%d) %s\n",
946                                 errno, strerror(errno));
947                 goto error;
948         }
949         if (init_sock_opt(s)<0){
950                 LOG(L_ERR, "ERROR: tcp_do_connect: init_sock_opt failed\n");
951                 goto error;
952         }
953         
954         if (unlikely(from && bind(s, &from->s, sockaddru_len(*from)) != 0)){
955                 LOG(L_WARN, "WARNING: tcp_do_connect: binding to source address"
956                                         " failed: %s [%d]\n", strerror(errno), errno);
957         }
958         *state=S_CONN_OK;
959 #ifdef TCP_BUF_WRITE
960         if (likely(tcp_options.tcp_buf_write)){
961 again:
962                 n=connect(s, &server->s, sockaddru_len(*server));
963                 if (unlikely(n==-1)){
964                         if (errno==EINTR) goto again;
965                         if (likely(errno==EINPROGRESS))
966                                 *state=S_CONN_CONNECT;
967                         else if (errno!=EALREADY){
968                                 LOG(L_ERR, "ERROR: tcp_do_connect: connect: (%d) %s\n",
969                                                 errno, strerror(errno));
970                                 goto error;
971                         }
972                 }
973         }else{
974 #endif /* TCP_BUF_WRITE */
975                 if (tcp_blocking_connect(s, &server->s, sockaddru_len(*server))<0){
976                         LOG(L_ERR, "ERROR: tcp_do_connect: tcp_blocking_connect"
977                                                 " failed\n");
978                         goto error;
979                 }
980 #ifdef TCP_BUF_WRITE
981         }
982 #endif /* TCP_BUF_WRITE */
983         if (from){
984                 su2ip_addr(&ip, from);
985                 if (!ip_addr_any(&ip))
986                         /* we already know the source ip, skip the sys. call */
987                         goto find_socket;
988         }
989         my_name_len=sizeof(my_name);
990         if (unlikely(getsockname(s, &my_name.s, &my_name_len)!=0)){
991                 LOG(L_ERR, "ERROR: tcp_do_connect: getsockname failed: %s(%d)\n",
992                                 strerror(errno), errno);
993                 *res_si=0;
994                 goto error;
995         }
996         from=&my_name; /* update from with the real "from" address */
997         su2ip_addr(&ip, &my_name);
998 find_socket:
999 #ifdef USE_TLS
1000         if (unlikely(type==PROTO_TLS))
1001                 *res_si=find_si(&ip, 0, PROTO_TLS);
1002         else
1003 #endif
1004                 *res_si=find_si(&ip, 0, PROTO_TCP);
1005         
1006         if (unlikely(*res_si==0)){
1007                 LOG(L_WARN, "WARNING: tcp_do_connect: could not find corresponding"
1008                                 " listening socket, using default...\n");
1009                 if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1010 #ifdef USE_IPV6
1011                 else *res_si=sendipv6_tcp;
1012 #endif
1013         }
1014         *res_local_addr=*from;
1015         return s;
1016 error:
1017         if (s!=-1) close(s);
1018         return -1;
1019 }
1020
1021
1022
1023 struct tcp_connection* tcpconn_connect( union sockaddr_union* server, 
1024                                                                                 union sockaddr_union* from,
1025                                                                                 int type)
1026 {
1027         int s;
1028         struct socket_info* si;
1029         union sockaddr_union my_name;
1030         struct tcp_connection* con;
1031         enum tcp_conn_states state;
1032
1033         s=-1;
1034         
1035         if (*tcp_connections_no >= tcp_max_connections){
1036                 LOG(L_ERR, "ERROR: tcpconn_connect: maximum number of connections"
1037                                         " exceeded (%d/%d)\n",
1038                                         *tcp_connections_no, tcp_max_connections);
1039                 goto error;
1040         }
1041         s=tcp_do_connect(server, from, type, &my_name, &si, &state);
1042         if (s==-1){
1043                 LOG(L_ERR, "ERROR: tcp_do_connect: failed (%d) %s\n",
1044                                 errno, strerror(errno));
1045                 goto error;
1046         }
1047         con=tcpconn_new(s, server, &my_name, si, type, state);
1048         if (con==0){
1049                 LOG(L_ERR, "ERROR: tcp_connect: tcpconn_new failed, closing the "
1050                                  " socket\n");
1051                 goto error;
1052         }
1053         return con;
1054         /*FIXME: set sock idx! */
1055 error:
1056         if (s!=-1) close(s); /* close the opened socket */
1057         return 0;
1058 }
1059
1060
1061
1062 #ifdef TCP_CONNECT_WAIT
1063 int tcpconn_finish_connect( struct tcp_connection* c,
1064                                                                                                 union sockaddr_union* from)
1065 {
1066         int s;
1067         int r;
1068         union sockaddr_union local_addr;
1069         struct socket_info* si;
1070         enum tcp_conn_states state;
1071         struct tcp_conn_alias* a;
1072         
1073         s=tcp_do_connect(&c->rcv.src_su, from, c->type, &local_addr, &si, &state);
1074         if (unlikely(s==-1)){
1075                 LOG(L_ERR, "ERROR: tcpconn_finish_connect: tcp_do_connect for %p"
1076                                                 " failed\n", c);
1077                 return -1;
1078         }
1079         c->rcv.bind_address=si;
1080         su2ip_addr(&c->rcv.dst_ip, &local_addr);
1081         c->rcv.dst_port=su_getport(&local_addr);
1082         /* update aliases if needed */
1083         if (likely(from==0)){
1084                 /* add aliases */
1085                 TCPCONN_LOCK;
1086                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1087                                                                                                         tcp_new_conn_alias_flags);
1088                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1089                                                                         c->rcv.dst_port, tcp_new_conn_alias_flags);
1090                 TCPCONN_UNLOCK;
1091         }else if (su_cmp(from, &local_addr)!=1){
1092                 TCPCONN_LOCK;
1093                         /* remove all the aliases except the first one and re-add them
1094                          * (there shouldn't be more then the 3 default aliases at this 
1095                          * stage) */
1096                         for (r=1; r<c->aliases; r++){
1097                                 a=&c->con_aliases[r];
1098                                 tcpconn_listrm(tcpconn_aliases_hash[a->hash], a, next, prev);
1099                         }
1100                         c->aliases=1;
1101                         /* add the local_ip:0 and local_ip:local_port aliases */
1102                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1103                                                                                                 0, tcp_new_conn_alias_flags);
1104                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1105                                                                         c->rcv.dst_port, tcp_new_conn_alias_flags);
1106                 TCPCONN_UNLOCK;
1107         }
1108         
1109         return s;
1110 }
1111 #endif /* TCP_CONNECT_WAIT */
1112
1113
1114
1115 /* adds a tcp connection to the tcpconn hashes
1116  * Note: it's called _only_ from the tcp_main process */
1117 inline static struct tcp_connection*  tcpconn_add(struct tcp_connection *c)
1118 {
1119         struct ip_addr zero_ip;
1120
1121         if (likely(c)){
1122                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1123                 c->id_hash=tcp_id_hash(c->id);
1124                 c->aliases=0;
1125                 TCPCONN_LOCK;
1126                 c->flags|=F_CONN_HASHED;
1127                 /* add it at the begining of the list*/
1128                 tcpconn_listadd(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1129                 /* set the aliases */
1130                 /* first alias is for (peer_ip, peer_port, 0 ,0) -- for finding
1131                  *  any connection to peer_ip, peer_port
1132                  * the second alias is for (peer_ip, peer_port, local_addr, 0) -- for
1133                  *  finding any conenction to peer_ip, peer_port from local_addr 
1134                  * the third alias is for (peer_ip, peer_port, local_addr, local_port) 
1135                  *   -- for finding if a fully specified connection exists */
1136                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &zero_ip, 0,
1137                                                                                                         tcp_new_conn_alias_flags);
1138                 if (likely(c->rcv.dst_ip.af && ! ip_addr_any(&c->rcv.dst_ip))){
1139                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1140                                                                                                         tcp_new_conn_alias_flags);
1141                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1142                                                                         c->rcv.dst_port, tcp_new_conn_alias_flags);
1143                 }
1144                 /* ignore add_alias errors, there are some valid cases when one
1145                  *  of the add_alias would fail (e.g. first add_alias for 2 connections
1146                  *   with the same destination but different src. ip*/
1147                 TCPCONN_UNLOCK;
1148                 DBG("tcpconn_add: hashes: %d:%d:%d, %d\n",
1149                                                                                                 c->con_aliases[0].hash,
1150                                                                                                 c->con_aliases[1].hash,
1151                                                                                                 c->con_aliases[2].hash,
1152                                                                                                 c->id_hash);
1153                 return c;
1154         }else{
1155                 LOG(L_CRIT, "tcpconn_add: BUG: null connection pointer\n");
1156                 return 0;
1157         }
1158 }
1159
1160
1161 static inline void _tcpconn_detach(struct tcp_connection *c)
1162 {
1163         int r;
1164         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1165         /* remove all the aliases */
1166         for (r=0; r<c->aliases; r++)
1167                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1168                                                 &c->con_aliases[r], next, prev);
1169 }
1170
1171
1172
1173 static inline void _tcpconn_free(struct tcp_connection* c)
1174 {
1175 #ifdef TCP_BUF_WRITE
1176         if (unlikely(_wbufq_non_empty(c)))
1177                 _wbufq_destroy(&c->wbuf_q);
1178 #endif
1179         lock_destroy(&c->write_lock);
1180 #ifdef USE_TLS
1181         if (unlikely(c->type==PROTO_TLS)) tls_tcpconn_clean(c);
1182 #endif
1183         shm_free(c);
1184 }
1185
1186
1187
/* unsafe tcpconn_rm version (no locks): removes c from the hashes and
 * frees it */
void _tcpconn_rm(struct tcp_connection* c)
{
	_tcpconn_detach(c);	/* unlink from the id & alias hashes */
	_tcpconn_free(c);	/* release the connection resources */
}
1194
1195
1196
1197 void tcpconn_rm(struct tcp_connection* c)
1198 {
1199         int r;
1200         TCPCONN_LOCK;
1201         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1202         /* remove all the aliases */
1203         for (r=0; r<c->aliases; r++)
1204                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1205                                                 &c->con_aliases[r], next, prev);
1206         TCPCONN_UNLOCK;
1207         lock_destroy(&c->write_lock);
1208 #ifdef USE_TLS
1209         if ((c->type==PROTO_TLS)&&(c->extra_data)) tls_tcpconn_clean(c);
1210 #endif
1211         shm_free(c);
1212 }
1213
1214
1215 /* finds a connection, if id=0 uses the ip addr, port, local_ip and local port
1216  *  (host byte order) and tries to find the connection that matches all of
1217  *   them. Wild cards can be used for local_ip and local_port (a 0 filled
1218  *   ip address and/or a 0 local port).
1219  * WARNING: unprotected (locks) use tcpconn_get unless you really
1220  * know what you are doing */
1221 struct tcp_connection* _tcpconn_find(int id, struct ip_addr* ip, int port,
1222                                                                                 struct ip_addr* l_ip, int l_port)
1223 {
1224
1225         struct tcp_connection *c;
1226         struct tcp_conn_alias* a;
1227         unsigned hash;
1228         int is_local_ip_any;
1229         
1230 #ifdef EXTRA_DEBUG
1231         DBG("tcpconn_find: %d  port %d\n",id, port);
1232         if (ip) print_ip("tcpconn_find: ip ", ip, "\n");
1233 #endif
1234         if (likely(id)){
1235                 hash=tcp_id_hash(id);
1236                 for (c=tcpconn_id_hash[hash]; c; c=c->id_next){
1237 #ifdef EXTRA_DEBUG
1238                         DBG("c=%p, c->id=%d, port=%d\n",c, c->id, c->rcv.src_port);
1239                         print_ip("ip=", &c->rcv.src_ip, "\n");
1240 #endif
1241                         if ((id==c->id)&&(c->state!=S_CONN_BAD)) return c;
1242                 }
1243         }else if (likely(ip)){
1244                 hash=tcp_addr_hash(ip, port, l_ip, l_port);
1245                 is_local_ip_any=ip_addr_any(l_ip);
1246                 for (a=tcpconn_aliases_hash[hash]; a; a=a->next){
1247 #ifdef EXTRA_DEBUG
1248                         DBG("a=%p, c=%p, c->id=%d, alias port= %d port=%d\n", a, a->parent,
1249                                         a->parent->id, a->port, a->parent->rcv.src_port);
1250                         print_ip("ip=",&a->parent->rcv.src_ip,"\n");
1251 #endif
1252                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1253                                         ((l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1254                                         (ip_addr_cmp(ip, &a->parent->rcv.src_ip)) &&
1255                                         (is_local_ip_any ||
1256                                                 ip_addr_cmp(l_ip, &a->parent->rcv.dst_ip))
1257                                 )
1258                                 return a->parent;
1259                 }
1260         }
1261         return 0;
1262 }
1263
1264
1265
1266 /* _tcpconn_find with locks and timeout
1267  * local_addr contains the desired local ip:port. If null any local address 
1268  * will be used.  IN*ADDR_ANY or 0 port are wild cards.
1269  */
1270 struct tcp_connection* tcpconn_get(int id, struct ip_addr* ip, int port,
1271                                                                         union sockaddr_union* local_addr,
1272                                                                         ticks_t timeout)
1273 {
1274         struct tcp_connection* c;
1275         struct ip_addr local_ip;
1276         int local_port;
1277         
1278         local_port=0;
1279         if (likely(ip)){
1280                 if (unlikely(local_addr)){
1281                         su2ip_addr(&local_ip, local_addr);
1282                         local_port=su_getport(local_addr);
1283                 }else{
1284                         ip_addr_mk_any(ip->af, &local_ip);
1285                         local_port=0;
1286                 }
1287         }
1288         TCPCONN_LOCK;
1289         c=_tcpconn_find(id, ip, port, &local_ip, local_port);
1290         if (likely(c)){ 
1291                         atomic_inc(&c->refcnt);
1292                         /* update the timeout only if the connection is not handled
1293                          * by a tcp reader (the tcp reader process uses c->timeout for 
1294                          * its own internal timeout and c->timeout will be overwritten
1295                          * anyway on return to tcp_main) */
1296                         if (likely(c->reader_pid==0))
1297                                 c->timeout=get_ticks_raw()+timeout;
1298         }
1299         TCPCONN_UNLOCK;
1300         return c;
1301 }
1302
1303
1304
1305 /* add c->dst:port, local_addr as an alias for the "id" connection, 
1306  * flags: TCP_ALIAS_FORCE_ADD  - add an alias even if a previous one exists
1307  *        TCP_ALIAS_REPLACE    - if a prev. alias exists, replace it with the
1308  *                                new one
1309  * returns 0 on success, <0 on failure ( -1  - null c, -2 too many aliases,
1310  *  -3 alias already present and pointing to another connection)
1311  * WARNING: must be called with TCPCONN_LOCK held */
1312 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
1313                                                                                 struct ip_addr* l_ip, int l_port,
1314                                                                                 int flags)
1315 {
1316         unsigned hash;
1317         struct tcp_conn_alias* a;
1318         struct tcp_conn_alias* nxt;
1319         struct tcp_connection* p;
1320         int is_local_ip_any;
1321         int i;
1322         int r;
1323         
1324         a=0;
1325         is_local_ip_any=ip_addr_any(l_ip);
1326         if (likely(c)){
1327                 hash=tcp_addr_hash(&c->rcv.src_ip, port, l_ip, l_port);
1328                 /* search the aliases for an already existing one */
1329                 for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
1330                         nxt=a->next;
1331                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1332                                         ( (l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1333                                         (ip_addr_cmp(&c->rcv.src_ip, &a->parent->rcv.src_ip)) &&
1334                                         ( is_local_ip_any || 
1335                                           ip_addr_cmp(&a->parent->rcv.dst_ip, l_ip))
1336                                         ){
1337                                 /* found */
1338                                 if (unlikely(a->parent!=c)){
1339                                         if (flags & TCP_ALIAS_FORCE_ADD)
1340                                                 /* still have to walk the whole list to check if
1341                                                  * the alias was not already added */
1342                                                 continue;
1343                                         else if (flags & TCP_ALIAS_REPLACE){
1344                                                 /* remove the alias =>
1345                                                  * remove the current alias and all the following
1346                                                  *  ones from the corresponding connection, shift the 
1347                                                  *  connection aliases array and re-add the other 
1348                                                  *  aliases (!= current one) */
1349                                                 p=a->parent;
1350                                                 for (i=0; (i<p->aliases) && (&(p->con_aliases[i])!=a);
1351                                                                 i++);
1352                                                 if (unlikely(i==p->aliases)){
1353                                                         LOG(L_CRIT, "BUG: _tcpconn_add_alias_unsafe: "
1354                                                                         " alias %p not found in con %p (id %d)\n",
1355                                                                         a, p, p->id);
1356                                                         goto error_not_found;
1357                                                 }
1358                                                 for (r=i; r<p->aliases; r++){
1359                                                         tcpconn_listrm(
1360                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash],
1361                                                                 &p->con_aliases[r], next, prev);
1362                                                 }
1363                                                 if (likely((i+1)<p->aliases)){
1364                                                         memmove(&p->con_aliases[i], &p->con_aliases[i+1],
1365                                                                                         (p->aliases-i-1)*
1366                                                                                                 sizeof(p->con_aliases[0]));
1367                                                 }
1368                                                 p->aliases--;
1369                                                 /* re-add the remaining aliases */
1370                                                 for (r=i; r<p->aliases; r++){
1371                                                         tcpconn_listadd(
1372                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash], 
1373                                                                 &p->con_aliases[r], next, prev);
1374                                                 }
1375                                         }else
1376                                                 goto error_sec;
1377                                 }else goto ok;
1378                         }
1379                 }
1380                 if (unlikely(c->aliases>=TCP_CON_MAX_ALIASES)) goto error_aliases;
1381                 c->con_aliases[c->aliases].parent=c;
1382                 c->con_aliases[c->aliases].port=port;
1383                 c->con_aliases[c->aliases].hash=hash;
1384                 tcpconn_listadd(tcpconn_aliases_hash[hash], 
1385                                                                 &c->con_aliases[c->aliases], next, prev);
1386                 c->aliases++;
1387         }else goto error_not_found;
1388 ok:
1389 #ifdef EXTRA_DEBUG
1390         if (a) DBG("_tcpconn_add_alias_unsafe: alias already present\n");
1391         else   DBG("_tcpconn_add_alias_unsafe: alias port %d for hash %d, id %d\n",
1392                         port, hash, c->id);
1393 #endif
1394         return 0;
1395 error_aliases:
1396         /* too many aliases */
1397         return -2;
1398 error_not_found:
1399         /* null connection */
1400         return -1;
1401 error_sec:
1402         /* alias already present and pointing to a different connection
1403          * (hijack attempt?) */
1404         return -3;
1405 }
1406
1407
1408
1409 /* add port as an alias for the "id" connection, 
1410  * returns 0 on success,-1 on failure */
1411 int tcpconn_add_alias(int id, int port, int proto)
1412 {
1413         struct tcp_connection* c;
1414         int ret;
1415         struct ip_addr zero_ip;
1416         int r;
1417         
1418         /* fix the port */
1419         port=port?port:((proto==PROTO_TLS)?SIPS_PORT:SIP_PORT);
1420         TCPCONN_LOCK;
1421         /* check if alias already exists */
1422         c=_tcpconn_find(id, 0, 0, 0, 0);
1423         if (likely(c)){
1424                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1425                 
1426                 /* alias src_ip:port, 0, 0 */
1427                 ret=_tcpconn_add_alias_unsafe(c, port,  &zero_ip, 0, 
1428                                                                                 tcp_alias_flags);
1429                 if (ret<0 && ret!=-3) goto error;
1430                 /* alias src_ip:port, local_ip, 0 */
1431                 ret=_tcpconn_add_alias_unsafe(c, port,  &c->rcv.dst_ip, 0, 
1432                                                                                 tcp_alias_flags);
1433                 if (ret<0 && ret!=-3) goto error;
1434                 /* alias src_ip:port, local_ip, local_port */
1435                 ret=_tcpconn_add_alias_unsafe(c, port, &c->rcv.dst_ip, c->rcv.dst_port,
1436                                                                                 tcp_alias_flags);
1437                 if (unlikely(ret<0)) goto error;
1438         }else goto error_not_found;
1439         TCPCONN_UNLOCK;
1440         return 0;
1441 error_not_found:
1442         TCPCONN_UNLOCK;
1443         LOG(L_ERR, "ERROR: tcpconn_add_alias: no connection found for id %d\n",id);
1444         return -1;
1445 error:
1446         TCPCONN_UNLOCK;
1447         switch(ret){
1448                 case -2:
1449                         LOG(L_ERR, "ERROR: tcpconn_add_alias: too many aliases (%d)"
1450                                         " for connection %p (id %d) %s:%d <- %d\n",
1451                                         c->aliases, c, c->id, ip_addr2a(&c->rcv.src_ip),
1452                                         c->rcv.src_port, port);
1453                         for (r=0; r<c->aliases; r++){
1454                                 LOG(L_ERR, "ERROR: tcpconn_add_alias: alias %d: for %p (%d)"
1455                                                 " %s:%d <-%d hash %x\n",  r, c, c->id, 
1456                                                  ip_addr2a(&c->rcv.src_ip), c->rcv.src_port, 
1457                                                 c->con_aliases[r].port, c->con_aliases[r].hash);
1458                         }
1459                         break;
1460                 case -3:
1461                         LOG(L_ERR, "ERROR: tcpconn_add_alias: possible port"
1462                                         " hijack attempt\n");
1463                         LOG(L_ERR, "ERROR: tcpconn_add_alias: alias for %d port %d already"
1464                                                 " present and points to another connection \n",
1465                                                 c->id, port);
1466                         break;
1467                 default:
1468                         LOG(L_ERR, "ERROR: tcpconn_add_alias: unkown error %d\n", ret);
1469         }
1470         return -1;
1471 }
1472
1473
1474
1475 #ifdef TCP_FD_CACHE
1476
1477 static void tcp_fd_cache_init()
1478 {
1479         int r;
1480         for (r=0; r<TCP_FD_CACHE_SIZE; r++)
1481                 fd_cache[r].fd=-1;
1482 }
1483
1484
1485 inline static struct fd_cache_entry* tcp_fd_cache_get(struct tcp_connection *c)
1486 {
1487         int h;
1488         
1489         h=c->id%TCP_FD_CACHE_SIZE;
1490         if ((fd_cache[h].fd>0) && (fd_cache[h].id==c->id) && (fd_cache[h].con==c))
1491                 return &fd_cache[h];
1492         return 0;
1493 }
1494
1495
1496 inline static void tcp_fd_cache_rm(struct fd_cache_entry* e)
1497 {
1498         e->fd=-1;
1499 }
1500
1501
1502 inline static void tcp_fd_cache_add(struct tcp_connection *c, int fd)
1503 {
1504         int h;
1505         
1506         h=c->id%TCP_FD_CACHE_SIZE;
1507         if (likely(fd_cache[h].fd>0))
1508                 close(fd_cache[h].fd);
1509         fd_cache[h].fd=fd;
1510         fd_cache[h].id=c->id;
1511         fd_cache[h].con=c;
1512 }
1513
1514 #endif /* TCP_FD_CACHE */
1515
1516
1517
/* forward declaration, needed by tcp_send() below, which uses it to release
 * its reference to a connection */
static inline int tcpconn_chld_put(struct tcp_connection* tcpconn);
1519
1520
1521
1522 /* finds a tcpconn & sends on it
1523  * uses the dst members to, proto (TCP|TLS) and id and tries to send
1524  *  from the "from" address (if non null and id==0)
1525  * returns: number of bytes written (>=0) on success
1526  *          <0 on error */
1527 int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1528                                         char* buf, unsigned len)
1529 {
1530         struct tcp_connection *c;
1531         struct tcp_connection *tmp;
1532         struct ip_addr ip;
1533         int port;
1534         int fd;
1535         long response[2];
1536         int n;
1537         int do_close_fd;
1538 #ifdef TCP_BUF_WRITE
1539         int enable_write_watch;
1540 #endif /* TCP_BUF_WRITE */
1541 #ifdef TCP_FD_CACHE
1542         struct fd_cache_entry* fd_cache_e;
1543         int use_fd_cache;
1544         
1545         use_fd_cache=tcp_options.fd_cache;
1546         fd_cache_e=0;
1547 #endif /* TCP_FD_CACHE */
1548         do_close_fd=1; /* close the fd on exit */
1549         port=su_getport(&dst->to);
1550         if (likely(port)){
1551                 su2ip_addr(&ip, &dst->to);
1552                 c=tcpconn_get(dst->id, &ip, port, from, tcp_con_lifetime); 
1553         }else if (likely(dst->id)){
1554                 c=tcpconn_get(dst->id, 0, 0, 0, tcp_con_lifetime);
1555         }else{
1556                 LOG(L_CRIT, "BUG: tcp_send called with null id & to\n");
1557                 return -1;
1558         }
1559         
1560         if (likely(dst->id)){
1561                 if (unlikely(c==0)) {
1562                         if (likely(port)){
1563                                 /* try again w/o id */
1564                                 c=tcpconn_get(0, &ip, port, from, tcp_con_lifetime);
1565                                 goto no_id;
1566                         }else{
1567                                 LOG(L_ERR, "ERROR: tcp_send: id %d not found, dropping\n",
1568                                                 dst->id);
1569                                 return -1;
1570                         }
1571                 }else goto get_fd;
1572         }
1573 no_id:
1574                 if (unlikely(c==0)){
1575                         DBG("tcp_send: no open tcp connection found, opening new one\n");
1576                         /* create tcp connection */
1577                         if (likely(from==0)){
1578                                 /* check to see if we have to use a specific source addr. */
1579                                 switch (dst->to.s.sa_family) {
1580                                         case AF_INET:
1581                                                         from = tcp_source_ipv4;
1582                                                 break;
1583 #ifdef USE_IPV6
1584                                         case AF_INET6:
1585                                                         from = tcp_source_ipv6;
1586                                                 break;
1587 #endif
1588                                         default:
1589                                                 /* error, bad af, ignore ... */
1590                                                 break;
1591                                 }
1592                         }
1593 #if defined(TCP_CONNECT_WAIT) && defined(TCP_BUF_WRITE)
1594                         if (likely(tcp_options.tcp_connect_wait && 
1595                                                 tcp_options.tcp_buf_write )){
1596                                 /* FIXME: flag for timer? + check again */
1597                                 if (unlikely(*tcp_connections_no >= tcp_max_connections)){
1598                                         LOG(L_ERR, "ERROR: tcp_send: maximum number of connections"
1599                                                                 " exceeded (%d/%d)\n",
1600                                                                 *tcp_connections_no, tcp_max_connections);
1601                                         return -1;
1602                                 }
1603                                 c=tcpconn_new(-1, &dst->to, from, 0, dst->proto,
1604                                                                 S_CONN_PENDING);
1605                                 if (unlikely(c==0)){
1606                                         LOG(L_ERR, "ERROR: tcp_send: could not create new"
1607                                                         " connection\n");
1608                                         return -1;
1609                                 }
1610                                 c->flags|=F_CONN_PENDING|F_CONN_FD_CLOSED;
1611                                 atomic_set(&c->refcnt, 2); /* ref from here and from main hash
1612                                                                                          table */
1613                                 /* add it to id hash and aliases */
1614                                 if (unlikely(tcpconn_add(c)==0)){
1615                                         LOG(L_ERR, "ERROR: tcp_send: could not add "
1616                                                                         "connection %p\n", c);
1617                                         _tcpconn_free(c);
1618                                         n=-1;
1619                                         goto end_no_conn;
1620                                 }
1621                                 /* do connect and if src ip or port changed, update the 
1622                                  * aliases */
1623                                 if (unlikely((fd=tcpconn_finish_connect(c, from))<0)){
1624                                         LOG(L_ERR, "ERROR: tcp_send: tcpconn_finish_connect(%p)"
1625                                                         " failed\n", c);
1626                                         goto conn_wait_error;
1627                                 }
1628                                 /* ? TODO: it might be faster just to queue the write directly
1629                                  *  and send to main CONN_NEW_PENDING_WRITE */
1630                                 /* delay sending the fd to main after the send */
1631                                 
1632                                 /* NOTE: no lock here, because the connection is marked as
1633                                  * pending and nobody else will try to write on it. However
1634                                  * this might produce out-of-order writes. If this is not
1635                                  * desired either lock before the write or use 
1636                                  * _wbufq_insert(...) */
1637                                 n=_tcpconn_write_nb(fd, c, buf, len);
1638                                 if (unlikely(n<(int)len)){
1639                                         if ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK){
1640                                                 DBG("tcp_send: pending write on new connection %p "
1641                                                                 " (%d/%d bytes written)\n", c, n, len);
1642                                                 if (n<0) n=0;
1643                                                 /* add to the write queue */
1644                                                 lock_get(&c->write_lock);
1645                                                         if (unlikely(_wbufq_insert(c, buf+n, len-n)<0)){
1646                                                                 lock_release(&c->write_lock);
1647                                                                 n=-1;
1648                                                                 LOG(L_ERR, "ERROR: tcp_send: EAGAIN and"
1649                                                                                 " write queue full or failed for %p\n",
1650                                                                                 c);
1651                                                                 goto conn_wait_error;
1652                                                         }
1653                                                 lock_release(&c->write_lock);
1654                                                 /* send to tcp_main */
1655                                                 response[0]=(long)c;
1656                                                 response[1]=CONN_NEW_PENDING_WRITE;
1657                                                 if (unlikely(send_fd(unix_tcp_sock, response, 
1658                                                                                                 sizeof(response), fd) <= 0)){
1659                                                         LOG(L_ERR, "BUG: tcp_send: CONN_NEW_PENDING_WRITE"
1660                                                                                 " for %p failed:" " %s (%d)\n",
1661                                                                                 c, strerror(errno), errno);
1662                                                         goto conn_wait_error;
1663                                                 }
1664                                                 n=len;
1665                                                 goto end;
1666                                         }
1667                                         /* error: destroy it directly */
1668                                         LOG(L_ERR, "ERROR: tcp_send: connect & send "
1669                                                                                 " for %p failed:" " %s (%d)\n",
1670                                                                                 c, strerror(errno), errno);
1671                                         goto conn_wait_error;
1672                                 }
1673                                 LOG(L_INFO, "tcp_send: quick connect for %p\n", c);
1674                                 /* send to tcp_main */
1675                                 response[0]=(long)c;
1676                                 response[1]=CONN_NEW_COMPLETE;
1677                                 if (unlikely(send_fd(unix_tcp_sock, response, 
1678                                                                                 sizeof(response), fd) <= 0)){
1679                                         LOG(L_ERR, "BUG: tcp_send: CONN_NEW_COMPLETE  for %p"
1680                                                                 " failed:" " %s (%d)\n",
1681                                                                 c, strerror(errno), errno);
1682                                         goto conn_wait_error;
1683                                 }
1684                                 goto end;
1685                         }
1686 #endif /* TCP_CONNECT_WAIT  && TCP_BUF_WRITE */
1687                         if (unlikely((c=tcpconn_connect(&dst->to, from, dst->proto))==0)){
1688                                 LOG(L_ERR, "ERROR: tcp_send: connect failed\n");
1689                                 return -1;
1690                         }
1691                         atomic_set(&c->refcnt, 2); /* ref. from here and it will also
1692                                                       be added in the tcp_main hash */
1693                         fd=c->s;
1694                         c->flags|=F_CONN_FD_CLOSED; /* not yet opened in main */
1695                         /* ? TODO: it might be faster just to queue the write and
1696                          * send to main a CONN_NEW_PENDING_WRITE */
1697                         
1698                         /* send the new tcpconn to "tcp main" */
1699                         response[0]=(long)c;
1700                         response[1]=CONN_NEW;
1701                         n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
1702                         if (unlikely(n<=0)){
1703                                 LOG(L_ERR, "BUG: tcp_send: failed send_fd: %s (%d)\n",
1704                                                 strerror(errno), errno);
1705                                 /* we can safely delete it, it's not referenced by anybody */
1706                                 _tcpconn_free(c);
1707                                 n=-1;
1708                                 goto end_no_conn;
1709                         }
1710                         goto send_it;
1711                 }
1712 get_fd:
1713 #ifdef TCP_BUF_WRITE
1714                 /* if data is already queued, we don't need the fd any more */
1715                 if (unlikely(tcp_options.tcp_buf_write && (_wbufq_non_empty(c)
1716 #ifdef TCP_CONNECT_WAIT
1717                                                                                                 || (c->state==S_CONN_PENDING)
1718 #endif /* TCP_CONNECT_WAIT */
1719                                                 ) )){
1720                         lock_get(&c->write_lock);
1721                                 if (likely(_wbufq_non_empty(c)
1722 #ifdef TCP_CONNECT_WAIT
1723                                                         || (c->state==S_CONN_PENDING)
1724 #endif /* TCP_CONNECT_WAIT */
1725
1726                                                         )){
1727                                         do_close_fd=0;
1728                                         if (unlikely(_wbufq_add(c, buf, len)<0)){
1729                                                 lock_release(&c->write_lock);
1730                                                 n=-1;
1731                                                 goto error;
1732                                         }
1733                                         n=len;
1734                                         lock_release(&c->write_lock);
1735                                         goto release_c;
1736                                 }
1737                         lock_release(&c->write_lock);
1738                 }
1739 #endif /* TCP_BUF_WRITE */
1740                 /* check if this is not the same reader process holding
1741                  *  c  and if so send directly on c->fd */
1742                 if (c->reader_pid==my_pid()){
1743                         DBG("tcp_send: send from reader (%d (%d)), reusing fd\n",
1744                                         my_pid(), process_no);
1745                         fd=c->fd;
1746                         do_close_fd=0; /* don't close the fd on exit, it's in use */
1747 #ifdef TCP_FD_CACHE
1748                         use_fd_cache=0; /* don't cache: problems would arise due to the
1749                                                            close() on cache eviction (if the fd is still 
1750                                                            used). If it has to be cached then dup() _must_ 
1751                                                            be used */
1752                 }else if (likely(use_fd_cache && 
1753                                                         ((fd_cache_e=tcp_fd_cache_get(c))!=0))){
1754                         fd=fd_cache_e->fd;
1755                         do_close_fd=0;
1756                         DBG("tcp_send: found fd in cache ( %d, %p, %d)\n",
1757                                         fd, c, fd_cache_e->id);
1758 #endif /* TCP_FD_CACHE */
1759                 }else{
1760                         DBG("tcp_send: tcp connection found (%p), acquiring fd\n", c);
1761                         /* get the fd */
1762                         response[0]=(long)c;
1763                         response[1]=CONN_GET_FD;
1764                         n=send_all(unix_tcp_sock, response, sizeof(response));
1765                         if (unlikely(n<=0)){
1766                                 LOG(L_ERR, "BUG: tcp_send: failed to get fd(write):%s (%d)\n",
1767                                                 strerror(errno), errno);
1768                                 n=-1;
1769                                 goto release_c;
1770                         }
1771                         DBG("tcp_send, c= %p, n=%d\n", c, n);
1772                         n=receive_fd(unix_tcp_sock, &tmp, sizeof(tmp), &fd, MSG_WAITALL);
1773                         if (unlikely(n<=0)){
1774                                 LOG(L_ERR, "BUG: tcp_send: failed to get fd(receive_fd):"
1775                                                         " %s (%d)\n", strerror(errno), errno);
1776                                 n=-1;
1777                                 do_close_fd=0;
1778                                 goto release_c;
1779                         }
1780                         if (unlikely(c!=tmp)){
1781                                 LOG(L_CRIT, "BUG: tcp_send: get_fd: got different connection:"
1782                                                 "  %p (id= %d, refcnt=%d state=%d) != "
1783                                                 "  %p (n=%d)\n",
1784                                                   c,   c->id,   atomic_get(&c->refcnt),   c->state,
1785                                                   tmp, n
1786                                    );
1787                                 n=-1; /* fail */
1788                                 goto end;
1789                         }
1790                         DBG("tcp_send: after receive_fd: c= %p n=%d fd=%d\n",c, n, fd);
1791                 }
1792         
1793         
1794 send_it:
1795         DBG("tcp_send: sending...\n");
1796         lock_get(&c->write_lock);
1797 #ifdef TCP_BUF_WRITE
1798         if (likely(tcp_options.tcp_buf_write)){
1799                 if (_wbufq_non_empty(c)
1800 #ifdef TCP_CONNECT_WAIT
1801                         || (c->state==S_CONN_PENDING) 
1802 #endif /* TCP_CONNECT_WAIT */
1803                         ){
1804                         if (unlikely(_wbufq_add(c, buf, len)<0)){
1805                                 lock_release(&c->write_lock);
1806                                 n=-1;
1807                                 goto error;
1808                         }
1809                         lock_release(&c->write_lock);
1810                         n=len;
1811                         goto end;
1812                 }
1813                 n=_tcpconn_write_nb(fd, c, buf, len);
1814         }else{
1815 #endif /* TCP_BUF_WRITE */
1816 #ifdef USE_TLS
1817         if (c->type==PROTO_TLS)
1818                 n=tls_blocking_write(c, fd, buf, len);
1819         else
1820 #endif
1821                 /* n=tcp_blocking_write(c, fd, buf, len); */
1822                 n=tsend_stream(fd, buf, len, tcp_send_timeout*1000); 
1823 #ifdef TCP_BUF_WRITE
1824         }
1825 #else /* ! TCP_BUF_WRITE */
1826         lock_release(&c->write_lock);
1827 #endif /* TCP_BUF_WRITE */
1828         
1829         DBG("tcp_send: after real write: c= %p n=%d fd=%d\n",c, n, fd);
1830         DBG("tcp_send: buf=\n%.*s\n", (int)len, buf);
1831         if (unlikely(n<(int)len)){
1832 #ifdef TCP_BUF_WRITE
1833                 if (tcp_options.tcp_buf_write && 
1834                                 ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK)){
1835                         enable_write_watch=_wbufq_empty(c);
1836                         if (n<0) n=0;
1837                         if (unlikely(_wbufq_add(c, buf+n, len-n)<0)){
1838                                 lock_release(&c->write_lock);
1839                                 n=-1;
1840                                 goto error;
1841                         }
1842                         lock_release(&c->write_lock);
1843                         n=len;
1844                         if (likely(enable_write_watch)){
1845                                 response[0]=(long)c;
1846                                 response[1]=CONN_QUEUED_WRITE;
1847                                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0){
1848                                         LOG(L_ERR, "BUG: tcp_send: error return failed "
1849                                                                 "(write):%s (%d)\n", strerror(errno), errno);
1850                                         n=-1;
1851                                         goto error;
1852                                 }
1853                         }
1854                         goto end;
1855                 }else{
1856                         lock_release(&c->write_lock);
1857                 }
1858 #endif /* TCP_BUF_WRITE */
1859                 LOG(L_ERR, "ERROR: tcp_send: failed to send on %p: %s (%d)\n",
1860                                         c, strerror(errno), errno);
1861 #ifdef TCP_BUF_WRITE
1862 error:
1863 #endif /* TCP_BUF_WRITE */
1864                 /* error on the connection , mark it as bad and set 0 timeout */
1865                 c->state=S_CONN_BAD;
1866                 c->timeout=get_ticks_raw();
1867                 /* tell "main" it should drop this (optional it will t/o anyway?)*/
1868                 response[0]=(long)c;
1869                 response[1]=CONN_ERROR;
1870                 if (send_all(unix_tcp_sock, response, sizeof(response))<=0){
1871                         LOG(L_CRIT, "BUG: tcp_send: error return failed (write):%s (%d)\n",
1872                                         strerror(errno), errno);
1873                         tcpconn_chld_put(c); /* deref. it manually */
1874                         n=-1;
1875                 }
1876                 /* CONN_ERROR will auto-dec refcnt => we must not call tcpconn_put 
1877                  * if it succeeds */
1878 #ifdef TCP_FD_CACHE
1879                 if (unlikely(fd_cache_e)){
1880                         LOG(L_ERR, "ERROR: tcp_send: error on cached fd, removing from the"
1881                                         "cache (%d, %p, %d)\n", 
1882                                         fd, fd_cache_e->con, fd_cache_e->id);
1883                         tcp_fd_cache_rm(fd_cache_e);
1884                         close(fd);
1885                 }else
1886 #endif /* TCP_FD_CACHE */
1887                 if (do_close_fd) close(fd);
1888                 return n; /* error return, no tcpconn_put */
1889         }
1890         
1891 #ifdef TCP_BUF_WRITE
1892         lock_release(&c->write_lock);
1893         if (likely(tcp_options.tcp_buf_write)){
1894                 if (unlikely(c->state==S_CONN_CONNECT))
1895                         c->state=S_CONN_OK;
1896         }
1897 #endif /* TCP_BUF_WRITE */
1898 end:
1899 #ifdef TCP_FD_CACHE
1900         if (unlikely((fd_cache_e==0) && use_fd_cache)){
1901                 tcp_fd_cache_add(c, fd);
1902         }else
1903 #endif /* TCP_FD_CACHE */
1904         if (do_close_fd) close(fd);
1905 release_c:
1906         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
1907 end_no_conn:
1908         return n;
1909 #ifdef TCP_CONNECT_WAIT
1910 conn_wait_error:
1911         /* connect or send failed on newly created connection which was not
1912          * yet sent to tcp_main (but was already hashed) => don't send to main,
1913          * unhash and destroy directly (if refcnt>2 it will be destroyed when the 
1914          * last sender releases the connection (tcpconn_chld_put(c))) or when
1915          * tcp_main receives a CONN_ERROR it*/
1916         c->state=S_CONN_BAD;
1917         TCPCONN_LOCK;
1918                 if (c->flags & F_CONN_HASHED){
1919                         /* if some other parallel tcp_send did send CONN_ERROR to
1920                          * tcp_main, the connection might be already detached */
1921                         _tcpconn_detach(c);
1922                         c->flags&=~F_CONN_HASHED;
1923                         TCPCONN_UNLOCK;
1924                         tcpconn_put(c);
1925                 }else
1926                         TCPCONN_UNLOCK;
1927         /* dec refcnt -> mark it for destruction */
1928         tcpconn_chld_put(c);
1929         return -1;
1930 #endif /* TCP_CONNET_WAIT */
1931 }
1932
1933
1934
1935 int tcp_init(struct socket_info* sock_info)
1936 {
1937         union sockaddr_union* addr;
1938         int optval;
1939 #ifdef HAVE_TCP_ACCEPT_FILTER
1940         struct accept_filter_arg afa;
1941 #endif /* HAVE_TCP_ACCEPT_FILTER */
1942 #ifdef DISABLE_NAGLE
1943         int flag;
1944         struct protoent* pe;
1945
1946         if (tcp_proto_no==-1){ /* if not already set */
1947                 pe=getprotobyname("tcp");
1948                 if (pe==0){
1949                         LOG(L_ERR, "ERROR: tcp_init: could not get TCP protocol number\n");
1950                         tcp_proto_no=-1;
1951                 }else{
1952                         tcp_proto_no=pe->p_proto;
1953                 }
1954         }
1955 #endif
1956         
1957         addr=&sock_info->su;
1958         /* sock_info->proto=PROTO_TCP; */
1959         if (init_su(addr, &sock_info->address, sock_info->port_no)<0){
1960                 LOG(L_ERR, "ERROR: tcp_init: could no init sockaddr_union\n");
1961                 goto error;
1962         }
1963         sock_info->socket=socket(AF2PF(addr->s.sa_family), SOCK_STREAM, 0);
1964         if (sock_info->socket==-1){
1965                 LOG(L_ERR, "ERROR: tcp_init: socket: %s\n", strerror(errno));
1966                 goto error;
1967         }
1968 #ifdef DISABLE_NAGLE
1969         flag=1;
1970         if ( (tcp_proto_no!=-1) &&
1971                  (setsockopt(sock_info->socket, tcp_proto_no , TCP_NODELAY,
1972                                          &flag, sizeof(flag))<0) ){
1973                 LOG(L_ERR, "ERROR: tcp_init: could not disable Nagle: %s\n",
1974                                 strerror(errno));
1975         }
1976 #endif
1977
1978
1979 #if  !defined(TCP_DONT_REUSEADDR) 
1980         /* Stevens, "Network Programming", Section 7.5, "Generic Socket
1981      * Options": "...server started,..a child continues..on existing
1982          * connection..listening server is restarted...call to bind fails
1983          * ... ALL TCP servers should specify the SO_REUSEADDRE option 
1984          * to allow the server to be restarted in this situation
1985          *
1986          * Indeed, without this option, the server can't restart.
1987          *   -jiri
1988          */
1989         optval=1;
1990         if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEADDR,
1991                                 (void*)&optval, sizeof(optval))==-1) {
1992                 LOG(L_ERR, "ERROR: tcp_init: setsockopt %s\n",
1993                         strerror(errno));
1994                 goto error;
1995         }
1996 #endif
1997         /* tos */
1998         optval = tos;
1999         if (setsockopt(sock_info->socket, IPPROTO_IP, IP_TOS, (void*)&optval, 
2000                                 sizeof(optval)) ==-1){
2001                 LOG(L_WARN, "WARNING: tcp_init: setsockopt tos: %s\n", strerror(errno));
2002                 /* continue since this is not critical */
2003         }
2004 #ifdef HAVE_TCP_DEFER_ACCEPT
2005         /* linux only */
2006         if (tcp_options.defer_accept){
2007                 optval=tcp_options.defer_accept;
2008                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_DEFER_ACCEPT,
2009                                         (void*)&optval, sizeof(optval)) ==-1){
2010                         LOG(L_WARN, "WARNING: tcp_init: setsockopt TCP_DEFER_ACCEPT %s\n",
2011                                                 strerror(errno));
2012                 /* continue since this is not critical */
2013                 }
2014         }
2015 #endif /* HAVE_TCP_DEFFER_ACCEPT */
2016 #ifdef HAVE_TCP_SYNCNT
2017         if (tcp_options.syncnt){
2018                 optval=tcp_options.syncnt;
2019                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_SYNCNT, &optval,
2020                                                 sizeof(optval))<0){
2021                         LOG(L_WARN, "WARNING: tcp_init: failed to set"
2022                                                 " maximum SYN retr. count: %s\n", strerror(errno));
2023                 }
2024         }
2025 #endif
2026 #ifdef HAVE_TCP_LINGER2
2027         if (tcp_options.linger2){
2028                 optval=tcp_options.linger2;
2029                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_LINGER2, &optval,
2030                                                 sizeof(optval))<0){
2031                         LOG(L_WARN, "WARNING: tcp_init: failed to set"
2032                                                 " maximum LINGER2 timeout: %s\n", strerror(errno));
2033                 }
2034         }
2035 #endif
2036         init_sock_keepalive(sock_info->socket);
2037         if (bind(sock_info->socket, &addr->s, sockaddru_len(*addr))==-1){
2038                 LOG(L_ERR, "ERROR: tcp_init: bind(%x, %p, %d) on %s:%d : %s\n",
2039                                 sock_info->socket,  &addr->s, 
2040                                 (unsigned)sockaddru_len(*addr),
2041                                 sock_info->address_str.s,
2042                                 sock_info->port_no,
2043                                 strerror(errno));
2044                 goto error;
2045         }
2046         if (listen(sock_info->socket, TCP_LISTEN_BACKLOG)==-1){
2047                 LOG(L_ERR, "ERROR: tcp_init: listen(%x, %p, %d) on %s: %s\n",
2048                                 sock_info->socket, &addr->s, 
2049                                 (unsigned)sockaddru_len(*addr),
2050                                 sock_info->address_str.s,
2051                                 strerror(errno));
2052                 goto error;
2053         }
2054 #ifdef HAVE_TCP_ACCEPT_FILTER
2055         /* freebsd */
2056         if (tcp_options.defer_accept){
2057                 memset(&afa, 0, sizeof(afa));
2058                 strcpy(afa.af_name, "dataready");
2059                 if (setsockopt(sock_info->socket, SOL_SOCKET, SO_ACCEPTFILTER,
2060                                         (void*)&afa, sizeof(afa)) ==-1){
2061                         LOG(L_WARN, "WARNING: tcp_init: setsockopt SO_ACCEPTFILTER %s\n",
2062                                                 strerror(errno));
2063                 /* continue since this is not critical */
2064                 }
2065         }
2066 #endif /* HAVE_TCP_ACCEPT_FILTER */
2067         
2068         return 0;
2069 error:
2070         if (sock_info->socket!=-1){
2071                 close(sock_info->socket);
2072                 sock_info->socket=-1;
2073         }
2074         return -1;
2075 }
2076
2077
2078
2079 /* close tcp_main's fd from a tcpconn
2080  * WARNING: call only in tcp_main context */
2081 inline static void tcpconn_close_main_fd(struct tcp_connection* tcpconn)
2082 {
2083         int fd;
2084         
2085         
2086         fd=tcpconn->s;
2087 #ifdef USE_TLS
2088         /*FIXME: lock ->writelock ? */
2089         if (tcpconn->type==PROTO_TLS)
2090                 tls_close(tcpconn, fd);
2091 #endif
2092 #ifdef TCP_FD_CACHE
2093         if (likely(tcp_options.fd_cache)) shutdown(fd, SHUT_RDWR);
2094 #endif /* TCP_FD_CACHE */
2095 close_again:
2096         if (unlikely(close(fd)<0)){
2097                 if (errno==EINTR)
2098                         goto close_again;
2099                 LOG(L_ERR, "ERROR: tcpconn_put_destroy; close() failed: %s (%d)\n",
2100                                 strerror(errno), errno);
2101         }
2102 }
2103
2104
2105
2106 /* dec refcnt & frees the connection if refcnt==0
2107  * returns 1 if the connection is freed, 0 otherwise
2108  *
2109  * WARNING: use only from child processes */
2110 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn)
2111 {
2112         if (unlikely(atomic_dec_and_test(&tcpconn->refcnt))){
2113                 DBG("tcpconn_chld_put: destroying connection %p (%d, %d) "
2114                                 "flags %04x\n", tcpconn, tcpconn->id,
2115                                 tcpconn->s, tcpconn->flags);
2116                 /* sanity checks */
2117                 membar_read_atomic_op(); /* make sure we see the current flags */
2118                 if (unlikely(!(tcpconn->flags & F_CONN_FD_CLOSED) || 
2119                                          !(tcpconn->flags & F_CONN_REMOVED) || 
2120                                         (tcpconn->flags & 
2121                                                 (F_CONN_HASHED|F_CONN_MAIN_TIMER| F_CONN_WRITE_W)) )){
2122                         LOG(L_CRIT, "BUG: tcpconn_chld_put: %p bad flags = %0x\n",
2123                                         tcpconn, tcpconn->flags);
2124                         abort();
2125                 }
2126                 _tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
2127                 return 1;
2128         }
2129         return 0;
2130 }
2131
2132
2133
2134 /* simple destroy function (the connection should be already removed
2135  * from the hashes and the fds should not be watched anymore for IO)
2136  */
2137 inline static void tcpconn_destroy(struct tcp_connection* tcpconn)
2138 {
2139                 DBG("tcpconn_destroy: destroying connection %p (%d, %d) "
2140                                 "flags %04x\n", tcpconn, tcpconn->id,
2141                                 tcpconn->s, tcpconn->flags);
2142                 if (unlikely(tcpconn->flags & F_CONN_HASHED)){
2143                         LOG(L_CRIT, "BUG: tcpconn_destroy: called with hashed"
2144                                                 " connection (%p)\n", tcpconn);
2145                         /* try to continue */
2146                         if (likely(tcpconn->flags & F_CONN_MAIN_TIMER))
2147                                 local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2148                         TCPCONN_LOCK;
2149                                 _tcpconn_detach(tcpconn);
2150                         TCPCONN_UNLOCK;
2151                 }
2152                 if (likely(!(tcpconn->flags & F_CONN_FD_CLOSED))){
2153                         tcpconn_close_main_fd(tcpconn);
2154                         (*tcp_connections_no)--;
2155                 }
2156                 _tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
2157 }
2158
2159
2160
2161 /* tries to destroy the connection: dec. refcnt and if 0 destroys the
2162  *  connection, else it will mark it as BAD and close the main fds
2163  *
2164  * returns 1 if the connection was destroyed, 0 otherwise
2165  *
2166  * WARNING: - the connection _has_ to be removed from the hash and timer
2167  *  first (use tcpconn_try_unhash() for this )
2168  *         - the fd should not be watched anymore (io_watch_del()...)
2169  *         - must be called _only_ from the tcp_main process context
2170  *          (or else the fd will remain open)
2171  */
2172 inline static int tcpconn_put_destroy(struct tcp_connection* tcpconn)
2173 {
2174         if (unlikely((tcpconn->flags & 
2175                                                 (F_CONN_WRITE_W|F_CONN_HASHED|F_CONN_MAIN_TIMER)) ||
2176                                 !(tcpconn->flags & F_CONN_REMOVED) )){
2177                 /* sanity check */
2178                 if (unlikely(tcpconn->flags & F_CONN_HASHED)){
2179                         LOG(L_CRIT, "BUG: tcpconn_destroy: called with hashed and/or"
2180                                                 "on timer connection (%p), flags = %0x\n",
2181                                                 tcpconn, tcpconn->flags);
2182                         /* try to continue */
2183                         if (likely(tcpconn->flags & F_CONN_MAIN_TIMER))
2184                                 local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2185                         TCPCONN_LOCK;
2186                                 _tcpconn_detach(tcpconn);
2187                         TCPCONN_UNLOCK;
2188                 }else{
2189                         LOG(L_CRIT, "BUG: tcpconn_put_destroy: %p flags = %0x\n",
2190                                         tcpconn, tcpconn->flags);
2191                 }
2192         }
2193         tcpconn->state=S_CONN_BAD;
2194         /* in case it's still in a reader timer */
2195         tcpconn->timeout=get_ticks_raw();
2196         /* fast close: close fds now */
2197         if (likely(!(tcpconn->flags & F_CONN_FD_CLOSED))){
2198                 tcpconn_close_main_fd(tcpconn);
2199                 tcpconn->flags|=F_CONN_FD_CLOSED;
2200                 (*tcp_connections_no)--;
2201         }
2202         /* all the flags / ops on the tcpconn must be done prior to decrementing
2203          * the refcnt. and at least a membar_write_atomic_op() mem. barrier or
2204          *  a mb_atomic_* op must * be used to make sure all the changed flags are
2205          *  written into memory prior to the new refcnt value */
2206         if (unlikely(mb_atomic_dec_and_test(&tcpconn->refcnt))){
2207                 _tcpconn_free(tcpconn);
2208                 return 1;
2209         }
2210         return 0;
2211 }
2212
2213
2214
2215 /* try to remove a connection from the hashes and timer.
2216  * returns 1 if the connection was removed, 0 if not (connection not in
2217  *  hash)
2218  *
2219  * WARNING: call it only in the  tcp_main process context or else the
2220  *  timer removal won't work.
2221  */
2222 inline static int tcpconn_try_unhash(struct tcp_connection* tcpconn)
2223 {
2224         if (likely(tcpconn->flags & F_CONN_HASHED)){
2225                 tcpconn->state=S_CONN_BAD;
2226                 if (likely(tcpconn->flags & F_CONN_MAIN_TIMER)){
2227                         local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2228                         tcpconn->flags&=~F_CONN_MAIN_TIMER;
2229                 }else
2230                         /* in case it's still in a reader timer */
2231                         tcpconn->timeout=get_ticks_raw();
2232                 TCPCONN_LOCK;
2233                         if (tcpconn->flags & F_CONN_HASHED){
2234                                 tcpconn->flags&=~F_CONN_HASHED;
2235                                 _tcpconn_detach(tcpconn);
2236                                 TCPCONN_UNLOCK;
2237                         }else{
2238                                 /* tcp_send was faster and did unhash it itself */
2239                                 TCPCONN_UNLOCK;
2240                                 return 0;
2241                         }
2242 #ifdef TCP_BUF_WRITE
2243                 /* empty possible write buffers (optional) */
2244                 if (unlikely(_wbufq_non_empty(tcpconn))){
2245                         lock_get(&tcpconn->write_lock);
2246                                 /* check again, while holding the lock */
2247                                 if (likely(_wbufq_non_empty(tcpconn)))
2248                                         _wbufq_destroy(&tcpconn->wbuf_q);
2249                         lock_release(&tcpconn->write_lock);
2250                 }
2251 #endif /* TCP_BUF_WRITE */
2252                 return 1;
2253         }
2254         return 0;
2255 }
2256
2257
2258
2259 #ifdef SEND_FD_QUEUE
/* one queued "send the fd of this connection on this unix socket" request
 * (filled in by send_fd_queue_add(), consumed by send_fd_queue_run()) */
struct send_fd_info{
	struct tcp_connection* tcp_conn; /* connection whose fd (->s) is sent */
	ticks_t expire; /* ticks value after which the send is not retried */
	int unix_sock; /* unix socket on which the fd is sent */
	unsigned int retries; /* debugging */
};

/* array based queue of send_fd_info entries; valid entries are
 * data[0] ... crt-1 */
struct tcp_send_fd_q{
	struct send_fd_info* data; /* buffer */
	struct send_fd_info* crt;  /* pointer inside the buffer */
	struct send_fd_info* end;  /* points after the last valid position */
};


/* queue instance set up by init_send_fd_queues() and released by
 * destroy_send_fd_queues() */
static struct tcp_send_fd_q send2child_q;
2275
2276
2277
2278 static int send_fd_queue_init(struct tcp_send_fd_q *q, unsigned int size)
2279 {
2280         q->data=pkg_malloc(size*sizeof(struct send_fd_info));
2281         if (q->data==0){
2282                 LOG(L_ERR, "ERROR: send_fd_queue_init: out of memory\n");
2283                 return -1;
2284         }
2285         q->crt=&q->data[0];
2286         q->end=&q->data[size];
2287         return 0;
2288 }
2289
2290 static void send_fd_queue_destroy(struct tcp_send_fd_q *q)
2291 {
2292         if (q->data){
2293                 pkg_free(q->data);
2294                 q->data=0;
2295                 q->crt=q->end=0;
2296         }
2297 }
2298
2299
2300
2301 static int init_send_fd_queues()
2302 {
2303         if (send_fd_queue_init(&send2child_q, SEND_FD_QUEUE_SIZE)!=0)
2304                 goto error;
2305         return 0;
2306 error:
2307         LOG(L_ERR, "ERROR: init_send_fd_queues: init failed\n");
2308         return -1;
2309 }
2310
2311
2312
/* Releases the memory held by send2child_q (counterpart of
 * init_send_fd_queues()). */
static void destroy_send_fd_queues()
{
	send_fd_queue_destroy(&send2child_q);
}
2317
2318
2319
2320
2321 inline static int send_fd_queue_add(    struct tcp_send_fd_q* q, 
2322                                                                                 int unix_sock,
2323                                                                                 struct tcp_connection *t)
2324 {
2325         struct send_fd_info* tmp;
2326         unsigned long new_size;
2327         
2328         if (q->crt>=q->end){
2329                 new_size=q->end-&q->data[0];
2330                 if (new_size< MAX_SEND_FD_QUEUE_SIZE/2){
2331                         new_size*=2;
2332                 }else new_size=MAX_SEND_FD_QUEUE_SIZE;
2333                 if (unlikely(q->crt>=&q->data[new_size])){
2334                         LOG(L_ERR, "ERROR: send_fd_queue_add: queue full: %ld/%ld\n",
2335                                         (long)(q->crt-&q->data[0]-1), new_size);
2336                         goto error;
2337                 }
2338                 LOG(L_CRIT, "INFO: send_fd_queue: queue full: %ld, extending to %ld\n",
2339                                 (long)(q->end-&q->data[0]), new_size);
2340                 tmp=pkg_realloc(q->data, new_size*sizeof(struct send_fd_info));
2341                 if (unlikely(tmp==0)){
2342                         LOG(L_ERR, "ERROR: send_fd_queue_add: out of memory\n");
2343                         goto error;
2344                 }
2345                 q->crt=(q->crt-&q->data[0])+tmp;
2346                 q->data=tmp;
2347                 q->end=&q->data[new_size];
2348         }
2349         q->crt->tcp_conn=t;
2350         q->crt->unix_sock=unix_sock;
2351         q->crt->expire=get_ticks_raw()+SEND_FD_QUEUE_TIMEOUT;
2352         q->crt->retries=0;
2353         q->crt++;
2354         return 0;
2355 error:
2356         return -1;
2357 }
2358
2359
2360
2361 inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
2362 {
2363         struct send_fd_info* p;
2364         struct send_fd_info* t;
2365         
2366         for (p=t=&q->data[0]; p<q->crt; p++){
2367                 if (unlikely(send_fd(p->unix_sock, &(p->tcp_conn),
2368                                         sizeof(struct tcp_connection*), p->tcp_conn->s)<=0)){
2369                         if ( ((errno==EAGAIN)||(errno==EWOULDBLOCK)) && 
2370                                                         ((s_ticks_t)(p->expire-get_ticks_raw())>0)){
2371                                 /* leave in queue for a future try */
2372                                 *t=*p;
2373                                 t->retries++;
2374                                 t++;
2375                         }else{
2376                                 LOG(L_ERR, "ERROR: run_send_fd_queue: send_fd failed"
2377                                                    " on socket %d , queue entry %ld, retries %d,"
2378                                                    " connection %p, tcp socket %d, errno=%d (%s) \n",
2379                                                    p->unix_sock, (long)(p-&q->data[0]), p->retries,
2380                                                    p->tcp_conn, p->tcp_conn->s, errno,
2381                                                    strerror(errno));
2382 #ifdef TCP_BUF_WRITE
2383                                 if (p->tcp_conn->flags & F_CONN_WRITE_W){
2384                                         io_watch_del(&io_h, p->tcp_conn->s, -1, IO_FD_CLOSING);
2385                                         p->tcp_conn->flags &=~F_CONN_WRITE_W;
2386                                 }
2387 #endif
2388                                 p->tcp_conn->flags &= ~F_CONN_READER;
2389                                 if (likely(tcpconn_try_unhash(p->tcp_conn)))
2390                                         tcpconn_put(p->tcp_conn);
2391                                 tcpconn_put_destroy(p->tcp_conn); /* dec refcnt & destroy */
2392                         }
2393                 }
2394         }
2395         q->crt=t;
2396 }
2397 #else
2398 #define send_fd_queue_run(q)
2399 #endif
2400
2401
/* Non-blocking write on a tcp connection, unsafe version: it should be
 * called while holding c->write_lock and fd should be non-blocking.
 *
 * Plain connections use send() (with MSG_NOSIGNAL when available), TLS
 * connections (USE_TLS) go through tls_blocking_write(). The write is
 * restarted if it fails with EINTR.
 *
 * returns the number of bytes written on success, -1 on error (errno is
 *  set)
 */
inline static int _tcpconn_write_nb(int fd, struct tcp_connection* c,
									char* buf, int len)
{
	int ret;

	do{
#ifdef USE_TLS
		if (unlikely(c->type==PROTO_TLS))
			/* FIXME: tls_nonblocking_write !! */
			ret=tls_blocking_write(c, fd, buf, len);
		else
#endif /* USE_TLS */
#ifdef HAVE_MSG_NOSIGNAL
			ret=send(fd, buf, len, MSG_NOSIGNAL);
#else
			ret=send(fd, buf, len, 0);
#endif /* HAVE_MSG_NOSIGNAL */
	}while(unlikely(ret<0) && (errno==EINTR)); /* interrupted => retry */
	return ret;
}
2430
2431
2432
2433 /* handles io from a tcp child process
2434  * params: tcp_c - pointer in the tcp_children array, to the entry for
2435  *                 which an io event was detected 
2436  *         fd_i  - fd index in the fd_array (usefull for optimizing
2437  *                 io_watch_deletes)
2438  * returns:  handle_* return convention: -1 on error, 0 on EAGAIN (no more
2439  *           io events queued), >0 on success. success/error refer only to
2440  *           the reads from the fd.
2441  */
2442 inline static int handle_tcp_child(struct tcp_child* tcp_c, int fd_i)
2443 {
2444         struct tcp_connection* tcpconn;
2445         long response[2];
2446         int cmd;
2447         int bytes;
2448         int n;
2449         ticks_t t;
2450         ticks_t crt_timeout;
2451         
2452         if (unlikely(tcp_c->unix_sock<=0)){
2453                 /* (we can't have a fd==0, 0 is never closed )*/
2454                 LOG(L_CRIT, "BUG: handle_tcp_child: fd %d for %d "
2455                                 "(pid %d, ser no %d)\n", tcp_c->unix_sock,
2456                                 (int)(tcp_c-&tcp_children[0]), tcp_c->pid, tcp_c->proc_no);
2457                 goto error;
2458         }
2459         /* read until sizeof(response)
2460          * (this is a SOCK_STREAM so read is not atomic) */
2461         bytes=recv_all(tcp_c->unix_sock, response, sizeof(response), MSG_DONTWAIT);
2462         if (unlikely(bytes<(int)sizeof(response))){
2463                 if (bytes==0){
2464                         /* EOF -> bad, child has died */
2465                         DBG("DBG: handle_tcp_child: dead tcp child %d (pid %d, no %d)"
2466                                         " (shutting down?)\n", (int)(tcp_c-&tcp_children[0]), 
2467                                         tcp_c->pid, tcp_c->proc_no );
2468                         /* don't listen on it any more */
2469                         io_watch_del(&io_h, tcp_c->unix_sock, fd_i, 0); 
2470                         goto error; /* eof. so no more io here, it's ok to return error */
2471                 }else if (bytes<0){
2472                         /* EAGAIN is ok if we try to empty the buffer
2473                          * e.g.: SIGIO_RT overflow mode or EPOLL ET */
2474                         if ((errno!=EAGAIN) && (errno!=EWOULDBLOCK)){
2475                                 LOG(L_CRIT, "ERROR: handle_tcp_child: read from tcp child %ld "
2476                                                 " (pid %d, no %d) %s [%d]\n",
2477                                                 (long)(tcp_c-&tcp_children[0]), tcp_c->pid,
2478                                                 tcp_c->proc_no, strerror(errno), errno );
2479                         }else{
2480                                 bytes=0;
2481                         }
2482                         /* try to ignore ? */
2483                         goto end;
2484                 }else{
2485                         /* should never happen */
2486                         LOG(L_CRIT, "BUG: handle_tcp_child: too few bytes received (%d)\n",
2487                                         bytes );
2488                         bytes=0; /* something was read so there is no error; otoh if
2489                                           receive_fd returned less then requested => the receive
2490                                           buffer is empty => no more io queued on this fd */
2491                         goto end;
2492                 }
2493         }
2494         
2495         DBG("handle_tcp_child: reader response= %lx, %ld from %d \n",
2496                                         response[0], response[1], (int)(tcp_c-&tcp_children[0]));
2497         cmd=response[1];
2498         tcpconn=(struct tcp_connection*)response[0];
2499         if (unlikely(tcpconn==0)){
2500                 /* should never happen */
2501                 LOG(L_CRIT, "BUG: handle_tcp_child: null tcpconn pointer received"
2502                                  " from tcp child %d (pid %d): %lx, %lx\n",
2503                                         (int)(tcp_c-&tcp_children[0]), tcp_c->pid,
2504                                         response[0], response[1]) ;
2505                 goto end;
2506         }
2507         switch(cmd){
2508                 case CONN_RELEASE:
2509                         tcp_c->busy--;
2510                         if (unlikely(tcpconn_put(tcpconn))){
2511                                 tcpconn_destroy(tcpconn);
2512                                 break;
2513                         }
2514                         if (unlikely(tcpconn->state==S_CONN_BAD)){ 
2515 #ifdef TCP_BUF_WRITE
2516                                 if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
2517                                         io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2518                                         tcpconn->flags &= ~F_CONN_WRITE_W;
2519                                 }
2520 #endif /* TCP_BUF_WRITE */
2521                                 if (tcpconn_try_unhash(tcpconn))
2522                                         tcpconn_put_destroy(tcpconn);
2523                                 break;
2524                         }
2525                         /* update the timeout*/
2526                         t=get_ticks_raw();
2527                         tcpconn->timeout=t+tcp_con_lifetime;
2528                         crt_timeout=tcp_con_lifetime;
2529 #ifdef TCP_BUF_WRITE
2530                         if (unlikely(tcp_options.tcp_buf_write && 
2531                                                         _wbufq_non_empty(tcpconn) )){
2532                                 if (unlikely(TICKS_LE(t, tcpconn->wbuf_q.wr_timeout))){
2533                                         DBG("handle_tcp_child: wr. timeout on CONN_RELEASE for %p "
2534                                                         "refcnt= %d\n", tcpconn,
2535                                                         atomic_get(&tcpconn->refcnt));
2536                                         /* timeout */
2537                                         if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
2538                                                 io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2539                                                 tcpconn->flags&=~F_CONN_WRITE_W;
2540                                         }
2541                                         if (tcpconn_try_unhash(tcpconn))
2542                                                 tcpconn_put_destroy(tcpconn);
2543                                         break;
2544                                 }else{
2545                                         crt_timeout=MIN_unsigned(tcp_con_lifetime,
2546                                                                                         tcpconn->wbuf_q.wr_timeout-t);
2547                                 }
2548                         }
2549 #endif /* TCP_BUF_WRITE */
2550                         /* re-activate the timer */
2551                         tcpconn->timer.f=tcpconn_main_timeout;
2552                         local_timer_reinit(&tcpconn->timer);
2553                         local_timer_add(&tcp_main_ltimer, &tcpconn->timer, crt_timeout, t);
2554                         /* must be after the de-ref*/
2555                         tcpconn->flags|=F_CONN_MAIN_TIMER;
2556                         tcpconn->flags&=~(F_CONN_REMOVED|F_CONN_READER);
2557 #ifdef TCP_BUF_WRITE
2558                         if (unlikely(tcpconn->flags & F_CONN_WRITE_W))
2559                                 n=io_watch_chg(&io_h, tcpconn->s, POLLIN| POLLOUT, -1);
2560                         else
2561 #endif /* TCP_BUF_WRITE */
2562                                 n=io_watch_add(&io_h, tcpconn->s, POLLIN, F_TCPCONN, tcpconn);
2563                         if (unlikely(n<0)){
2564                                 LOG(L_CRIT, "ERROR: tcp_main: handle_tcp_child: failed to add"
2565                                                 " new socket to the fd list\n");
2566                                 tcpconn->flags|=F_CONN_REMOVED;
2567 #ifdef TCP_BUF_WRITE
2568                                 if (unlikely(tcpconn->flags & F_CONN_WRITE_W)){
2569                                         io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2570                                         tcpconn->flags&=~F_CONN_WRITE_W;
2571                                 }
2572 #endif /* TCP_BUF_WRITE */
2573                                 if (tcpconn_try_unhash(tcpconn));
2574                                         tcpconn_put_destroy(tcpconn);
2575                                 break;
2576                         }
2577                         DBG("handle_tcp_child: CONN_RELEASE  %p refcnt= %d\n", 
2578                                                         tcpconn, atomic_get(&tcpconn->refcnt));
2579                         break;
2580                 case CONN_ERROR:
2581                 case CONN_DESTROY:
2582                 case CONN_EOF:
2583                         /* WARNING: this will auto-dec. refcnt! */
2584                                 tcp_c->busy--;
2585                                 /* main doesn't listen on it => we don't have to delete it
2586                                  if (tcpconn->s!=-1)
2587                                         io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2588                                 */
2589 #ifdef TCP_BUF_WRITE
2590                                 if ((tcpconn->flags & F_CONN_WRITE_W) && (tcpconn->s!=-1)){
2591                                         io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2592                                         tcpconn->flags&=~F_CONN_WRITE_W;
2593                                 }
2594 #endif /* TCP_BUF_WRITE */
2595                                 if (tcpconn_try_unhash(tcpconn))
2596                                         tcpconn_put(tcpconn);
2597                                 tcpconn_put_destroy(tcpconn); /* deref & delete if refcnt==0 */
2598                                 break;
2599                 default:
2600                                 LOG(L_CRIT, "BUG: handle_tcp_child:  unknown cmd %d"
2601                                                                         " from tcp reader %d\n",
2602                                                                         cmd, (int)(tcp_c-&tcp_children[0]));
2603         }
2604 end:
2605         return bytes;
2606 error:
2607         return -1;
2608 }
2609
2610
2611
/* handles io from a "generic" ser process (get fd or new_fd from a tcp_send)
 * 
 * params: p     - pointer in the ser processes array (pt[]), to the entry for
 *                 which an io event was detected
 *         fd_i  - fd index in the fd_array (useful for optimizing
 *                 io_watch_deletes)
 * returns:  handle_* return convention:
 *          -1 on error reading from the fd,
 *           0 on EAGAIN  or when no  more io events are queued 
 *             (receive buffer empty),
 *           >0 on successful reads from the fd (the receive buffer might
 *             be non-empty).
 */
2625 inline static int handle_ser_child(struct process_table* p, int fd_i)
2626 {
2627         struct tcp_connection* tcpconn;
2628         long response[2];
2629         int cmd;
2630         int bytes;
2631         int ret;
2632         int fd;
2633         int flags;
2634         ticks_t t;
2635         
2636         ret=-1;
2637         if (unlikely(p->unix_sock<=0)){
2638                 /* (we can't have a fd==0, 0 is never closed )*/
2639                 LOG(L_CRIT, "BUG: handle_ser_child: fd %d for %d "
2640                                 "(pid %d)\n", p->unix_sock, (int)(p-&pt[0]), p->pid);
2641                 goto error;
2642         }
2643                         
2644         /* get all bytes and the fd (if transmitted)
2645          * (this is a SOCK_STREAM so read is not atomic) */
2646         bytes=receive_fd(p->unix_sock, response, sizeof(response), &fd,
2647                                                 MSG_DONTWAIT);
2648         if (unlikely(bytes<(int)sizeof(response))){
2649                 /* too few bytes read */
2650                 if (bytes==0){
2651                         /* EOF -> bad, child has died */
2652                         DBG("DBG: handle_ser_child: dead child %d, pid %d"
2653                                         " (shutting down?)\n", (int)(p-&pt[0]), p->pid);
2654                         /* don't listen on it any more */
2655                         io_watch_del(&io_h, p->unix_sock, fd_i, 0);
2656                         goto error; /* child dead => no further io events from it */
2657                 }else if (bytes<0){
2658                         /* EAGAIN is ok if we try to empty the buffer
2659                          * e.g: SIGIO_RT overflow mode or EPOLL ET */
2660                         if ((errno!=EAGAIN) && (errno!=EWOULDBLOCK)){
2661                                 LOG(L_CRIT, "ERROR: handle_ser_child: read from child %d  "
2662                                                 "(pid %d):  %s [%d]\n", (int)(p-&pt[0]), p->pid,
2663                                                 strerror(errno), errno);
2664                                 ret=-1;
2665                         }else{
2666                                 ret=0;
2667                         }
2668                         /* try to ignore ? */
2669                         goto end;
2670                 }else{
2671                         /* should never happen */
2672                         LOG(L_CRIT, "BUG: handle_ser_child: too few bytes received (%d)\n",
2673                                         bytes );
2674                         ret=0; /* something was read so there is no error; otoh if
2675                                           receive_fd returned less then requested => the receive
2676                                           buffer is empty => no more io queued on this fd */
2677                         goto end;
2678                 }
2679         }
2680         ret=1; /* something was received, there might be more queued */
2681         DBG("handle_ser_child: read response= %lx, %ld, fd %d from %d (%d)\n",
2682                                         response[0], response[1], fd, (int)(p-&pt[0]), p->pid);
2683         cmd=response[1];
2684         tcpconn=(struct tcp_connection*)response[0];
2685         if (unlikely(tcpconn==0)){
2686                 LOG(L_CRIT, "BUG: handle_ser_child: null tcpconn pointer received"
2687                                  " from child %d (pid %d): %lx, %lx\n",
2688                                         (int)(p-&pt[0]), p->pid, response[0], response[1]) ;
2689                 goto end;
2690         }
2691         switch(cmd){
2692                 case CONN_ERROR:
2693                         LOG(L_ERR, "handle_ser_child: ERROR: received CON_ERROR for %p"
2694                                         " (id %d), refcnt %d\n", 
2695                                         tcpconn, tcpconn->id, atomic_get(&tcpconn->refcnt));
2696 #ifdef TCP_CONNECT_WAIT
2697                         /* if the connection is pending => it might be on the way of
2698                          * reaching tcp_main (e.g. CONN_NEW_COMPLETE or 
2699                          *  CONN_NEW_PENDING_WRITE) =>  it cannot be destroyed here */
2700                         if ( !(tcpconn->flags & F_CONN_PENDING) && 
2701                                         tcpconn_try_unhash(tcpconn) )
2702                                 tcpconn_put(tcpconn);
2703 #else /* ! TCP_CONNECT_WAIT */
2704                         if ( tcpconn_try_unhash(tcpconn) )
2705                                 tcpconn_put(tcpconn);
2706 #endif /* TCP_CONNECT_WAIT */
2707                         if ( (!(tcpconn->flags & F_CONN_REMOVED) ||
2708                                         (tcpconn->flags & F_CONN_WRITE_W) ) && (tcpconn->s!=-1)){
2709                                 io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2710                                 tcpconn->flags|=F_CONN_REMOVED;
2711                                 tcpconn->flags&=~F_CONN_WRITE_W;
2712                         }
2713                         tcpconn_put_destroy(tcpconn); /* dec refcnt & destroy on 0 */
2714                         break;
2715                 case CONN_GET_FD:
2716                         /* send the requested FD  */
2717                         /* WARNING: take care of setting refcnt properly to
2718                          * avoid race conditions */
2719                         if (unlikely(send_fd(p->unix_sock, &tcpconn, sizeof(tcpconn),
2720                                                                 tcpconn->s)<=0)){
2721                                 LOG(L_ERR, "ERROR: handle_ser_child: send_fd failed\n");
2722                         }
2723                         break;
2724                 case CONN_NEW:
2725                         /* update the fd in the requested tcpconn*/
2726                         /* WARNING: take care of setting refcnt properly to
2727                          * avoid race conditions */
2728                         if (unlikely(fd==-1)){
2729                                 LOG(L_CRIT, "BUG: handle_ser_child: CONN_NEW:"
2730                                                         " no fd received\n");
2731                                 tcpconn->flags|=F_CONN_FD_CLOSED;
2732                                 tcpconn_put_destroy(tcpconn);
2733                                 break;
2734                         }
2735                         (*tcp_connections_no)++;
2736                         tcpconn->s=fd;
2737                         /* add tcpconn to the list*/
2738                         tcpconn_add(tcpconn);
2739                         /* update the timeout*/
2740                         t=get_ticks_raw();
2741                         tcpconn->timeout=t+tcp_con_lifetime;
2742                         /* activate the timer (already properly init. in tcpconn_new())
2743                          * no need for reinit */
2744                         local_timer_add(&tcp_main_ltimer, &tcpconn->timer, 
2745                                                                 tcp_con_lifetime, t);
2746                         tcpconn->flags|=F_CONN_MAIN_TIMER;
2747                         tcpconn->flags&=~(F_CONN_REMOVED|F_CONN_FD_CLOSED);
2748                         flags=POLLIN 
2749 #ifdef TCP_BUF_WRITE
2750                                         /* not used for now, the connection is sent to tcp_main
2751                                          * before knowing if we can write on it or we should 
2752                                          * wait */
2753                                         | (((int)!(tcpconn->flags & F_CONN_WRITE_W)-1) & POLLOUT)
2754 #endif /* TCP_BUF_WRITE */
2755                                         ;
2756                         if (unlikely(
2757                                         io_watch_add(&io_h, tcpconn->s, flags,
2758                                                                                                 F_TCPCONN, tcpconn)<0)){
2759                                 LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed to add"
2760                                                 " new socket to the fd list\n");
2761                                 tcpconn->flags|=F_CONN_REMOVED;
2762                                 tcpconn->flags&=~F_CONN_WRITE_W;
2763                                 tcpconn_try_unhash(tcpconn); /*  unhash & dec refcnt */
2764                                 tcpconn_put_destroy(tcpconn);
2765                         }
2766                         break;
2767 #ifdef TCP_BUF_WRITE
2768                 case CONN_QUEUED_WRITE:
2769                         /* received only if the wr. queue is empty and a write finishes
2770                          * with EAGAIN (common after connect())
2771                          * it should only enable write watching on the fd. The connection
2772                          * should be  already in the hash. The refcnt is not changed.
2773                          */
2774                         if (unlikely((tcpconn->state==S_CONN_BAD) || 
2775                                                         !(tcpconn->flags & F_CONN_HASHED) ))
2776                                 break;
2777                         if (!(tcpconn->flags & F_CONN_WRITE_W)){
2778                                 t=get_ticks_raw();
2779                                 if (likely((tcpconn->flags & F_CONN_MAIN_TIMER) && 
2780                                         (TICKS_LT(tcpconn->wbuf_q.wr_timeout, tcpconn->timeout)) &&
2781                                                 TICKS_LT(t, tcpconn->wbuf_q.wr_timeout) )){
2782                                         /* _wbufq_nonempty() is guaranteed here */
2783                                         /* update the timer */
2784                                         local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2785                                         local_timer_reinit(&tcpconn->timer);
2786                                         local_timer_add(&tcp_main_ltimer, &tcpconn->timer,
2787                                                                                 tcpconn->wbuf_q.wr_timeout-t, t);
2788                                         DBG("tcp_main: handle_ser_child: CONN_QUEUED_WRITE; %p "
2789                                                         "timeout adjusted to %d s\n", tcpconn, 
2790                                                         TICKS_TO_S(tcpconn->wbuf_q.wr_timeout-t));
2791                                 }
2792                                 if (tcpconn->flags& F_CONN_REMOVED){
2793                                         if (unlikely(io_watch_add(&io_h, tcpconn->s, POLLOUT,
2794                                                                                                 F_TCPCONN, tcpconn)<0)){
2795                                                 LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed"
2796                                                                     " to enable write watch on socket\n");
2797                                                 if (tcpconn_try_unhash(tcpconn))
2798                                                         tcpconn_put_destroy(tcpconn);
2799                                                 break;
2800                                         }
2801                                 }else{
2802                                         if (unlikely(io_watch_chg(&io_h, tcpconn->s,
2803                                                                                                 POLLIN|POLLOUT, -1)<0)){
2804                                                 LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed"
2805                                                                     " to change socket watch events\n");
2806                                                 io_watch_del(&io_h, tcpconn->s, -1, IO_FD_CLOSING);
2807                                                 tcpconn->flags|=F_CONN_REMOVED;
2808                                                 if (tcpconn_try_unhash(tcpconn))
2809                                                         tcpconn_put_destroy(tcpconn);
2810                                                 break;
2811                                         }
2812                                 }
2813                                 tcpconn->flags|=F_CONN_WRITE_W;
2814                         }else{
2815                                 LOG(L_WARN, "tcp_main: hanlder_ser_child: connection %p"
2816                                                         " already watched for write\n", tcpconn);
2817                         }
2818                         break;
2819 #ifdef TCP_CONNECT_WAIT
2820                 case CONN_NEW_COMPLETE:
2821                 case CONN_NEW_PENDING_WRITE:
2822                                 /* received when a pending connect completes in the same
2823                                  * tcp_send() that initiated it
2824                                  * the connection is already in the hash with S_CONN_PENDING
2825                                  * state (added by tcp_send()) and refcnt at least 1 (for the
2826                                  *  hash)*/
2827                         tcpconn->flags&=~(F_CONN_PENDING|F_CONN_FD_CLOSED|F_CONN_REMOVED);
2828                         if (unlikely((tcpconn->state==S_CONN_BAD) || (fd==-1))){
2829                                 if (unlikely(fd==-1))
2830                                         LOG(L_CRIT, "BUG: handle_ser_child: CONN_NEW_COMPLETE:"
2831                                                                 " no fd received\n");
2832                                 else
2833                                         LOG(L_WARN, "WARNING: handle_ser_child: CONN_NEW_COMPLETE:"
2834                                                         " received connection with error\n");
2835                                 tcpconn->flags|=F_CONN_FD_CLOSED;
2836                                 tcpconn->state=S_CONN_BAD;
2837                                 tcpconn_try_unhash(tcpconn);
2838                                 tcpconn_put_destroy(tcpconn);
2839                                 break;
2840                         }
2841                         tcpconn->state=S_CONN_OK;
2842                         (*tcp_connections_no)++;
2843                         tcpconn->s=fd;
2844                         /* update the timeout*/
2845                         t=get_ticks_raw();
2846                         tcpconn->timeout=t+tcp_con_lifetime;
2847                         /* activate the timer (already properly init. in tcpconn_new())
2848                          * no need for reinit */
2849                         local_timer_add(&tcp_main_ltimer, &tcpconn->timer, 
2850                                                                 tcp_con_lifetime, t);
2851                         tcpconn->flags|=F_CONN_MAIN_TIMER;
2852                         if (unlikely(cmd==CONN_NEW_COMPLETE)){
2853                                 /* check if needs to be watched for write */
2854                                 lock_get(&tcpconn->write_lock);
2855                                         /* if queue non empty watch it for write */
2856                                         flags=(_wbufq_empty(tcpconn)-1)&POLLOUT;
2857                                 lock_release(&tcpconn->write_lock);
2858                                 tcpconn->flags|=(!(flags&POLLOUT)-1)&F_CONN_WRITE_W;
2859                         }else{
2860                                 /* CONN_NEW_PENDING_WRITE */
2861                                 /* no need to check, we have something queued for write */
2862                                 flags=POLLOUT;
2863                                 tcpconn->flags|=F_CONN_WRITE_W;
2864                         }
2865                         flags|=POLLIN;
2866                         if (unlikely(
2867                                         io_watch_add(&io_h, tcpconn->s, flags,
2868                                                                                                 F_TCPCONN, tcpconn)<0)){
2869                                 LOG(L_CRIT, "ERROR: tcp_main: handle_ser_child: failed to add"
2870                                                 " new socket to the fd list\n");
2871                                 tcpconn->flags|=F_CONN_REMOVED;
2872                                 tcpconn->flags&=~F_CONN_WRITE_W;
2873                                 tcpconn_try_unhash(tcpconn); /*  unhash & dec refcnt */
2874                                 tcpconn_put_destroy(tcpconn);
2875                         }
2876                         break;
2877 #endif /* TCP_CONNECT_WAIT */
2878 #endif /* TCP_BUF_WRITE */
2879                 default:
2880                         LOG(L_CRIT, "BUG: handle_ser_child: unknown cmd %d\n", cmd);
2881         }
2882 end:
2883         return ret;
2884 error:
2885         return -1;
2886 }
2887
2888
2889
2890 /* sends a tcpconn + fd to a chosen child */
2891 inline static int send2child(struct tcp_connection* tcpconn)
2892 {
2893         int i;
2894         int min_busy;
2895         int idx;
2896         static int crt=0; /* current child */
2897         int last;
2898         
2899         min_busy=tcp_children[0].busy;
2900         idx=0;
2901         last=crt+tcp_children_no;
2902         for (; crt<last; crt++){
2903                 i=crt%tcp_children_no;
2904                 if (!tcp_children[i].busy){
2905                         idx=i;
2906                         min_busy=0;
2907                         break;
2908                 }else if (min_busy>tcp_children[i].busy){
2909                         min_busy=tcp_children[i].busy;
2910                         idx=i;
2911                 }
2912         }
2913         crt=idx+1; /* next time we start with crt%tcp_children_no */
2914         
2915         tcp_children[idx].busy++;
2916         tcp_children[idx].n_reqs++;
2917         if (unlikely(min_busy)){
2918                 DBG("WARNING: send2child: no free tcp receiver, "
2919                                 " connection passed to the least busy one (%d)\n",
2920                                 min_busy);
2921         }
2922         DBG("send2child: to tcp child %d %d(%d), %p\n", idx, 
2923                                         tcp_children[idx].proc_no,
2924                                         tcp_children[idx].pid, tcpconn);
2925         /* first make sure this child doesn't have pending&nbs