ac76dc5abd20d1faef756a7a677f2941f37b0ec9
[sip-router] / tcp_main.c
1 /*
2  * $Id$
3  *
4  * Copyright (C) 2001-2003 FhG Fokus
5  *
6  * This file is part of ser, a free SIP server.
7  *
8  * ser is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version
12  *
13  * For a license to use the ser software under conditions
14  * other than those described here, or to purchase support for this
15  * software, please contact iptel.org by e-mail at the following addresses:
16  *    info@iptel.org
17  *
18  * ser is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program; if not, write to the Free Software
25  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
26  */
27 /*
28  * History:
29  * --------
30  *  2002-11-29  created by andrei
31  *  2002-12-11  added tcp_send (andrei)
32  *  2003-01-20  locking fixes, hashtables (andrei)
33  *  2003-02-20  s/lock_t/gen_lock_t/ to avoid a conflict on solaris (andrei)
34  *  2003-02-25  Nagle is disabled if -DDISABLE_NAGLE (andrei)
35  *  2003-03-29  SO_REUSEADDR before calling bind to allow
36  *              server restart, Nagle set on the (hopefully)
37  *              correct socket (jiri)
38  *  2003-03-31  always try to find the corresponding tcp listen socket for
39  *               a temp. socket and store it in *->bind_address: added
40  *               find_tcp_si, modified tcpconn_connect (andrei)
41  *  2003-04-14  set sockopts to TOS low delay (andrei)
42  *  2003-06-30  moved tcp new connect checking & handling to
43  *               handle_new_connect (andrei)
44  *  2003-07-09  tls_close called before closing the tcp connection (andrei)
45  *  2003-10-24  converted to the new socket_info lists (andrei)
46  *  2003-10-27  tcp port aliases support added (andrei)
47  *  2003-11-04  always lock before manipulating refcnt; sendchild
48  *              does not inc refcnt by itself anymore (andrei)
49  *  2003-11-07  different unix sockets are used for fd passing
50  *              to/from readers/writers (andrei)
51  *  2003-11-17  handle_new_connect & tcp_connect will close the 
52  *              new socket if tcpconn_new return 0 (e.g. out of mem) (andrei)
53  *  2003-11-28  tcp_blocking_write & tcp_blocking_connect added (andrei)
54  *  2004-11-08  dropped find_tcp_si and replaced with find_si (andrei)
55  *  2005-06-07  new tcp optimized code, supports epoll (LT), sigio + real time
56  *               signals, poll & select (andrei)
57  *  2005-06-26  *bsd kqueue support (andrei)
58  *  2005-07-04  solaris /dev/poll support (andrei)
59  *  2005-07-08  tcp_max_connections, tcp_connection_lifetime, don't accept
60  *               more connections if tcp_max_connections is exceeded (andrei)
61  *  2005-10-21  cleanup all the open connections on exit
62  *              decrement the no. of open connections on timeout too    (andrei)
 *  2006-01-30  queue send_fd request and execute them at the end of the
63  *              poll loop  (#ifdef) (andrei)
64  *              process all children requests, before attempting to send
65  *              them new stuff (fixes some deadlocks) (andrei)
66  *  2006-02-03  timers are run only once per s (andrei)
67  *              tcp children fds can be non-blocking; send fds are queued on
68  *              EAGAIN; lots of bug fixes (andrei)
69  *  2006-02-06  better tcp_max_connections checks, tcp_connections_no moved to
70  *              shm (andrei)
71  *  2006-04-12  tcp_send() changed to use struct dest_info (andrei)
72  *  2006-11-02  switched to atomic ops for refcnt, locking improvements 
73  *               (andrei)
74  *  2006-11-04  switched to raw ticks (to fix conversion errors which could
75  *               result in inf. lifetime) (andrei)
76  *  2007-07-25  tcpconn_connect can now bind the socket on a specified
77  *                source addr/port (andrei)
78  *  2007-07-26   tcp_send() and tcpconn_get() can now use a specified source
79  *                addr./port (andrei)
80  *  2007-08-23   getsockname() for INADDR_ANY(SI_IS_ANY) sockets (andrei)
81  *  2007-08-27   split init_sock_opt into a lightweight init_sock_opt_accept() 
82  *               used when accepting connections and init_sock_opt used for 
83  *               connect/ new sockets (andrei)
84  *  2007-11-22  always add the connection & clear the corresponding flags before
85  *               io_watch_add-ing its fd - it's safer this way (andrei)
86  *  2007-11-26  improved tcp timers: switched to local_timer (andrei)
87  *  2007-11-27  added send fd cache and reader fd reuse (andrei)
88  *  2007-11-28  added support for TCP_DEFER_ACCEPT, KEEPALIVE, KEEPINTVL,
89  *               KEEPCNT, QUICKACK, SYNCNT, LINGER2 (andrei)
90  *  2007-12-04  support for queueing write requests (andrei)
91  *  2007-12-12  destroy connection asap on wbuf. timeout (andrei)
92  *  2007-12-13  changed the refcnt and destroy scheme, now refcnt is 1 if
93  *                linked into the hash tables (was 0) (andrei)
94  *  2007-12-21  support for pending connects (connections are added to the
95  *               hash immediately and writes on them are buffered) (andrei)
96  *  2008-02-05  handle POLLRDHUP (if supported), POLLERR and
97  *               POLLHUP (andrei)
98  *              on write error check if there's still data in the socket 
99  *               read buffer and process it first (andrei)
100  *  2009-02-26  direct blacklist support (andrei)
101  *  2009-03-20  s/wq_timeout/send_timeout ; send_timeout is now in ticks
102  *              (andrei)
103  *  2009-04-09  tcp ev and tcp stats macros added (andrei)
104  *  2009-09-15  support for force connection reuse and close after send
105  *               send flags (andrei)
106  *  2010-03-23  tcp_send() split in 3 smaller functions (andrei)
107  */
108
109 /** tcp main/dispatcher and tcp send functions.
110  * @file tcp_main.c
111  * @ingroup core
112  * Module: @ref core
113  */
114
115
116 #ifdef USE_TCP
117
118
119 #ifndef SHM_MEM
120 #error "shared memory support needed (add -DSHM_MEM to Makefile.defs)"
121 #endif
122
123 #define HANDLE_IO_INLINE
124 #include "io_wait.h" /* include first to make sure the needed features are
125                                                 turned on (e.g. _GNU_SOURCE for POLLRDHUP) */
126
127 #include <sys/time.h>
128 #include <sys/types.h>
129 #include <sys/select.h>
130 #include <sys/socket.h>
131 #ifdef HAVE_FILIO_H
132 #include <sys/filio.h> /* needed on solaris 2.x for FIONREAD */
133 #elif defined __OS_solaris
134 #define BSD_COMP  /* needed on older solaris for FIONREAD */
135 #endif /* HAVE_FILIO_H / __OS_solaris */
136 #include <sys/ioctl.h>  /* ioctl() used on write error */
137 #include <netinet/in.h>
138 #include <netinet/in_systm.h>
139 #include <netinet/ip.h>
140 #include <netinet/tcp.h>
141 #include <sys/uio.h>  /* writev*/
142 #include <netdb.h>
143 #include <stdlib.h> /*exit() */
144
145 #include <unistd.h>
146
147 #include <errno.h>
148 #include <string.h>
149
150 #ifdef HAVE_SELECT
151 #include <sys/select.h>
152 #endif
153 #include <sys/poll.h>
154
155
156 #include "ip_addr.h"
157 #include "pass_fd.h"
158 #include "tcp_conn.h"
159 #include "globals.h"
160 #include "pt.h"
161 #include "locking.h"
162 #include "mem/mem.h"
163 #include "mem/shm_mem.h"
164 #include "timer.h"
165 #include "sr_module.h"
166 #include "tcp_server.h"
167 #include "tcp_init.h"
168 #include "tcp_int_send.h"
169 #include "tcp_stats.h"
170 #include "tcp_ev.h"
171 #include "tsend.h"
172 #include "timer_ticks.h"
173 #include "local_timer.h"
174 #ifdef CORE_TLS
175 #include "tls/tls_server.h"
176 #define tls_loaded() 1
177 #else
178 #include "tls_hooks_init.h"
179 #include "tls_hooks.h"
180 #endif /* CORE_TLS*/
181 #ifdef USE_DST_BLACKLIST
182 #include "dst_blacklist.h"
183 #endif /* USE_DST_BLACKLIST */
184
185 #include "tcp_info.h"
186 #include "tcp_options.h"
187 #include "ut.h"
188 #include "cfg/cfg_struct.h"
189
190 #define local_malloc pkg_malloc
191 #define local_free   pkg_free
192
193 #include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
194
195
196 #ifdef NO_MSG_DONTWAIT
197 #ifndef MSG_DONTWAIT
198 /* should work inside tcp_main */
199 #define MSG_DONTWAIT 0
200 #endif
201 #endif /*NO_MSG_DONTWAIT */
202
203
204 #define TCP_PASS_NEW_CONNECTION_ON_DATA /* don't pass a new connection
205                                                                                    immediately to a child, wait for
206                                                                                    some data on it first */
207 #define TCP_LISTEN_BACKLOG 1024
208 #define SEND_FD_QUEUE /* queue send fd requests on EAGAIN, instead of sending 
209                                                         them immediately */
210 #define TCP_CHILD_NON_BLOCKING 
211 #ifdef SEND_FD_QUEUE
212 #ifndef TCP_CHILD_NON_BLOCKING
213 #define TCP_CHILD_NON_BLOCKING
214 #endif
215 #define MAX_SEND_FD_QUEUE_SIZE  tcp_main_max_fd_no
216 #define SEND_FD_QUEUE_SIZE              128  /* initial size */
217 #define SEND_FD_QUEUE_TIMEOUT   MS_TO_TICKS(2000)  /* 2 s */
218 #endif
219
220 /* minimum interval local_timer_run() is allowed to run, in ticks */
221 #define TCPCONN_TIMEOUT_MIN_RUN 1  /* once per tick */
222 #define TCPCONN_WAIT_TIMEOUT 1 /* 1 tick */
223
224 #ifdef TCP_ASYNC
225 static unsigned int* tcp_total_wq=0;
226 #endif
227
228
/* Kind tag for a file descriptor handled by this file.
 * NOTE(review): presumably the type registered with the io_wait layer for
 * each watched fd -- confirm against the io_watch_add() callers. */
229 enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
230                                 F_TCPCONN, F_TCPCHILD, F_PROC };
231
232
233 #ifdef TCP_FD_CACHE
234
/* number of slots in fd_cache[] */
235 #define TCP_FD_CACHE_SIZE 8
236
/* one slot of the fd cache.
 * NOTE(review): presumably associates a connection (pointer + id) with a
 * descriptor for it -- confirm against the cache lookup/insert code. */
237 struct fd_cache_entry{
238         struct tcp_connection* con;
239         int id;
240         int fd;
241 };
242
243
/* the fd cache itself, fixed size, private to this file */
244 static struct fd_cache_entry fd_cache[TCP_FD_CACHE_SIZE];
245 #endif /* TCP_FD_CACHE */
246
/* NOTE(review): presumably non-zero only in the tcp main process -- confirm
 * where it is set. */
247 static int is_tcp_main=0;
248
249
250 enum poll_types tcp_poll_method=0; /* by default choose the best method */
/* MAX_SEND_FD_QUEUE_SIZE expands to this variable, so it also bounds the
 * send fd queue when SEND_FD_QUEUE is defined */
251 int tcp_main_max_fd_no=0;
/* starts at the compile-time default DEFAULT_TCP_MAX_CONNECTIONS */
252 int tcp_max_connections=DEFAULT_TCP_MAX_CONNECTIONS;
253
/* default source addresses: the pointers stay 0 until tcp_set_src_addr() is
 * called for the corresponding address family, after that they point to the
 * *_addr storage next to them */
254 static union sockaddr_union tcp_source_ipv4_addr; /* saved bind/src v4 addr. */
255 static union sockaddr_union* tcp_source_ipv4=0;
256 #ifdef USE_IPV6
257 static union sockaddr_union tcp_source_ipv6_addr; /* saved bind/src v6 addr. */
258 static union sockaddr_union* tcp_source_ipv6=0;
259 #endif
260
261 static int* tcp_connections_no=0; /* current open connections */
262
263 /* connection hash table (after ip&port) , includes also aliases */
264 struct tcp_conn_alias** tcpconn_aliases_hash=0;
265 /* connection hash table (after connection id) */
266 struct tcp_connection** tcpconn_id_hash=0;
/* NOTE(review): presumably the lock protecting the two hash tables above --
 * confirm against the code that takes it. */
267 gen_lock_t* tcpconn_lock=0;
268
269 struct tcp_child* tcp_children;
270 static int* connection_id=0; /*  unique for each connection, used for 
271                                                                 quickly finding the corresponding connection
272                                                                 for a reply */
273 int unix_tcp_sock;
274
/* -1 means "not known": init_sock_opt() skips TCP_NODELAY in that case */
275 static int tcp_proto_no=-1; /* tcp protocol number as returned by
276                                                            getprotobyname */
277
/* NOTE(review): presumably the io_wait handle used by the tcp main poll
 * loop -- confirm. */
278 static io_wait_h io_h;
279
/* NOTE(review): presumably the local timer driving tcpconn_main_timeout()
 * and the ticks value of its previous run -- confirm. */
280 static struct local_timer tcp_main_ltimer;
281 static ticks_t tcp_main_prev_ticks;
282
283
284 static ticks_t tcpconn_main_timeout(ticks_t , struct timer_ln* , void* );
285
286 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
287                                                                                 struct ip_addr* l_ip, int l_port,
288                                                                                 int flags);
289
290
291
292 /* sets source address used when opening new sockets and no source is specified
293  *  (by default the address is choosen by the kernel)
294  * Should be used only on init.
295  * returns -1 on error */
296 int tcp_set_src_addr(struct ip_addr* ip)
297 {
298         switch (ip->af){
299                 case AF_INET:
300                         ip_addr2su(&tcp_source_ipv4_addr, ip, 0);
301                         tcp_source_ipv4=&tcp_source_ipv4_addr;
302                         break;
303                 #ifdef USE_IPV6
304                 case AF_INET6:
305                         ip_addr2su(&tcp_source_ipv6_addr, ip, 0);
306                         tcp_source_ipv6=&tcp_source_ipv6_addr;
307                         break;
308                 #endif
309                 default:
310                         return -1;
311         }
312         return 0;
313 }
314
315
316
/* Apply the configured tcp keepalive settings to socket s.
 * Each option is compiled in only if the matching HAVE_* macro is defined
 *  and is set only if its config value is non-zero.
 * returns -1 if SO_KEEPALIVE was requested but could not be enabled;
 *  failures to set the probe interval, idle time or probe count are only
 *  logged. Returns 0 in all the other cases. */
static inline int init_sock_keepalive(int s)
{
	int val;
	
#ifdef HAVE_SO_KEEPALIVE
	if (cfg_get(tcp, tcp_cfg, keepalive)){
		val=1;
		if (setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val))<0){
			LOG(L_WARN, "WARNING: init_sock_keepalive: failed to enable"
						" SO_KEEPALIVE: %s\n", strerror(errno));
			return -1;
		}
	}
#endif
#ifdef HAVE_TCP_KEEPINTVL
	val=cfg_get(tcp, tcp_cfg, keepintvl);
	if (val && (setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &val,
							sizeof(val))<0)){
		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
					" keepalive probes interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPIDLE
	val=cfg_get(tcp, tcp_cfg, keepidle);
	if (val && (setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &val,
							sizeof(val))<0)){
		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
					" keepalive idle interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPCNT
	val=cfg_get(tcp, tcp_cfg, keepcnt);
	if (val && (setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &val,
							sizeof(val))<0)){
		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
					" maximum keepalive count: %s\n", strerror(errno));
	}
#endif
	return 0;
}
361
362
363
364 /* set all socket/fd options for new sockets (e.g. before connect): 
365  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
366  *
367  * return -1 on error */
368 static int init_sock_opt(int s)
369 {
370         int flags;
371         int optval;
372         
373 #ifdef DISABLE_NAGLE
374         flags=1;
375         if ( (tcp_proto_no!=-1) && (setsockopt(s, tcp_proto_no , TCP_NODELAY,
376                                         &flags, sizeof(flags))<0) ){
377                 LOG(L_WARN, "WARNING: init_sock_opt: could not disable Nagle: %s\n",
378                                 strerror(errno));
379         }
380 #endif
381         /* tos*/
382         optval = tos;
383         if (setsockopt(s, IPPROTO_IP, IP_TOS, (void*)&optval,sizeof(optval)) ==-1){
384                 LOG(L_WARN, "WARNING: init_sock_opt: setsockopt tos: %s\n",
385                                 strerror(errno));
386                 /* continue since this is not critical */
387         }
388 #if  !defined(TCP_DONT_REUSEADDR) 
389         optval=1;
390         if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
391                                                 (void*)&optval, sizeof(optval))==-1){
392                 LOG(L_ERR, "ERROR: setsockopt SO_REUSEADDR %s\n",
393                                 strerror(errno));
394                 /* continue, not critical */
395         }
396 #endif /* !TCP_DONT_REUSEADDR */
397 #ifdef HAVE_TCP_SYNCNT
398         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
399                 if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
400                                                 sizeof(optval))<0){
401                         LOG(L_WARN, "WARNING: init_sock_opt: failed to set"
402                                                 " maximum SYN retr. count: %s\n", strerror(errno));
403                 }
404         }
405 #endif
406 #ifdef HAVE_TCP_LINGER2
407         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
408                 if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
409                                                 sizeof(optval))<0){
410                         LOG(L_WARN, "WARNING: init_sock_opt: failed to set"
411                                                 " maximum LINGER2 timeout: %s\n", strerror(errno));
412                 }
413         }
414 #endif
415 #ifdef HAVE_TCP_QUICKACK
416         if (cfg_get(tcp, tcp_cfg, delayed_ack)){
417                 optval=0; /* reset quick ack => delayed ack */
418                 if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
419                                                 sizeof(optval))<0){
420                         LOG(L_WARN, "WARNING: init_sock_opt: failed to reset"
421                                                 " TCP_QUICKACK: %s\n", strerror(errno));
422                 }
423         }
424 #endif /* HAVE_TCP_QUICKACK */
425         init_sock_keepalive(s);
426         
427         /* non-blocking */
428         flags=fcntl(s, F_GETFL);
429         if (flags==-1){
430                 LOG(L_ERR, "ERROR: init_sock_opt: fnctl failed: (%d) %s\n",
431                                 errno, strerror(errno));
432                 goto error;
433         }
434         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
435                 LOG(L_ERR, "ERROR: init_sock_opt: fcntl: set non-blocking failed:"
436                                 " (%d) %s\n", errno, strerror(errno));
437                 goto error;
438         }
439         return 0;
440 error:
441         return -1;
442 }
443
444
445
446 /* set all socket/fd options for "accepted" sockets 
447  *  only nonblocking is set since the rest is inherited from the
448  *  "parent" (listening) socket
449  *  Note: setting O_NONBLOCK is required on linux but it's not needed on
450  *        BSD and possibly solaris (where the flag is inherited from the 
451  *        parent socket). However since there is no standard document 
452  *        requiring a specific behaviour in this case it's safer to always set
453  *        it (at least for now)  --andrei
454  *  TODO: check on which OSes  O_NONBLOCK is inherited and make this 
455  *        function a nop.
456  *
457  * return -1 on error */
458 static int init_sock_opt_accept(int s)
459 {
460         int flags;
461         
462         /* non-blocking */
463         flags=fcntl(s, F_GETFL);
464         if (flags==-1){
465                 LOG(L_ERR, "ERROR: init_sock_opt_accept: fnctl failed: (%d) %s\n",
466                                 errno, strerror(errno));
467                 goto error;
468         }
469         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
470                 LOG(L_ERR, "ERROR: init_sock_opt_accept: "
471                                         "fcntl: set non-blocking failed: (%d) %s\n",
472                                         errno, strerror(errno));
473                 goto error;
474         }
475         return 0;
476 error:
477         return -1;
478 }
479
480
481
/** close a socket, handling errno.
 * Expected socket level errors are filtered: success is returned if close()
 * failed because of EPIPE, ENOTCONN, ECONNRESET, ECONNREFUSED, ENETUNREACH
 * or EHOSTUNREACH. Note that this happens on *BSDs (on linux close()
 * does not fail for socket level errors).
 * EINTR handling: on linux the descriptor is released even when close()
 * returns EINTR, so the call must not be repeated there (a retry would fail
 * with EBADF or, worse, close an unrelated descriptor that has reused the
 * same number); in this case EINTR is reported as success. On the other
 * systems close() is repeated until it is no longer interrupted.
 * @param s - open valid socket.
 * @return - 0 on success, < 0 on error (whatever close() returns). On error
 *           errno is set.
 */
static int tcp_safe_close(int s)
{
	int ret;

	for (;;) {
		ret = close(s);
		if (ret >= 0)
			return ret;
		switch(errno) {
			case EINTR:
#ifdef __linux__
				/* the fd is already gone, retrying would be wrong */
				return 0;
#else
				continue; /* interrupted before closing => retry */
#endif
			case EPIPE:
			case ENOTCONN:
			case ECONNRESET:
			case ECONNREFUSED:
			case ENETUNREACH:
			case EHOSTUNREACH:
				/* on *BSD we really get these errors at close() time
				   => ignore them */
				return 0;
			default:
				/* real error: pass close()'s result on, errno is kept */
				return ret;
		}
	}
}
515
516
517
518 /* blocking connect on a non-blocking fd; it will timeout after
519  * tcp_connect_timeout 
520  * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
521  * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
522  */
523 static int tcp_blocking_connect(int fd, int type, snd_flags_t* send_flags,
524                                                                 const struct sockaddr *servaddr,
525                                                                 socklen_t addrlen)
526 {
527         int n;
528 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
529         fd_set sel_set;
530         fd_set orig_set;
531         struct timeval timeout;
532 #else
533         struct pollfd pf;
534 #endif
535         int elapsed;
536         int to;
537         int ticks;
538         int err;
539         unsigned int err_len;
540         int poll_err;
541         
542         poll_err=0;
543         to=cfg_get(tcp, tcp_cfg, connect_timeout_s);
544         ticks=get_ticks();
545 again:
546         n=connect(fd, servaddr, addrlen);
547         if (n==-1){
548                 if (errno==EINTR){
549                         elapsed=(get_ticks()-ticks)*TIMER_TICK;
550                         if (elapsed<to)         goto again;
551                         else goto error_timeout;
552                 }
553                 if (errno!=EINPROGRESS && errno!=EALREADY){
554                         goto error_errno;
555                 }
556         }else goto end;
557         
558         /* poll/select loop */
559 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
560                 FD_ZERO(&orig_set);
561                 FD_SET(fd, &orig_set);
562 #else
563                 pf.fd=fd;
564                 pf.events=POLLOUT;
565 #endif
566         while(1){
567                 elapsed=(get_ticks()-ticks)*TIMER_TICK;
568                 if (elapsed>=to)
569                         goto error_timeout;
570 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
571                 sel_set=orig_set;
572                 timeout.tv_sec=to-elapsed;
573                 timeout.tv_usec=0;
574                 n=select(fd+1, 0, &sel_set, 0, &timeout);
575 #else
576                 n=poll(&pf, 1, (to-elapsed)*1000);
577 #endif
578                 if (n<0){
579                         if (errno==EINTR) continue;
580                         LOG(L_ERR, "ERROR: tcp_blocking_connect %s: poll/select failed:"
581                                         " (%d) %s\n",
582                                         su2a((union sockaddr_union*)servaddr, addrlen),
583                                         errno, strerror(errno));
584                         goto error;
585                 }else if (n==0) /* timeout */ continue;
586 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
587                 if (FD_ISSET(fd, &sel_set))
588 #else
589                 if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
590                         LOG(L_ERR, "ERROR: tcp_blocking_connect %s: poll error: "
591                                         "flags %x\n",
592                                         su2a((union sockaddr_union*)servaddr, addrlen),
593                                         pf.revents);
594                         poll_err=1;
595                 }
596 #endif
597                 {
598                         err_len=sizeof(err);
599                         getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
600                         if ((err==0) && (poll_err==0)) goto end;
601                         if (err!=EINPROGRESS && err!=EALREADY){
602                                 LOG(L_ERR, "ERROR: tcp_blocking_connect %s: SO_ERROR (%d) "
603                                                 "%s\n",
604                                                 su2a((union sockaddr_union*)servaddr, addrlen),
605                                                 err, strerror(err));
606                                 errno=err;
607                                 goto error_errno;
608                         }
609                 }
610         }
611 error_errno:
612         switch(errno){
613                 case ENETUNREACH:
614                 case EHOSTUNREACH:
615 #ifdef USE_DST_BLACKLIST
616                         dst_blacklist_su(BLST_ERR_CONNECT, type,
617                                                          (union sockaddr_union*)servaddr, send_flags, 0);
618 #endif /* USE_DST_BLACKLIST */
619                         TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0,
620                                                         (union sockaddr_union*)servaddr, type);
621                         break;
622                 case ETIMEDOUT:
623 #ifdef USE_DST_BLACKLIST
624                         dst_blacklist_su(BLST_ERR_CONNECT, type,
625                                                          (union sockaddr_union*)servaddr, send_flags, 0);
626 #endif /* USE_DST_BLACKLIST */
627                         TCP_EV_CONNECT_TIMEOUT(errno, 0, 0,
628                                                         (union sockaddr_union*)servaddr, type);
629                         break;
630                 case ECONNREFUSED:
631                 case ECONNRESET:
632 #ifdef USE_DST_BLACKLIST
633                         dst_blacklist_su(BLST_ERR_CONNECT, type,
634                                                          (union sockaddr_union*)servaddr, send_flags, 0);
635 #endif /* USE_DST_BLACKLIST */
636                         TCP_EV_CONNECT_RST(errno, 0, 0,
637                                                         (union sockaddr_union*)servaddr, type);
638                         break;
639                 case EAGAIN: /* not posix, but supported on linux and bsd */
640                         TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0,
641                                                         (union sockaddr_union*)servaddr, type);
642                         break;
643                 default:
644                         TCP_EV_CONNECT_ERR(errno, 0, 0,
645                                                                 (union sockaddr_union*)servaddr, type);
646         }
647         LOG(L_ERR, "ERROR: tcp_blocking_connect %s: (%d) %s\n",
648                         su2a((union sockaddr_union*)servaddr, addrlen),
649                         errno, strerror(errno));
650         goto error;
651 error_timeout:
652         /* timeout */
653 #ifdef USE_DST_BLACKLIST
654         dst_blacklist_su(BLST_ERR_CONNECT, type,
655                                                 (union sockaddr_union*)servaddr, send_flags, 0);
656 #endif /* USE_DST_BLACKLIST */
657         TCP_EV_CONNECT_TIMEOUT(0, 0, 0, (union sockaddr_union*)servaddr, type);
658         LOG(L_ERR, "ERROR: tcp_blocking_connect %s: timeout %d s elapsed "
659                                 "from %d s\n", su2a((union sockaddr_union*)servaddr, addrlen),
660                                 elapsed, cfg_get(tcp, tcp_cfg, connect_timeout_s));
661 error:
662         TCP_STATS_CONNECT_FAILED();
663         return -1;
664 end:
665         return 0;
666 }
667
668
669
670 #ifdef TCP_ASYNC
671
672
673 /* unsafe version */
674 #define _wbufq_empty(con) ((con)->wbuf_q.first==0)
675 /* unsafe version */
676 #define _wbufq_non_empty(con) ((con)->wbuf_q.first!=0)
677
678
679 /* unsafe version, call while holding the connection write lock */
680 inline static int _wbufq_add(struct  tcp_connection* c, const char* data, 
681                                                         unsigned int size)
682 {
683         struct tcp_wbuffer_queue* q;
684         struct tcp_wbuffer* wb;
685         unsigned int last_free;
686         unsigned int wb_size;
687         unsigned int crt_size;
688         ticks_t t;
689         
690         q=&c->wbuf_q;
691         t=get_ticks_raw();
692         if (unlikely(   ((q->queued+size)>cfg_get(tcp, tcp_cfg, tcpconn_wq_max)) ||
693                                         ((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max)) ||
694                                         (q->first &&
695                                         TICKS_LT(q->wr_timeout, t)) )){
696                 LOG(L_ERR, "ERROR: wbufq_add(%d bytes): write queue full or timeout "
697                                         " (%d, total %d, last write %d s ago)\n",
698                                         size, q->queued, *tcp_total_wq,
699                                         TICKS_TO_S(t-(q->wr_timeout-
700                                                                 cfg_get(tcp, tcp_cfg, send_timeout))));
701                 if (q->first && TICKS_LT(q->wr_timeout, t)){
702                         if (unlikely(c->state==S_CONN_CONNECT)){
703 #ifdef USE_DST_BLACKLIST
704                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
705                                                                                 &c->rcv.src_su, &c->send_flags, 0);
706 #endif /* USE_DST_BLACKLIST */
707                                 TCP_EV_CONNECT_TIMEOUT(0, TCP_LADDR(c), TCP_LPORT(c),
708                                                                                         TCP_PSU(c), TCP_PROTO(c));
709                                 TCP_STATS_CONNECT_FAILED();
710                         }else{
711 #ifdef USE_DST_BLACKLIST
712                                 dst_blacklist_su( BLST_ERR_SEND, c->rcv.proto,
713                                                                         &c->rcv.src_su, &c->send_flags, 0);
714 #endif /* USE_DST_BLACKLIST */
715                                 TCP_EV_SEND_TIMEOUT(0, &c->rcv);
716                                 TCP_STATS_SEND_TIMEOUT();
717                         }
718                 }else{
719                         /* if it's not a timeout => queue full */
720                         TCP_EV_SENDQ_FULL(0, &c->rcv);
721                         TCP_STATS_SENDQ_FULL();
722                 }
723                 goto error;
724         }
725         
726         if (unlikely(q->last==0)){
727                 wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
728                 wb=shm_malloc(sizeof(*wb)+wb_size-1);
729                 if (unlikely(wb==0))
730                         goto error;
731                 wb->b_size=wb_size;
732                 wb->next=0;
733                 q->last=wb;
734                 q->first=wb;
735                 q->last_used=0;
736                 q->offset=0;
737                 q->wr_timeout=get_ticks_raw()+
738                         ((c->state==S_CONN_CONNECT)?
739                                         S_TO_TICKS(cfg_get(tcp, tcp_cfg, connect_timeout_s)):
740                                         cfg_get(tcp, tcp_cfg, send_timeout));
741         }else{
742                 wb=q->last;
743         }
744         
745         while(size){
746                 last_free=wb->b_size-q->last_used;
747                 if (last_free==0){
748                         wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
749                         wb=shm_malloc(sizeof(*wb)+wb_size-1);
750                         if (unlikely(wb==0))
751                                 goto error;
752                         wb->b_size=wb_size;
753                         wb->next=0;
754                         q->last->next=wb;
755                         q->last=wb;
756                         q->last_used=0;
757                         last_free=wb->b_size;
758                 }
759                 crt_size=MIN_unsigned(last_free, size);
760                 memcpy(wb->buf+q->last_used, data, crt_size);
761                 q->last_used+=crt_size;
762                 size-=crt_size;
763                 data+=crt_size;
764                 q->queued+=crt_size;
765                 atomic_add_int((int*)tcp_total_wq, crt_size);
766         }
767         return 0;
768 error:
769         return -1;
770 }
771
772
773
774 /* unsafe version, call while holding the connection write lock
775  * inserts data at the beginning, it ignores the max queue size checks and
776  * the timeout (use sparingly)
777  * Note: it should never be called on a write buffer after wbufq_run() */
778 inline static int _wbufq_insert(struct  tcp_connection* c, const char* data, 
779                                                         unsigned int size)
780 {
781         struct tcp_wbuffer_queue* q;
782         struct tcp_wbuffer* wb;
783         
784         q=&c->wbuf_q;
785         if (likely(q->first==0)) /* if empty, use wbufq_add */
786                 return _wbufq_add(c, data, size);
787         
788         if (unlikely((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max))){
789                 LOG(L_ERR, "ERROR: wbufq_insert(%d bytes): write queue full"
790                                         " (%d, total %d, last write %d s ago)\n",
791                                         size, q->queued, *tcp_total_wq,
792                                         TICKS_TO_S(get_ticks_raw()-q->wr_timeout-
793                                                                         cfg_get(tcp, tcp_cfg, send_timeout)));
794                 goto error;
795         }
796         if (unlikely(q->offset)){
797                 LOG(L_CRIT, "BUG: wbufq_insert: non-null offset %d (bad call, should"
798                                 "never be called after the wbufq_run())\n", q->offset);
799                 goto error;
800         }
801         if ((q->first==q->last) && ((q->last->b_size-q->last_used)>=size)){
802                 /* one block with enough space in it for size bytes */
803                 memmove(q->first->buf+size, q->first->buf, size);
804                 memcpy(q->first->buf, data, size);
805                 q->last_used+=size;
806         }else{
807                 /* create a size bytes block directly */
808                 wb=shm_malloc(sizeof(*wb)+size-1);
809                 if (unlikely(wb==0))
810                         goto error;
811                 wb->b_size=size;
812                 /* insert it */
813                 wb->next=q->first;
814                 q->first=wb;
815                 memcpy(wb->buf, data, size);
816         }
817         
818         q->queued+=size;
819         atomic_add_int((int*)tcp_total_wq, size);
820         return 0;
821 error:
822         return -1;
823 }
824
825
826
827 /* unsafe version, call while holding the connection write lock */
828 inline static void _wbufq_destroy( struct  tcp_wbuffer_queue* q)
829 {
830         struct tcp_wbuffer* wb;
831         struct tcp_wbuffer* next_wb;
832         int unqueued;
833         
834         unqueued=0;
835         if (likely(q->first)){
836                 wb=q->first;
837                 do{
838                         next_wb=wb->next;
839                         unqueued+=(wb==q->last)?q->last_used:wb->b_size;
840                         if (wb==q->first)
841                                 unqueued-=q->offset;
842                         shm_free(wb);
843                         wb=next_wb;
844                 }while(wb);
845         }
846         memset(q, 0, sizeof(*q));
847         atomic_add_int((int*)tcp_total_wq, -unqueued);
848 }
849
850
851
852 /* tries to empty the queue  (safe version, c->write_lock must not be hold)
853  * returns -1 on error, bytes written on success (>=0) 
854  * if the whole queue is emptied => sets *empty*/
855 inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
856 {
857         struct tcp_wbuffer_queue* q;
858         struct tcp_wbuffer* wb;
859         int n;
860         int ret;
861         int block_size;
862         char* buf;
863         
864         *empty=0;
865         ret=0;
866         lock_get(&c->write_lock);
867         q=&c->wbuf_q;
868         while(q->first){
869                 block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
870                                                 q->offset;
871                 buf=q->first->buf+q->offset;
872                 n=_tcpconn_write_nb(fd, c, buf, block_size);
873                 if (likely(n>0)){
874                         ret+=n;
875                         if (likely(n==block_size)){
876                                 wb=q->first;
877                                 q->first=q->first->next; 
878                                 shm_free(wb);
879                                 q->offset=0;
880                                 q->queued-=block_size;
881                                 atomic_add_int((int*)tcp_total_wq, -block_size);
882                         }else{
883                                 q->offset+=n;
884                                 q->queued-=n;
885                                 atomic_add_int((int*)tcp_total_wq, -n);
886                                 break;
887                         }
888                 }else{
889                         if (n<0){
890                                 /* EINTR is handled inside _tcpconn_write_nb */
891                                 if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
892                                         if (unlikely(c->state==S_CONN_CONNECT)){
893                                                 switch(errno){
894                                                         case ENETUNREACH:
895                                                         case EHOSTUNREACH: /* not posix for send() */
896 #ifdef USE_DST_BLACKLIST
897                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
898                                                                                                         c->rcv.proto,
899                                                                                                         &c->rcv.src_su,
900                                                                                                         &c->send_flags, 0);
901 #endif /* USE_DST_BLACKLIST */
902                                                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
903                                                                                                         TCP_LPORT(c), TCP_PSU(c),
904                                                                                                         TCP_PROTO(c));
905                                                                 break;
906                                                         case ECONNREFUSED:
907                                                         case ECONNRESET:
908 #ifdef USE_DST_BLACKLIST
909                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
910                                                                                                         c->rcv.proto,
911                                                                                                         &c->rcv.src_su,
912                                                                                                         &c->send_flags, 0);
913 #endif /* USE_DST_BLACKLIST */
914                                                                 TCP_EV_CONNECT_RST(0, TCP_LADDR(c),
915                                                                                                         TCP_LPORT(c), TCP_PSU(c),
916                                                                                                         TCP_PROTO(c));
917                                                                 break;
918                                                         default:
919                                                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
920                                                                                                         TCP_LPORT(c), TCP_PSU(c),
921                                                                                                         TCP_PROTO(c));
922                                                 }
923                                                 TCP_STATS_CONNECT_FAILED();
924                                         }else{
925                                                 switch(errno){
926                                                         case ECONNREFUSED:
927                                                         case ECONNRESET:
928                                                                 TCP_STATS_CON_RESET();
929                                                                 /* no break */
930                                                         case ENETUNREACH:
931                                                         case EHOSTUNREACH: /* not posix for send() */
932 #ifdef USE_DST_BLACKLIST
933                                                                 dst_blacklist_su(BLST_ERR_SEND,
934                                                                                                         c->rcv.proto,
935                                                                                                         &c->rcv.src_su,
936                                                                                                         &c->send_flags, 0);
937 #endif /* USE_DST_BLACKLIST */
938                                                                 break;
939                                                 }
940                                         }
941                                         ret=-1;
942                                         LOG(L_ERR, "ERROR: wbuf_runq: %s [%d]\n",
943                                                 strerror(errno), errno);
944                                 }
945                         }
946                         break;
947                 }
948         }
949         if (likely(q->first==0)){
950                 q->last=0;
951                 q->last_used=0;
952                 q->offset=0;
953                 *empty=1;
954         }
955         lock_release(&c->write_lock);
956         if (likely(ret>0)){
957                 q->wr_timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, send_timeout);
958                 if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
959                         TCP_STATS_ESTABLISHED(c->state);
960                         c->state=S_CONN_OK;
961                 }
962         }
963         return ret;
964 }
965
966 #endif /* TCP_ASYNC */
967
968
969
970 #if 0
971 /* blocking write even on non-blocking sockets 
972  * if TCP_TIMEOUT will return with error */
973 static int tcp_blocking_write(struct tcp_connection* c, int fd, char* buf,
974                                                                 unsigned int len)
975 {
976         int n;
977         fd_set sel_set;
978         struct timeval timeout;
979         int ticks;
980         int initial_len;
981         
982         initial_len=len;
983 again:
984         
985         n=send(fd, buf, len,
986 #ifdef HAVE_MSG_NOSIGNAL
987                         MSG_NOSIGNAL
988 #else
989                         0
990 #endif
991                 );
992         if (n<0){
993                 if (errno==EINTR)       goto again;
994                 else if (errno!=EAGAIN && errno!=EWOULDBLOCK){
995                         LOG(L_ERR, "tcp_blocking_write: failed to send: (%d) %s\n",
996                                         errno, strerror(errno));
997                         TCP_EV_SEND_TIMEOUT(errno, &c->rcv);
998                         TCP_STATS_SEND_TIMEOUT();
999                         goto error;
1000                 }
1001         }else if (n<len){
1002                 /* partial write */
1003                 buf+=n;
1004                 len-=n;
1005         }else{
1006                 /* success: full write */
1007                 goto end;
1008         }
1009         while(1){
1010                 FD_ZERO(&sel_set);
1011                 FD_SET(fd, &sel_set);
1012                 timeout.tv_sec=tcp_send_timeout;
1013                 timeout.tv_usec=0;
1014                 ticks=get_ticks();
1015                 n=select(fd+1, 0, &sel_set, 0, &timeout);
1016                 if (n<0){
1017                         if (errno==EINTR) continue; /* signal, ignore */
1018                         LOG(L_ERR, "ERROR: tcp_blocking_write: select failed: "
1019                                         " (%d) %s\n", errno, strerror(errno));
1020                         goto error;
1021                 }else if (n==0){
1022                         /* timeout */
1023                         if (get_ticks()-ticks>=tcp_send_timeout){
1024                                 LOG(L_ERR, "ERROR: tcp_blocking_write: send timeout (%d)\n",
1025                                                 tcp_send_timeout);
1026                                 goto error;
1027                         }
1028                         continue;
1029                 }
1030                 if (FD_ISSET(fd, &sel_set)){
1031                         /* we can write again */
1032                         goto again;
1033                 }
1034         }
1035 error:
1036                 return -1;
1037 end:
1038                 return initial_len;
1039 }
1040 #endif
1041
1042
1043
1044 struct tcp_connection* tcpconn_new(int sock, union sockaddr_union* su,
1045                                                                         union sockaddr_union* local_addr,
1046                                                                         struct socket_info* ba, int type, 
1047                                                                         int state)
1048 {
1049         struct tcp_connection *c;
1050         int rd_b_size;
1051         
1052         rd_b_size=cfg_get(tcp, tcp_cfg, rd_buf_size);
1053         c=shm_malloc(sizeof(struct tcp_connection) + rd_b_size);
1054         if (c==0){
1055                 LOG(L_ERR, "ERROR: tcpconn_new: mem. allocation failure\n");
1056                 goto error;
1057         }
1058         memset(c, 0, sizeof(struct tcp_connection)); /* zero init (skip rd buf)*/
1059         c->s=sock;
1060         c->fd=-1; /* not initialized */
1061         if (lock_init(&c->write_lock)==0){
1062                 LOG(L_ERR, "ERROR: tcpconn_new: init lock failed\n");
1063                 goto error;
1064         }
1065         
1066         c->rcv.src_su=*su;
1067         
1068         atomic_set(&c->refcnt, 0);
1069         local_timer_init(&c->timer, tcpconn_main_timeout, c, 0);
1070         su2ip_addr(&c->rcv.src_ip, su);
1071         c->rcv.src_port=su_getport(su);
1072         c->rcv.bind_address=ba;
1073         if (likely(local_addr)){
1074                 su2ip_addr(&c->rcv.dst_ip, local_addr);
1075                 c->rcv.dst_port=su_getport(local_addr);
1076         }else if (ba){
1077                 c->rcv.dst_ip=ba->address;
1078                 c->rcv.dst_port=ba->port_no;
1079         }
1080         print_ip("tcpconn_new: new tcp connection: ", &c->rcv.src_ip, "\n");
1081         DBG(     "tcpconn_new: on port %d, type %d\n", c->rcv.src_port, type);
1082         init_tcp_req(&c->req, (char*)c+sizeof(struct tcp_connection), rd_b_size);
1083         c->id=(*connection_id)++;
1084         c->rcv.proto_reserved1=0; /* this will be filled before receive_message*/
1085         c->rcv.proto_reserved2=0;
1086         c->state=state;
1087         c->extra_data=0;
1088 #ifdef USE_TLS
1089         if (type==PROTO_TLS){
1090                 if (tls_tcpconn_init(c, sock)==-1) goto error;
1091         }else
1092 #endif /* USE_TLS*/
1093         {
1094                 c->type=PROTO_TCP;
1095                 c->rcv.proto=PROTO_TCP;
1096                 c->timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, con_lifetime);
1097         }
1098         
1099         return c;
1100         
1101 error:
1102         if (c) shm_free(c);
1103         return 0;
1104 }
1105
1106
1107
1108 /* do the actual connect, set sock. options a.s.o
1109  * returns socket on success, -1 on error
1110  * sets also *res_local_addr, res_si and state (S_CONN_CONNECT for an
1111  * unfinished connect and S_CONN_OK for a finished one)*/
1112 inline static int tcp_do_connect(       union sockaddr_union* server,
1113                                                                         union sockaddr_union* from,
1114                                                                         int type,
1115                                                                         snd_flags_t* send_flags,
1116                                                                         union sockaddr_union* res_local_addr,
1117                                                                         struct socket_info** res_si,
1118                                                                         enum tcp_conn_states *state
1119                                                                         )
1120 {
1121         int s;
1122         union sockaddr_union my_name;
1123         socklen_t my_name_len;
1124         struct ip_addr ip;
1125 #ifdef TCP_ASYNC
1126         int n;
1127 #endif /* TCP_ASYNC */
1128
1129         s=socket(AF2PF(server->s.sa_family), SOCK_STREAM, 0);
1130         if (unlikely(s==-1)){
1131                 LOG(L_ERR, "ERROR: tcp_do_connect %s: socket: (%d) %s\n",
1132                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1133                 goto error;
1134         }
1135         if (init_sock_opt(s)<0){
1136                 LOG(L_ERR, "ERROR: tcp_do_connect %s: init_sock_opt failed\n",
1137                                         su2a(server, sizeof(*server)));
1138                 goto error;
1139         }
1140         
1141         if (unlikely(from && bind(s, &from->s, sockaddru_len(*from)) != 0)){
1142                 LOG(L_WARN, "WARNING: tcp_do_connect: binding to source address"
1143                                         " %s failed: %s [%d]\n", su2a(from, sizeof(*from)),
1144                                         strerror(errno), errno);
1145         }
1146         *state=S_CONN_OK;
1147 #ifdef TCP_ASYNC
1148         if (likely(cfg_get(tcp, tcp_cfg, async))){
1149 again:
1150                 n=connect(s, &server->s, sockaddru_len(*server));
1151                 if (likely(n==-1)){ /*non-blocking => most probable EINPROGRESS*/
1152                         if (likely(errno==EINPROGRESS))
1153                                 *state=S_CONN_CONNECT;
1154                         else if (errno==EINTR) goto again;
1155                         else if (errno!=EALREADY){
1156                                 switch(errno){
1157                                         case ENETUNREACH:
1158                                         case EHOSTUNREACH:
1159 #ifdef USE_DST_BLACKLIST
1160                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1161                                                                                         send_flags, 0);
1162 #endif /* USE_DST_BLACKLIST */
1163                                                 TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0, server, type);
1164                                                 break;
1165                                         case ETIMEDOUT:
1166 #ifdef USE_DST_BLACKLIST
1167                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1168                                                                                         send_flags, 0);
1169 #endif /* USE_DST_BLACKLIST */
1170                                                 TCP_EV_CONNECT_TIMEOUT(errno, 0, 0, server, type);
1171                                                 break;
1172                                         case ECONNREFUSED:
1173                                         case ECONNRESET:
1174 #ifdef USE_DST_BLACKLIST
1175                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1176                                                                                         send_flags, 0);
1177 #endif /* USE_DST_BLACKLIST */
1178                                                 TCP_EV_CONNECT_RST(errno, 0, 0, server, type);
1179                                                 break;
1180                                         case EAGAIN:/* not posix, but supported on linux and bsd */
1181                                                 TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0, server,type);
1182                                                 break;
1183                                         default:
1184                                                 TCP_EV_CONNECT_ERR(errno, 0, 0, server, type);
1185                                 }
1186                                 TCP_STATS_CONNECT_FAILED();
1187                                 LOG(L_ERR, "ERROR: tcp_do_connect: connect %s: (%d) %s\n",
1188                                                         su2a(server, sizeof(*server)),
1189                                                         errno, strerror(errno));
1190                                 goto error;
1191                         }
1192                 }
1193         }else{
1194 #endif /* TCP_ASYNC */
1195                 if (tcp_blocking_connect(s, type,  send_flags, &server->s,
1196                                                                         sockaddru_len(*server))<0){
1197                         LOG(L_ERR, "ERROR: tcp_do_connect: tcp_blocking_connect %s"
1198                                                 " failed\n", su2a(server, sizeof(*server)));
1199                         goto error;
1200                 }
1201 #ifdef TCP_ASYNC
1202         }
1203 #endif /* TCP_ASYNC */
1204         if (from){
1205                 su2ip_addr(&ip, from);
1206                 if (!ip_addr_any(&ip))
1207                         /* we already know the source ip, skip the sys. call */
1208                         goto find_socket;
1209         }
1210         my_name_len=sizeof(my_name);
1211         if (unlikely(getsockname(s, &my_name.s, &my_name_len)!=0)){
1212                 LOG(L_ERR, "ERROR: tcp_do_connect: getsockname failed: %s(%d)\n",
1213                                 strerror(errno), errno);
1214                 *res_si=0;
1215                 goto error;
1216         }
1217         from=&my_name; /* update from with the real "from" address */
1218         su2ip_addr(&ip, &my_name);
1219 find_socket:
1220 #ifdef USE_TLS
1221         if (unlikely(type==PROTO_TLS))
1222                 *res_si=find_si(&ip, 0, PROTO_TLS);
1223         else
1224 #endif
1225                 *res_si=find_si(&ip, 0, PROTO_TCP);
1226         
1227         if (unlikely(*res_si==0)){
1228                 LOG(L_WARN, "WARNING: tcp_do_connect %s: could not find corresponding"
1229                                 " listening socket for %s, using default...\n",
1230                                         su2a(server, sizeof(*server)), ip_addr2a(&ip));
1231                 if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1232 #ifdef USE_IPV6
1233                 else *res_si=sendipv6_tcp;
1234 #endif
1235         }
1236         *res_local_addr=*from;
1237         return s;
1238 error:
1239         if (s!=-1) tcp_safe_close(s);
1240         return -1;
1241 }
1242
1243
1244
1245 struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
1246                                                                                 union sockaddr_union* from,
1247                                                                                 int type, snd_flags_t* send_flags)
1248 {
1249         int s;
1250         struct socket_info* si;
1251         union sockaddr_union my_name;
1252         struct tcp_connection* con;
1253         enum tcp_conn_states state;
1254
1255         s=-1;
1256         
1257         if (*tcp_connections_no >= cfg_get(tcp, tcp_cfg, max_connections)){
1258                 LOG(L_ERR, "ERROR: tcpconn_connect: maximum number of connections"
1259                                         " exceeded (%d/%d)\n",
1260                                         *tcp_connections_no,
1261                                         cfg_get(tcp, tcp_cfg, max_connections));
1262                 goto error;
1263         }
1264         s=tcp_do_connect(server, from, type,  send_flags, &my_name, &si, &state);
1265         if (s==-1){
1266                 LOG(L_ERR, "ERROR: tcp_do_connect %s: failed (%d) %s\n",
1267                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1268                 goto error;
1269         }
1270         con=tcpconn_new(s, server, &my_name, si, type, state);
1271         if (con==0){
1272                 LOG(L_ERR, "ERROR: tcp_connect %s: tcpconn_new failed, closing the "
1273                                  " socket\n", su2a(server, sizeof(*server)));
1274                 goto error;
1275         }
1276         tcpconn_set_send_flags(con, *send_flags);
1277         return con;
1278 error:
1279         if (s!=-1) tcp_safe_close(s); /* close the opened socket */
1280         return 0;
1281 }
1282
1283
1284
1285 #ifdef TCP_CONNECT_WAIT
1286 int tcpconn_finish_connect( struct tcp_connection* c,
1287                                                                                                 union sockaddr_union* from)
1288 {
1289         int s;
1290         int r;
1291         union sockaddr_union local_addr;
1292         struct socket_info* si;
1293         enum tcp_conn_states state;
1294         struct tcp_conn_alias* a;
1295         int new_conn_alias_flags;
1296         
1297         s=tcp_do_connect(&c->rcv.src_su, from, c->type, &c->send_flags,
1298                                                 &local_addr, &si, &state);
1299         if (unlikely(s==-1)){
1300                 LOG(L_ERR, "ERROR: tcpconn_finish_connect %s: tcp_do_connect for %p"
1301                                         " failed\n", su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
1302                                         c);
1303                 return -1;
1304         }
1305         c->rcv.bind_address=si;
1306         su2ip_addr(&c->rcv.dst_ip, &local_addr);
1307         c->rcv.dst_port=su_getport(&local_addr);
1308         /* update aliases if needed */
1309         if (likely(from==0)){
1310                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1311                 /* add aliases */
1312                 TCPCONN_LOCK;
1313                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1314                                                                                                         new_conn_alias_flags);
1315                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1316                                                                         c->rcv.dst_port, new_conn_alias_flags);
1317                 TCPCONN_UNLOCK;
1318         }else if (su_cmp(from, &local_addr)!=1){
1319                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1320                 TCPCONN_LOCK;
1321                         /* remove all the aliases except the first one and re-add them
1322                          * (there shouldn't be more then the 3 default aliases at this 
1323                          * stage) */
1324                         for (r=1; r<c->aliases; r++){
1325                                 a=&c->con_aliases[r];
1326                                 tcpconn_listrm(tcpconn_aliases_hash[a->hash], a, next, prev);
1327                         }
1328                         c->aliases=1;
1329                         /* add the local_ip:0 and local_ip:local_port aliases */
1330                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1331                                                                                                 0, new_conn_alias_flags);
1332                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1333                                                                         c->rcv.dst_port, new_conn_alias_flags);
1334                 TCPCONN_UNLOCK;
1335         }
1336         
1337         return s;
1338 }
1339 #endif /* TCP_CONNECT_WAIT */
1340
1341
1342
1343 /* adds a tcp connection to the tcpconn hashes
1344  * Note: it's called _only_ from the tcp_main process */
1345 inline static struct tcp_connection*  tcpconn_add(struct tcp_connection *c)
1346 {
1347         struct ip_addr zero_ip;
1348         int new_conn_alias_flags;
1349
1350         if (likely(c)){
1351                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1352                 c->id_hash=tcp_id_hash(c->id);
1353                 c->aliases=0;
1354                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1355                 TCPCONN_LOCK;
1356                 c->flags|=F_CONN_HASHED;
1357                 /* add it at the begining of the list*/
1358                 tcpconn_listadd(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1359                 /* set the aliases */
1360                 /* first alias is for (peer_ip, peer_port, 0 ,0) -- for finding
1361                  *  any connection to peer_ip, peer_port
1362                  * the second alias is for (peer_ip, peer_port, local_addr, 0) -- for
1363                  *  finding any conenction to peer_ip, peer_port from local_addr 
1364                  * the third alias is for (peer_ip, peer_port, local_addr, local_port) 
1365                  *   -- for finding if a fully specified connection exists */
1366                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &zero_ip, 0,
1367                                                                                                         new_conn_alias_flags);
1368                 if (likely(c->rcv.dst_ip.af && ! ip_addr_any(&c->rcv.dst_ip))){
1369                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1370                                                                                                         new_conn_alias_flags);
1371                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1372                                                                         c->rcv.dst_port, new_conn_alias_flags);
1373                 }
1374                 /* ignore add_alias errors, there are some valid cases when one
1375                  *  of the add_alias would fail (e.g. first add_alias for 2 connections
1376                  *   with the same destination but different src. ip*/
1377                 TCPCONN_UNLOCK;
1378                 DBG("tcpconn_add: hashes: %d:%d:%d, %d\n",
1379                                                                                                 c->con_aliases[0].hash,
1380                                                                                                 c->con_aliases[1].hash,
1381                                                                                                 c->con_aliases[2].hash,
1382                                                                                                 c->id_hash);
1383                 return c;
1384         }else{
1385                 LOG(L_CRIT, "tcpconn_add: BUG: null connection pointer\n");
1386                 return 0;
1387         }
1388 }
1389
1390
1391 static inline void _tcpconn_detach(struct tcp_connection *c)
1392 {
1393         int r;
1394         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1395         /* remove all the aliases */
1396         for (r=0; r<c->aliases; r++)
1397                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1398                                                 &c->con_aliases[r], next, prev);
1399         c->aliases = 0;
1400 }
1401
1402
1403
1404 static inline void _tcpconn_free(struct tcp_connection* c)
1405 {
1406 #ifdef TCP_ASYNC
1407         if (unlikely(_wbufq_non_empty(c)))
1408                 _wbufq_destroy(&c->wbuf_q);
1409 #endif
1410         lock_destroy(&c->write_lock);
1411 #ifdef USE_TLS
1412         if (unlikely(c->type==PROTO_TLS)) tls_tcpconn_clean(c);
1413 #endif
1414         shm_free(c);
1415 }
1416
1417
1418
/* Lock-free variant of tcpconn_rm: takes c off the hash lists
 * (_tcpconn_detach) and then frees it (_tcpconn_free).
 * No lock is taken here. */
void _tcpconn_rm(struct tcp_connection* c)
{
        _tcpconn_detach(c);     /* unlink from id & alias hashes */
        _tcpconn_free(c);       /* release resources and memory */
}
1425
1426
1427
1428 void tcpconn_rm(struct tcp_connection* c)
1429 {
1430         int r;
1431         TCPCONN_LOCK;
1432         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1433         /* remove all the aliases */
1434         for (r=0; r<c->aliases; r++)
1435                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1436                                                 &c->con_aliases[r], next, prev);
1437         c->aliases = 0;
1438         TCPCONN_UNLOCK;
1439         lock_destroy(&c->write_lock);
1440 #ifdef USE_TLS
1441         if ((c->type==PROTO_TLS)&&(c->extra_data)) tls_tcpconn_clean(c);
1442 #endif
1443         shm_free(c);
1444 }
1445
1446
1447 /* finds a connection, if id=0 uses the ip addr, port, local_ip and local port
1448  *  (host byte order) and tries to find the connection that matches all of
1449  *   them. Wild cards can be used for local_ip and local_port (a 0 filled
1450  *   ip address and/or a 0 local port).
1451  * WARNING: unprotected (locks) use tcpconn_get unless you really
1452  * know what you are doing */
1453 struct tcp_connection* _tcpconn_find(int id, struct ip_addr* ip, int port,
1454                                                                                 struct ip_addr* l_ip, int l_port)
1455 {
1456
1457         struct tcp_connection *c;
1458         struct tcp_conn_alias* a;
1459         unsigned hash;
1460         int is_local_ip_any;
1461         
1462 #ifdef EXTRA_DEBUG
1463         DBG("tcpconn_find: %d  port %d\n",id, port);
1464         if (ip) print_ip("tcpconn_find: ip ", ip, "\n");
1465 #endif
1466         if (likely(id)){
1467                 hash=tcp_id_hash(id);
1468                 for (c=tcpconn_id_hash[hash]; c; c=c->id_next){
1469 #ifdef EXTRA_DEBUG
1470                         DBG("c=%p, c->id=%d, port=%d\n",c, c->id, c->rcv.src_port);
1471                         print_ip("ip=", &c->rcv.src_ip, "\n");
1472 #endif
1473                         if ((id==c->id)&&(c->state!=S_CONN_BAD)) return c;
1474                 }
1475         }else if (likely(ip)){
1476                 hash=tcp_addr_hash(ip, port, l_ip, l_port);
1477                 is_local_ip_any=ip_addr_any(l_ip);
1478                 for (a=tcpconn_aliases_hash[hash]; a; a=a->next){
1479 #ifdef EXTRA_DEBUG
1480                         DBG("a=%p, c=%p, c->id=%d, alias port= %d port=%d\n", a, a->parent,
1481                                         a->parent->id, a->port, a->parent->rcv.src_port);
1482                         print_ip("ip=",&a->parent->rcv.src_ip,"\n");
1483 #endif
1484                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1485                                         ((l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1486                                         (ip_addr_cmp(ip, &a->parent->rcv.src_ip)) &&
1487                                         (is_local_ip_any ||
1488                                                 ip_addr_cmp(l_ip, &a->parent->rcv.dst_ip))
1489                                 )
1490                                 return a->parent;
1491                 }
1492         }
1493         return 0;
1494 }
1495
1496
1497
1498 /* _tcpconn_find with locks and timeout
1499  * local_addr contains the desired local ip:port. If null any local address 
1500  * will be used.  IN*ADDR_ANY or 0 port are wild cards.
1501  */
1502 struct tcp_connection* tcpconn_get(int id, struct ip_addr* ip, int port,
1503                                                                         union sockaddr_union* local_addr,
1504                                                                         ticks_t timeout)
1505 {
1506         struct tcp_connection* c;
1507         struct ip_addr local_ip;
1508         int local_port;
1509         
1510         local_port=0;
1511         if (likely(ip)){
1512                 if (unlikely(local_addr)){
1513                         su2ip_addr(&local_ip, local_addr);
1514                         local_port=su_getport(local_addr);
1515                 }else{
1516                         ip_addr_mk_any(ip->af, &local_ip);
1517                         local_port=0;
1518                 }
1519         }
1520         TCPCONN_LOCK;
1521         c=_tcpconn_find(id, ip, port, &local_ip, local_port);
1522         if (likely(c)){ 
1523                         atomic_inc(&c->refcnt);
1524                         /* update the timeout only if the connection is not handled
1525                          * by a tcp reader (the tcp reader process uses c->timeout for 
1526                          * its own internal timeout and c->timeout will be overwritten
1527                          * anyway on return to tcp_main) */
1528                         if (likely(c->reader_pid==0))
1529                                 c->timeout=get_ticks_raw()+timeout;
1530         }
1531         TCPCONN_UNLOCK;
1532         return c;
1533 }
1534
1535
1536
1537 /* add c->dst:port, local_addr as an alias for the "id" connection, 
1538  * flags: TCP_ALIAS_FORCE_ADD  - add an alias even if a previous one exists
1539  *        TCP_ALIAS_REPLACE    - if a prev. alias exists, replace it with the
1540  *                                new one
1541  * returns 0 on success, <0 on failure ( -1  - null c, -2 too many aliases,
1542  *  -3 alias already present and pointing to another connection)
1543  * WARNING: must be called with TCPCONN_LOCK held */
     /* Details:
      *  - the alias key is (c->rcv.src_ip, port, l_ip, l_port); an all-zero
      *    l_ip and a 0 l_port are treated as wild cards when comparing.
      *  - an existing alias "matches" when its parent is not S_CONN_BAD, its
      *    port equals port, the parent's src ip equals c's src ip and the
      *    local ip/port pass the (wild card aware) comparison.
      *  - if a matching alias already belongs to c, nothing is added and 0
      *    is returned.
      *  - -1 is also returned for the internal BUG case in which an alias is
      *    not found in its parent's con_aliases array. */
1544 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
1545                                                                                 struct ip_addr* l_ip, int l_port,
1546                                                                                 int flags)
1547 {
1548         unsigned hash;
1549         struct tcp_conn_alias* a;
1550         struct tcp_conn_alias* nxt;
1551         struct tcp_connection* p;
1552         int is_local_ip_any;
1553         int i;
1554         int r;
1555         
1556         a=0;
1557         is_local_ip_any=ip_addr_any(l_ip);
1558         if (likely(c)){
1559                 hash=tcp_addr_hash(&c->rcv.src_ip, port, l_ip, l_port);
1560                 /* search the aliases for an already existing one */
                     /* the successor is saved in nxt before the body runs, since
                      * the replace branch below unlinks the current alias */
1561                 for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
1562                         nxt=a->next;
1563                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1564                                         ( (l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1565                                         (ip_addr_cmp(&c->rcv.src_ip, &a->parent->rcv.src_ip)) &&
1566                                         ( is_local_ip_any || 
1567                                           ip_addr_cmp(&a->parent->rcv.dst_ip, l_ip))
1568                                         ){
1569                                 /* found */
1570                                 if (unlikely(a->parent!=c)){
                                             /* the alias belongs to another connection:
                                              * what happens next depends on flags */
1571                                         if (flags & TCP_ALIAS_FORCE_ADD)
1572                                                 /* still have to walk the whole list to check if
1573                                                  * the alias was not already added */
1574                                                 continue;
1575                                         else if (flags & TCP_ALIAS_REPLACE){
1576                                                 /* remove the alias =>
1577                                                  * remove the current alias and all the following
1578                                                  *  ones from the corresponding connection, shift the 
1579                                                  *  connection aliases array and re-add the other 
1580                                                  *  aliases (!= current one) */
1581                                                 p=a->parent;
                                                     /* i = index of a inside p->con_aliases */
1582                                                 for (i=0; (i<p->aliases) && (&(p->con_aliases[i])!=a);
1583                                                                 i++);
1584                                                 if (unlikely(i==p->aliases)){
1585                                                         LOG(L_CRIT, "BUG: _tcpconn_add_alias_unsafe: "
1586                                                                         " alias %p not found in con %p (id %d)\n",
1587                                                                         a, p, p->id);
1588                                                         goto error_not_found;
1589                                                 }
                                                     /* unlink entries i..end from their hash lists: the
                                                      * memmove below changes their addresses */
1590                                                 for (r=i; r<p->aliases; r++){
1591                                                         tcpconn_listrm(
1592                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash],
1593                                                                 &p->con_aliases[r], next, prev);
1594                                                 }
                                                     /* close the gap left by entry i (nothing to move
                                                      * if it was the last one) */
1595                                                 if (likely((i+1)<p->aliases)){
1596                                                         memmove(&p->con_aliases[i], &p->con_aliases[i+1],
1597                                                                                         (p->aliases-i-1)*
1598                                                                                                 sizeof(p->con_aliases[0]));
1599                                                 }
1600                                                 p->aliases--;
1601                                                 /* re-add the remaining aliases */
1602                                                 for (r=i; r<p->aliases; r++){
1603                                                         tcpconn_listadd(
1604                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash], 
1605                                                                 &p->con_aliases[r], next, prev);
1606                                                 }
                                                     /* NOTE(review): nxt was saved before p's aliases were
                                                      * unlinked, shifted and re-linked; if nxt happens to
                                                      * point into p->con_aliases the walk would continue
                                                      * from a moved entry -- confirm whether this can
                                                      * occur (aliases of p sharing this hash bucket) */
1607                                         }else
1608                                                 goto error_sec;
1609                                 }else goto ok;
1610                         }
1611                 }
                     /* no alias of c matched => append a new one in the next
                      * free slot of c->con_aliases, if there is room */
1612                 if (unlikely(c->aliases>=TCP_CON_MAX_ALIASES)) goto error_aliases;
1613                 c->con_aliases[c->aliases].parent=c;
1614                 c->con_aliases[c->aliases].port=port;
1615                 c->con_aliases[c->aliases].hash=hash;
1616                 tcpconn_listadd(tcpconn_aliases_hash[hash], 
1617                                                                 &c->con_aliases[c->aliases], next, prev);
1618                 c->aliases++;
1619         }else goto error_not_found;
1620 ok:
         /* a is non-null here only when the loop was left via "goto ok"
          * (the alias already belonged to c); after a complete walk a is 0 */
1621 #ifdef EXTRA_DEBUG
1622         if (a) DBG("_tcpconn_add_alias_unsafe: alias already present\n");
1623         else   DBG("_tcpconn_add_alias_unsafe: alias port %d for hash %d, id %d\n",
1624                         port, hash, c->id);
1625 #endif
1626         return 0;
1627 error_aliases:
1628         /* too many aliases */
1629         return -2;
1630 error_not_found:
1631         /* null connection */
1632         return -1;
1633 error_sec:
1634         /* alias already present and pointing to a different connection
1635          * (hijack attempt?) */
1636         return -3;
1637 }
1638
1639
1640
1641 /* add port as an alias for the "id" connection, 
1642  * returns 0 on success,-1 on failure */
1643 int tcpconn_add_alias(int id, int port, int proto)
1644 {
1645         struct tcp_connection* c;
1646         int ret;
1647         struct ip_addr zero_ip;
1648         int r;
1649         int alias_flags;
1650         
1651         /* fix the port */
1652         port=port?port:((proto==PROTO_TLS)?SIPS_PORT:SIP_PORT);
1653         TCPCONN_LOCK;
1654         /* check if alias already exists */
1655         c=_tcpconn_find(id, 0, 0, 0, 0);
1656         if (likely(c)){
1657                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1658                 alias_flags=cfg_get(tcp, tcp_cfg, alias_flags);
1659                 /* alias src_ip:port, 0, 0 */
1660                 ret=_tcpconn_add_alias_unsafe(c, port,  &zero_ip, 0, 
1661                                                                                 alias_flags);
1662                 if (ret<0 && ret!=-3) goto error;
1663                 /* alias src_ip:port, local_ip, 0 */
1664                 ret=_tcpconn_add_alias_unsafe(c, port,  &c->rcv.dst_ip, 0, 
1665                                                                                 alias_flags);
1666                 if (ret<0 && ret!=-3) goto error;
1667                 /* alias src_ip:port, local_ip, local_port */
1668                 ret=_tcpconn_add_alias_unsafe(c, port, &c->rcv.dst_ip, c->rcv.dst_port,
1669                                                                                 alias_flags);
1670                 if (unlikely(ret<0)) goto error;
1671         }else goto error_not_found;
1672         TCPCONN_UNLOCK;
1673         return 0;
1674 error_not_found:
1675         TCPCONN_UNLOCK;
1676         LOG(L_ERR, "ERROR: tcpconn_add_alias: no connection found for id %d\n",id);
1677         return -1;
1678 error:
1679         TCPCONN_UNLOCK;
1680         switch(ret){
1681                 case -2:
1682                         LOG(L_ERR, "ERROR: tcpconn_add_alias: too many aliases (%d)"
1683                                         " for connection %p (id %d) %s:%d <- %d\n",
1684                                         c->aliases, c, c->id, ip_addr2a(&c->rcv.src_ip),
1685                                         c->rcv.src_port, port);
1686                         for (r=0; r<c->aliases; r++){
1687                                 LOG(L_ERR, "ERROR: tcpconn_add_alias: alias %d: for %p (%d)"
1688                                                 " %s:%d <-%d hash %x\n",  r, c, c->id, 
1689                                                  ip_addr2a(&c->rcv.src_ip), c->rcv.src_port, 
1690                                                 c->con_aliases[r].port, c->con_aliases[r].hash);
1691                         }
1692                         break;
1693                 case -3:
1694                         LOG(L_ERR, "ERROR: tcpconn_add_alias: possible port"
1695                                         " hijack attempt\n");
1696                         LOG(L_ERR, "ERROR: tcpconn_add_alias: alias for %d port %d already"
1697                                                 " present and points to another connection \n",
1698                                                 c->id, port);
1699                         break;
1700                 default:
1701                         LOG(L_ERR, "ERROR: tcpconn_add_alias: unkown error %d\n", ret);
1702         }
1703         return -1;
1704 }
1705
1706
1707
1708 #ifdef TCP_FD_CACHE
1709
1710 static void tcp_fd_cache_init()
1711 {
1712         int r;
1713         for (r=0; r<TCP_FD_CACHE_SIZE; r++)
1714                 fd_cache[r].fd=-1;
1715 }
1716
1717
1718 inline static struct fd_cache_entry* tcp_fd_cache_get(struct tcp_connection *c)
1719 {
1720         int h;
1721         
1722         h=c->id%TCP_FD_CACHE_SIZE;
1723         if ((fd_cache[h].fd>0) && (fd_cache[h].id==c->id) && (fd_cache[h].con==c))
1724                 return &fd_cache[h];
1725         return 0;
1726 }
1727
1728
1729 inline static void tcp_fd_cache_rm(struct fd_cache_entry* e)
1730 {
1731         e->fd=-1;
1732 }
1733
1734
1735 inline static void tcp_fd_cache_add(struct tcp_connection *c, int fd)
1736 {
1737         int h;
1738         
1739         h=c->id%TCP_FD_CACHE_SIZE;
1740         if (likely(fd_cache[h].fd>0))
1741                 tcp_safe_close(fd_cache[h].fd);
1742         fd_cache[h].fd=fd;
1743         fd_cache[h].id=c->id;
1744         fd_cache[h].con=c;
1745 }
1746
1747 #endif /* TCP_FD_CACHE */
1748
1749
1750
1751 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn);
1752
1753 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
1754                                                         unsigned len, snd_flags_t send_flags);
1755 static int tcpconn_do_send(int fd, struct tcp_connection* c,
1756                                                         const char* buf, unsigned len,
1757                                                         snd_flags_t send_flags, long* resp, int locked);
1758
1759 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
1760                                                         const char* buf, unsigned len,
1761                                                         snd_flags_t send_flags, long* resp, int locked);
1762
1763 /* finds a tcpconn & sends on it
1764  * uses the dst members to, proto (TCP|TLS) and id and tries to send
1765  *  from the "from" address (if non null and id==0)
1766  * returns: number of bytes written (>=0) on success
1767  *          <0 on error */
1768 int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1769                                         const char* buf, unsigned len)
1770 {
1771         struct tcp_connection *c;
1772         struct ip_addr ip;
1773         int port;
1774         int fd;
1775         long response[2];
1776         int n;
1777         int do_close_fd;
1778         ticks_t con_lifetime;
1779 #ifdef USE_TLS
1780         const char* rest_buf;
1781         const char* t_buf;
1782         unsigned rest_len, t_len;
1783         long resp;
1784         snd_flags_t t_send_flags;
1785 #endif /* USE_TLS */
1786         
1787         do_close_fd=1; /* close the fd on exit */
1788         port=su_getport(&dst->to);
1789         con_lifetime=cfg_get(tcp, tcp_cfg, con_lifetime);
1790         if (likely(port)){
1791                 su2ip_addr(&ip, &dst->to);
1792                 c=tcpconn_get(dst->id, &ip, port, from, con_lifetime); 
1793         }else if (likely(dst->id)){
1794                 c=tcpconn_get(dst->id, 0, 0, 0, con_lifetime);
1795         }else{
1796                 LOG(L_CRIT, "BUG: tcp_send called with null id & to\n");
1797                 return -1;
1798         }
1799         
1800         if (likely(dst->id)){
1801                 if (unlikely(c==0)) {
1802                         if (likely(port)){
1803                                 /* try again w/o id */
1804                                 c=tcpconn_get(0, &ip, port, from, con_lifetime);
1805                         }else{
1806                                 LOG(L_ERR, "ERROR: tcp_send: id %d not found, dropping\n",
1807                                                 dst->id);
1808                                 return -1;
1809                         }
1810                 }
1811         }
1812         /* connection not found or unusable => open a new one and send on it */
1813         if (unlikely((c==0) || tcpconn_close_after_send(c))){
1814                 if (unlikely(c)){
1815                         /* can't use c if it's marked as close-after-send  =>
1816                            release it and try opening new one */
1817                         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
1818                         c=0;
1819                 }
1820                 /* check if connect() is disabled */
1821                 if (unlikely((dst->send_flags.f & SND_F_FORCE_CON_REUSE) ||
1822                                                 cfg_get(tcp, tcp_cfg, no_connect)))
1823                         return -1;
1824                 DBG("tcp_send: no open tcp connection found, opening new one\n");
1825                 /* create tcp connection */
1826                 if (likely(from==0)){
1827                         /* check to see if we have to use a specific source addr. */
1828                         switch (dst->to.s.sa_family) {
1829                                 case AF_INET:
1830                                                 from = tcp_source_ipv4;
1831                                         break;
1832 #ifdef USE_IPV6
1833                                 case AF_INET6:
1834                                                 from = tcp_source_ipv6;
1835                                         break;
1836 #endif
1837                                 default:
1838                                         /* error, bad af, ignore ... */
1839                                         break;
1840                         }
1841                 }
1842 #if defined(TCP_CONNECT_WAIT) && defined(TCP_ASYNC)
1843                 if (likely(cfg_get(tcp, tcp_cfg, tcp_connect_wait) && 
1844                                         cfg_get(tcp, tcp_cfg, async) )){
1845                         if (unlikely(*tcp_connections_no >=
1846                                                         cfg_get(tcp, tcp_cfg, max_connections))){
1847                                 LOG(L_ERR, "ERROR: tcp_send %s: maximum number of"
1848                                                         " connections exceeded (%d/%d)\n",
1849                                                         su2a(&dst->to, sizeof(dst->to)),
1850                                                         *tcp_connections_no,
1851                                                         cfg_get(tcp, tcp_cfg, max_connections));
1852                                 return -1;
1853                         }
1854                         c=tcpconn_new(-1, &dst->to, from, 0, dst->proto,
1855                                                         S_CONN_CONNECT);
1856                         if (unlikely(c==0)){
1857                                 LOG(L_ERR, "ERROR: tcp_send %s: could not create new"
1858                                                 " connection\n",
1859                                                 su2a(&dst->to, sizeof(dst->to)));
1860                                 return -1;
1861                         }
1862                         c->flags|=F_CONN_PENDING|F_CONN_FD_CLOSED;
1863                         tcpconn_set_send_flags(c, dst->send_flags);
1864                         atomic_set(&c->refcnt, 2); /* ref from here and from main hash
1865                                                                                  table */
1866                         /* add it to id hash and aliases */
1867                         if (unlikely(tcpconn_add(c)==0)){
1868                                 LOG(L_ERR, "ERROR: tcp_send %s: could not add "
1869                                                         "connection %p\n",
1870                                                         su2a(&dst->to, sizeof(dst->to)),
1871                                                                 c);
1872                                 _tcpconn_free(c);
1873                                 n=-1;
1874                                 goto end_no_conn;
1875                         }
1876                         /* do connect and if src ip or port changed, update the 
1877                          * aliases */
1878                         if (unlikely((fd=tcpconn_finish_connect(c, from))<0)){
1879                                 /* tcpconn_finish_connect will automatically blacklist
1880                                    on error => no need to do it here */
1881                                 LOG(L_ERR, "ERROR: tcp_send %s: tcpconn_finish_connect(%p)"
1882                                                 " failed\n", su2a(&dst->to, sizeof(dst->to)),
1883                                                         c);
1884                                 goto conn_wait_error;
1885                         }
1886                         /* ? TODO: it might be faster just to queue the write directly
1887                          *  and send to main CONN_NEW_PENDING_WRITE */
1888                         /* delay sending the fd to main after the send */
1889                         
1890                         /* NOTE: no lock here, because the connection is marked as
1891                          * pending and nobody else will try to write on it. However
1892                          * this might produce out-of-order writes. If this is not
1893                          * desired either lock before the write or use 
1894                          * _wbufq_insert(...)
1895                          * NOTE2: _wbufq_insert() is used now (no out-of-order).
1896                          */
1897 #ifdef USE_TLS
1898                         if (unlikely(c->type==PROTO_TLS)) {
1899                         /* for TLS the TLS processing and the send must happen
1900                            atomically w/ respect to other sends on the same connection
1901                            (otherwise reordering might occur which would break TLS) =>
1902                            lock. However in this case this send will always be the first.
1903                            We can have the send() outside the lock only if this is the
1904                            first and only send (tls_encode is not called again), or
1905                            this is the last send for a tls_encode() loop and all the
1906                            previous ones did return CONN_NEW_COMPLETE or CONN_EOF.
1907                         */
1908                                 response[1] = CONN_NOP;
1909                                 t_buf = buf;
1910                                 t_len = len;
1911                                 lock_get(&c->write_lock);
1912 redo_tls_encode:
1913                                         t_send_flags = dst->send_flags;
1914                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
1915                                                                         &t_send_flags);
1916                                         /* There are 4 cases:
1917                                            1. entire buffer consumed from the first try
1918                                              (rest_len == rest_buf == 0)
1919                                            2. rest_buf & first call
1920                                            3. rest_buf & not first call
1921                                                   3a. CONN_NEW_COMPLETE or CONN_EOF
1922                                                   3b. CONN_NEW_PENDING_WRITE
1923                                            4. entire buffer consumed, but not first call
1924                                                4a. CONN_NEW_COMPLETE or CONN_EOF
1925                                                    4b. CONN_NEW_PENDING_WRITE
1926                                                 We misuse response[1] == CONN_NOP to test for the
1927                                                 first call.
1928                                         */
1929                                         if (unlikely(n < 0)) {
1930                                                 lock_release(&c->write_lock);
1931                                                 goto conn_wait_error;
1932                                         }
1933                                         if (likely(rest_len == 0)) {
1934                                                 /* 1 or 4*: CONN_NEW_COMPLETE, CONN_EOF,  CONN_NOP
1935                                                     or CONN_NEW_PENDING_WRITE (*rest_len == 0) */
1936                                                 if (likely(response[1] != CONN_NEW_PENDING_WRITE)) {
1937                                                         /* 1 or 4a => it's safe to do the send outside the
1938                                                            lock (it will either send directly or
1939                                                            wbufq_insert())
1940                                                         */
1941                                                         lock_release(&c->write_lock);
1942                                                         if (likely(t_len != 0)) {
1943                                                                 n=tcpconn_1st_send(fd, c, t_buf, t_len,
1944                                                                                                         t_send_flags,
1945                                                                                                         &response[1], 0);
1946                                                         } else { /* t_len == 0 */
1947                                                                 if (response[1] == CONN_NOP) {
1948                                                                         /* nothing to send (e.g  parallel send
1949                                                                            tls_encode queues some data and then
1950                                                                            WANT_READ => this tls_encode will queue
1951                                                                            the cleartext too and will have nothing
1952                                                                            to send right now) and initial send =>
1953                                                                            behave as if the send was successful
1954                                                                            (but never return EOF here) */
1955                                                                         response[1] = CONN_NEW_COMPLETE;
1956                                                                 }
1957                                                         }
1958                                                         /* exit */
1959                                                 } else {
1960                                                         /* CONN_NEW_PENDING_WRITE:  4b: it was a
1961                                                            repeated tls_encode() (or otherwise we would
1962                                                            have here CONN_NOP) => add to the queue */
1963                                                         if (unlikely(t_len &&
1964                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
1965                                                                 response[1] = CONN_ERROR;
1966                                                                 n = -1;
1967                                                         }
1968                                                         lock_release(&c->write_lock);
1969                                                         /* exit (no send) */
1970                                                 }
1971                                         } else {  /* rest_len != 0 */
1972                                                 /* 2 or 3*: if tls_encode hasn't finished, we have to
1973                                                    call tcpconn_1st_send() under lock (otherwise if it
1974                                                    returns CONN_NEW_PENDING_WRITE, there is no way
1975                                                    to find the right place to add the new queued
1976                                                    data from the 2nd tls_encode()) */
1977                                                 if (likely((response[1] == CONN_NOP /*2*/ ||
1978                                                                         response[1] == CONN_NEW_COMPLETE /*3a*/ ||
1979                                                                         response[1] == CONN_EOF /*3a*/) && t_len))
1980                                                         n = tcpconn_1st_send(fd, c, t_buf, t_len,
1981                                                                                                         t_send_flags,
1982                                                                                                         &response[1], 1);
1983                                                 else if (unlikely(t_len &&
1984                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
1985                                                         /*3b: CONN_NEW_PENDING_WRITE*/
1986                                                         response[1] = CONN_ERROR;
1987                                                         n = -1;
1988                                                 }
1989                                                 if (likely(n >= 0)) {
1990                                                         /* if t_len == 0 => nothing was sent => previous
1991                                                            response will be kept */
1992                                                         t_buf = rest_buf;
1993                                                         t_len = rest_len;
1994                                                         goto redo_tls_encode;
1995                                                 } else {
1996                                                         lock_release(&c->write_lock);
1997                                                         /* error exit */
1998                                                 }
1999                                         }
2000                         } else
2001 #endif /* USE_TLS */
2002                                 n=tcpconn_1st_send(fd, c, buf, len, dst->send_flags,
2003                                                                         &response[1], 0);
2004                         if (unlikely(n<0)) /* this will catch CONN_ERROR too */
2005                                 goto conn_wait_error;
2006                         if (unlikely(response[1]==CONN_EOF)){
2007                                 /* if close-after-send requested, don't bother
2008                                    sending the fd back to tcp_main, try closing it
2009                                    immediately (no other tcp_send should use it,
2010                                    because it is marked as close-after-send before
2011                                    being added to the hash) */
2012                                 goto conn_wait_close;
2013                         }
2014                         /* send to tcp_main */
2015                         response[0]=(long)c;
2016                         if (unlikely(send_fd(unix_tcp_sock, response,
2017                                                                         sizeof(response), fd) <= 0)){
2018                                 LOG(L_ERR, "BUG: tcp_send %s: %ld for %p"
2019                                                         " failed:" " %s (%d)\n",
2020                                                         su2a(&dst->to, sizeof(dst->to)),
2021                                                         response[1], c, strerror(errno), errno);
2022                                 goto conn_wait_error;
2023                         }
2024                         goto conn_wait_success;
2025                 }
2026 #endif /* TCP_CONNECT_WAIT  && TCP_ASYNC */
2027                 if (unlikely((c=tcpconn_connect(&dst->to, from, dst->proto,
2028                                                                                 &dst->send_flags))==0)){
2029                         LOG(L_ERR, "ERROR: tcp_send %s: connect failed\n",
2030                                                         su2a(&dst->to, sizeof(dst->to)));
2031                         return -1;
2032                 }
2033                 tcpconn_set_send_flags(c, dst->send_flags);
2034                 if (likely(c->state==S_CONN_OK))
2035                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2036                 atomic_set(&c->refcnt, 2); /* ref. from here and it will also
2037                                                                           be added in the tcp_main hash */
2038                 fd=c->s;
2039                 c->flags|=F_CONN_FD_CLOSED; /* not yet opened in main */
2040                 /* ? TODO: it might be faster just to queue the write and
2041                  * send to main a CONN_NEW_PENDING_WRITE */
2042                 
2043                 /* send the new tcpconn to "tcp main" */
2044                 response[0]=(long)c;
2045                 response[1]=CONN_NEW;
2046                 n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
2047                 if (unlikely(n<=0)){
2048                         LOG(L_ERR, "BUG: tcp_send %s: failed send_fd: %s (%d)\n",
2049                                         su2a(&dst->to, sizeof(dst->to)),
2050                                         strerror(errno), errno);
2051                         /* we can safely delete it, it's not referenced by anybody */
2052                         _tcpconn_free(c);
2053                         n=-1;
2054                         goto end_no_conn;
2055                 }
2056                 /* new connection => send on it directly */
2057 #ifdef USE_TLS
2058                 if (unlikely(c->type==PROTO_TLS)) {
2059                         /* for TLS the TLS processing and the send must happen
2060                            atomically w/ respect to other sends on the same connection
2061                            (otherwise reordering might occur which would break TLS) =>
2062                            lock.
2063                         */
2064                         response[1] = CONN_NOP;
2065                         t_buf = buf;
2066                         t_len = len;
2067                         lock_get(&c->write_lock);
2068                                 do {
2069                                         t_send_flags = dst->send_flags;
2070                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2071                                                                         &t_send_flags);
2072                                         if (likely(n > 0)) {
2073                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2074                                                                                                 &resp, 1);
2075                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2076                                                                         resp == CONN_ERROR))
2077                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2078                                                            unless error */
2079                                                         response[1] = resp;
2080                                         } else  if (unlikely(n < 0)) {
2081                                                 response[1] = CONN_ERROR;
2082                                                 break;
2083                                         }
2084                                         /* else do nothing for n (t_len) == 0, keep
2085                                            the last reponse */
2086                                         t_buf = rest_buf;
2087                                         t_len = rest_len;
2088                                 } while(unlikely(rest_len && n > 0));
2089                         lock_release(&c->write_lock);
2090                 } else
2091 #endif /* USE_TLS */
2092                         n = tcpconn_do_send(fd, c, buf, len, dst->send_flags,
2093                                                                         &response[1], 0);
2094                 if (unlikely(response[1] != CONN_NOP)) {
2095                         response[0]=(long)c;
2096                         if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2097                                 BUG("tcp_main command %ld sending failed (write):"
2098                                                 "%s (%d)\n", response[1], strerror(errno), errno);
2099                                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2100                                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec
2101                                    refcnt => if sending the command fails we have to
2102                                    dec. refcnt by hand */
2103                                 tcpconn_chld_put(c); /* deref. it manually */
2104                                 n=-1;
2105                         }
2106                         /* here refcnt for c is already decremented => c contents can
2107                            no longer be used and refcnt _must_ _not_ be decremented
2108                            again on exit */
2109                         if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2110                                 /* on error or eof, close fd */
2111                                 tcp_safe_close(fd);
2112                         } else if (response[1] == CONN_QUEUED_WRITE) {
2113 #ifdef TCP_FD_CACHE
2114                                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2115                                         tcp_fd_cache_add(c, fd);
2116                                 } else
2117 #endif /* TCP_FD_CACHE */
2118                                         tcp_safe_close(fd);
2119                         } else {
2120                                 BUG("unexpected tcpconn_do_send() return & response:"
2121                                                 " %d, %ld\n", n, response[1]);
2122                         }
2123                         goto end_no_deref;
2124                 }
2125 #ifdef TCP_FD_CACHE
2126                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2127                         tcp_fd_cache_add(c, fd);
2128                 }else
2129 #endif /* TCP_FD_CACHE */
2130                         tcp_safe_close(fd);
2131         /* here we can have only commands that _do_ _not_ dec refcnt.
2132            (CONN_EOF, CON_ERROR, CON_QUEUED_WRITE are all treated above) */
2133                 goto release_c;
2134         } /* if (c==0 or unusable) new connection */
2135         /* existing connection, send on it */
2136         n = tcpconn_send_put(c, buf, len, dst->send_flags);
2137         /* no deref needed (automatically done inside tcpconn_send_put() */
2138         return n;
2139 #ifdef TCP_CONNECT_WAIT
2140 conn_wait_success:
2141 #ifdef TCP_FD_CACHE
2142         if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2143                 tcp_fd_cache_add(c, fd);
2144         } else
2145 #endif /* TCP_FD_CACHE */
2146                 if (unlikely (tcp_safe_close(fd) < 0))
2147                         LOG(L_ERR, "closing temporary send fd for %p: %s: "
2148                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2149                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2150                                         fd, c->flags, strerror(errno), errno);
2151         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2152         return n;
2153 conn_wait_error:
2154         n=-1;
2155 conn_wait_close:
2156         /* connect or send failed or immediate close-after-send was requested on
2157          * newly created connection which was not yet sent to tcp_main (but was
2158          * already hashed) => don't send to main, unhash and destroy directly
2159          * (if refcnt>2 it will be destroyed when the last sender releases the
2160          * connection (tcpconn_chld_put(c))) or when tcp_main receives a
2161          * CONN_ERROR it*/
2162         c->state=S_CONN_BAD;
2163         /* we are here only if we opened a new fd (and not reused a cached or
2164            a reader one) => if the connect was successful close the fd */
2165         if (fd>=0) {
2166                 if (unlikely(tcp_safe_close(fd) < 0 ))
2167                         LOG(L_ERR, "closing temporary send fd for %p: %s: "
2168                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2169                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2170                                         fd, c->flags, strerror(errno), errno);
2171         }
2172         /* here the connection is for sure in the hash (tcp_main will not
2173            remove it because it's marked as PENDing) and the refcnt is at least
2174            2
2175          */
2176         TCPCONN_LOCK;
2177                 _tcpconn_detach(c);
2178                 c->flags&=~F_CONN_HASHED;
2179                 tcpconn_put(c);
2180         TCPCONN_UNLOCK;
2181         /* dec refcnt -> mark it for destruction */
2182         tcpconn_chld_put(c);
2183         return n;
2184 #endif /* TCP_CONNECT_WAIT */
2185 release_c:
2186         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2187 end_no_deref:
2188 end_no_conn:
2189         return n;
2190 }
2191
2192
2193
2194 /** sends on an existing tcpconn and auto-dec. con. ref counter.
2195  * As opposed to tcp_send(), this function requires an existing
2196  * tcp connection.
2197  * WARNING: the tcp_connection will be de-referenced.
2198  * @param c - existing tcp connection pointer.
2199  * @param buf - data to be sent.
2200  * @param len - data length,
2201  * @return >=0 on success, -1 on error.
2202  */
2203 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
2204                                                                 unsigned len, snd_flags_t send_flags)
2205 {
2206         struct tcp_connection *tmp;
2207         int fd;
2208         long response[2];
2209         int n;
2210         int do_close_fd;
2211 #ifdef USE_TLS
2212         const char* rest_buf;
2213         const char* t_buf;
2214         unsigned rest_len, t_len;
2215         long resp;
2216         snd_flags_t t_send_flags;
2217 #endif /* USE_TLS */
2218 #ifdef TCP_FD_CACHE
2219         struct fd_cache_entry* fd_cache_e;
2220         int use_fd_cache;
2221         
2222         use_fd_cache=cfg_get(tcp, tcp_cfg, fd_cache);
2223         fd_cache_e=0;
2224 #endif /* TCP_FD_CACHE */
2225         do_close_fd=1; /* close the fd on exit */
2226         response[1] = CONN_NOP;
2227 #ifdef TCP_ASYNC
2228         /* if data is already queued, we don't need the fd */
2229 #ifdef TCP_CONNECT_WAIT
2230                 if (unlikely(cfg_get(tcp, tcp_cfg, async) &&
2231                                                 (_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)) ))
2232 #else /* ! TCP_CONNECT_WAIT */
2233                 if (unlikely(cfg_get(tcp, tcp_cfg, async) && (_wbufq_non_empty(c)) ))
2234 #endif /* TCP_CONNECT_WAIT */
2235                 {
2236                         lock_get(&c->write_lock);
2237 #ifdef TCP_CONNECT_WAIT
2238                                 if (likely(_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)))
2239 #else /* ! TCP_CONNECT_WAIT */
2240                                 if (likely(_wbufq_non_empty(c)))
2241 #endif /* TCP_CONNECT_WAIT */
2242                                 {
2243                                         do_close_fd=0;
2244 #ifdef USE_TLS
2245                                         if (unlikely(c->type==PROTO_TLS)) {
2246                                                 t_buf = buf;
2247                                                 t_len = len;
2248                                                 do {
2249                                                         t_send_flags = send_flags;
2250                                                         n = tls_encode(c, &t_buf, &t_len,
2251                                                                                         &rest_buf, &rest_len,
2252                                                                                         &t_send_flags);
2253                                                         if (unlikely((n < 0) || (t_len &&
2254                                                                          (_wbufq_add(c, t_buf, t_len) < 0)))) {
2255                                                                 lock_release(&c->write_lock);
2256                                                                 n=-1;
2257                                                                 response[1] = CONN_ERROR;
2258                                                                 c->state=S_CONN_BAD;
2259                                                                 c->timeout=get_ticks_raw(); /* force timeout */
2260                                                                 goto error;
2261                                                         }
2262                                                         t_buf = rest_buf;
2263                                                         t_len = rest_len;
2264                                                 } while(unlikely(rest_len && n > 0));
2265                                         } else
2266 #endif /* USE_TLS */
2267                                                 if (unlikely(len && (_wbufq_add(c, buf, len)<0))){
2268                                                         lock_release(&c->write_lock);
2269                                                         n=-1;
2270                                                         response[1] = CONN_ERROR;
2271                                                         c->state=S_CONN_BAD;
2272                                                         c->timeout=get_ticks_raw(); /* force timeout */
2273                                                         goto error;
2274                                                 }
2275                                         n=len;
2276                                         lock_release(&c->write_lock);
2277                                         goto release_c;
2278                                 }
2279                         lock_release(&c->write_lock);
2280                 }
2281 #endif /* TCP_ASYNC */
2282                 /* check if this is not the same reader process holding
2283                  *  c  and if so send directly on c->fd */
2284                 if (c->reader_pid==my_pid()){
2285                         DBG("tcp_send: send from reader (%d (%d)), reusing fd\n",
2286                                         my_pid(), process_no);
2287                         fd=c->fd;
2288                         do_close_fd=0; /* don't close the fd on exit, it's in use */
2289 #ifdef TCP_FD_CACHE
2290                         use_fd_cache=0; /* don't cache: problems would arise due to the
2291                                                            close() on cache eviction (if the fd is still 
2292                                                            used). If it has to be cached then dup() _must_ 
2293                                                            be used */
2294                 }else if (likely(use_fd_cache && 
2295                                                         ((fd_cache_e=tcp_fd_cache_get(c))!=0))){
2296                         fd=fd_cache_e->fd;
2297                         do_close_fd=0;
2298                         DBG("tcp_send: found fd in cache ( %d, %p, %d)\n",
2299                                         fd, c, fd_cache_e->id);
2300 #endif /* TCP_FD_CACHE */
2301                 }else{
2302                         DBG("tcp_send: tcp connection found (%p), acquiring fd\n", c);
2303                         /* get the fd */
2304                         response[0]=(long)c;
2305                         response[1]=CONN_GET_FD;
2306                         n=send_all(unix_tcp_sock, response, sizeof(response));
2307                         if (unlikely(n<=0)){
2308                                 LOG(L_ERR, "BUG: tcp_send: failed to get fd(write):%s (%d)\n",
2309                                                 strerror(errno), errno);
2310                                 n=-1;
2311                                 goto release_c;
2312                         }
2313                         DBG("tcp_send, c= %p, n=%d\n", c, n);
2314                         n=receive_fd(unix_tcp_sock, &tmp, sizeof(tmp), &fd, MSG_WAITALL);
2315                         if (unlikely(n<=0)){
2316                                 LOG(L_ERR, "BUG: tcp_send: failed to get fd(receive_fd):"
2317                                                         " %s (%d)\n", strerror(errno), errno);
2318                                 n=-1;
2319                                 do_close_fd=0;
2320                                 goto release_c;
2321                         }
2322                         /* handle fd closed or bad connection/error
2323                                 (it's possible that this happened in the time between
2324                                 we found the intial connection and the time when we get
2325                                 the fd)
2326                          */
2327                         if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2328                                 if (unlikely(c!=tmp && tmp!=0))
2329                                         BUG("tcp_send: get_fd: got different connection:"
2330                                                 "  %p (id= %d, refcnt=%d state=%d) != "
2331                                                 "  %p (n=%d)\n",
2332                                                   c,   c->id,   atomic_get(&c->refcnt),   c->state,
2333                                                   tmp, n
2334                                                 );
2335                                 n=-1; /* fail */
2336                                 /* don't cache fd & close it */
2337                                 do_close_fd = (fd==-1)?0:1;
2338 #ifdef TCP_FD_CACHE
2339                                 use_fd_cache = 0;
2340 #endif /* TCP_FD_CACHE */
2341                                 goto end;
2342                         }
2343                         DBG("tcp_send: after receive_fd: c= %p n=%d fd=%d\n",c, n, fd);
2344                 }
2345         
2346 #ifdef USE_TLS
2347                 if (unlikely(c->type==PROTO_TLS)) {
2348                         /* for TLS the TLS processing and the send must happen
2349                            atomically w/ respect to other sends on the same connection
2350                            (otherwise reordering might occur which would break TLS) =>
2351                            lock.
2352                         */
2353                         response[1] = CONN_NOP;
2354                         t_buf = buf;
2355                         t_len = len;
2356                         lock_get(&c->write_lock);
2357                                 do {
2358                                         t_send_flags = send_flags;
2359                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2360                                                                         &t_send_flags);
2361                                         if (likely(n > 0)) {
2362                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2363                                                                                                 &resp, 1);
2364                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2365                                                                         resp == CONN_ERROR))
2366                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2367                                                            unless error */
2368                                                         response[1] = resp;
2369                                         } else if (unlikely(n < 0)) {
2370                                                 response[1] = CONN_ERROR;
2371                                                 break;
2372                                         }
2373                                         /* else do nothing for n (t_len) == 0, keep
2374                                            the last reponse */
2375                                         t_buf = rest_buf;
2376                                         t_len = rest_len;
2377                                 } while(unlikely(rest_len && n > 0));
2378                         lock_release(&c->write_lock);
2379                 } else
2380 #endif
2381                         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 0);
2382         if (unlikely(response[1] != CONN_NOP)) {
2383 error:
2384                 response[0]=(long)c;
2385                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2386                         BUG("tcp_main command %ld sending failed (write):%s (%d)\n",
2387                                         response[1], strerror(errno), errno);
2388                         /* all commands != CONN_NOP returned by tcpconn_do_send()
2389                            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2390                            => if sending the command fails we have to dec. refcnt by hand
2391                          */
2392                         tcpconn_chld_put(c); /* deref. it manually */
2393                         n=-1;
2394                 }
2395                 /* here refcnt for c is already decremented => c contents can no
2396                    longer be used and refcnt _must_ _not_ be decremented again
2397                    on exit */
2398                 if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2399                         /* on error or eof, remove from cache or close fd */
2400 #ifdef TCP_FD_CACHE
2401                         if (unlikely(fd_cache_e)){
2402                                 tcp_fd_cache_rm(fd_cache_e);
2403                                 fd_cache_e = 0;
2404                                 tcp_safe_close(fd);
2405                         }else
2406 #endif /* TCP_FD_CACHE */
2407                                 if (do_close_fd) tcp_safe_close(fd);
2408                 } else if (response[1] == CONN_QUEUED_WRITE) {
2409 #ifdef TCP_FD_CACHE
2410                         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2411                                 tcp_fd_cache_add(c, fd);
2412                         }else
2413 #endif /* TCP_FD_CACHE */
2414                                 if (do_close_fd) tcp_safe_close(fd);
2415                 } else {
2416                         BUG("unexpected tcpconn_do_send() return & response: %d, %ld\n",
2417                                         n, response[1]);
2418                 }
2419                 return n; /* no tcpconn_put */
2420         }
2421 end:
2422 #ifdef TCP_FD_CACHE
2423         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2424                 tcp_fd_cache_add(c, fd);
2425         }else
2426 #endif /* TCP_FD_CACHE */
2427         if (do_close_fd) {
2428                 if (unlikely(tcp_safe_close(fd) < 0))
2429                         LOG(L_ERR, "closing temporary send fd for %p: %s: "
2430                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2431                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2432                                         fd, c->flags, strerror(errno), errno);
2433         }
2434         /* here we can have only commands that _do_ _not_ dec refcnt.
2435            (CONN_EOF, CON_ERROR, CON_QUEUED_WRITE are all treated above) */
2436 release_c:
2437         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2438         return n;
2439 }
2440
2441
2442
2443 /* unsafe send on a known tcp connection.
2444  * Directly send on a known tcp connection with a given fd.
2445  * It is assumed that the connection locks are already held.
2446  * Side effects: if needed it will send state update commands to
2447  *  tcp_main (e.g. CON_EOF, CON_ERROR, CON_QUEUED_WRITE).
2448  * @param fd - fd used for sending.
2449  * @param c - existing tcp connection pointer (state and flags might be
2450  *            changed).
2451  * @param buf - data to be sent.
2452  * @param len - data length.
2453  * @param send_flags
2454  * @return <0 on error, number of bytes sent on success.
2455  */
2456 int tcpconn_send_unsafe(int fd, struct tcp_connection *c,
2457                                                 const char* buf, unsigned len, snd_flags_t send_flags)
2458 {
2459         int n;
2460         long response[2];
2461         
2462         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 1);
2463         if (unlikely(response[1] != CONN_NOP)) {
2464                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2465                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2466                    => increment it (we don't want the connection to be destroyed
2467                    from under us)
2468                  */
2469                 atomic_inc(&c->refcnt);
2470                 response[0]=(long)c;
2471                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2472                         BUG("connection %p command %ld sending failed (write):%s (%d)\n",
2473                                         c, response[1], strerror(errno), errno);
2474                         /* send failed => deref. it back by hand */
2475                         tcpconn_chld_put(c); 
2476                         n=-1;
2477                 }
2478                 /* here refcnt for c is already decremented => c contents can no
2479                    longer be used and refcnt _must_ _not_ be decremented again
2480                    on exit */
2481                 return n;
2482         }
2483         return n;
2484 }
2485
2486
2487
2488 /** lower level send (connection and fd should be known).
2489  * It takes care of possible write-queueing, blacklisting a.s.o.
2490  * It expects a valid tcp connection. It doesn't touch the ref. cnts.
2491  * It will also set the connection flags from send_flags (it's better
2492  * to do it here, because it's guaranteed to be under lock).
2493  * @param fd - fd used for sending.
2494  * @param c - existing tcp connection pointer (state and flags might be
2495  *            changed).
2496  * @param buf - data to be sent.
2497  * @param len - data length.
2498  * @param send_flags
2499  * @param resp - filled with a cmd. for tcp_main:
2500  *                      CONN_NOP - nothing needs to be done (do not send
2501  *                                 anything to tcp_main).
2502  *                      CONN_ERROR - error, connection should be closed.
2503  *                      CONN_EOF - no error, but connection should be closed.
2504  *                      CONN_QUEUED_WRITE - new write queue (connection
2505  *                                 should be watched for write and the wr.
2506  *                                 queue flushed).
2507  * @param locked - if set assume the connection is already locked (call from
2508  *                  tls) and do not lock/unlock the connection.
2509  * @return >=0 on success, < 0 on error && *resp == CON_ERROR.
2510  *
2511  */
2512 static int tcpconn_do_send(int fd, struct tcp_connection* c,
2513                                                         const char* buf, unsigned len,
2514                                                         snd_flags_t send_flags, long* resp,
2515                                                         int locked)
2516 {
2517         int  n;
2518 #ifdef TCP_ASYNC
2519         int enable_write_watch;
2520 #endif /* TCP_ASYNC */
2521
2522         DBG("tcp_send: sending...\n");
2523         *resp = CONN_NOP;
2524         if (likely(!locked)) lock_get(&c->write_lock);
2525         /* update connection send flags with the current ones */
2526         tcpconn_set_send_flags(c, send_flags);
2527 #ifdef TCP_ASYNC
2528         if (likely(cfg_get(tcp, tcp_cfg, async))){
2529                 if (_wbufq_non_empty(c)
2530 #ifdef TCP_CONNECT_WAIT
2531                         || (c->flags&F_CONN_PENDING) 
2532 #endif /* TCP_CONNECT_WAIT */
2533                         ){
2534                         if (unlikely(_wbufq_add(c, buf, len)<0)){
2535                                 if (likely(!locked)) lock_release(&c->write_lock);
2536                                 n=-1;
2537                                 goto error;
2538                         }
2539                         if (likely(!locked)) lock_release(&c->write_lock);
2540                         n=len;
2541                         goto end;
2542                 }
2543                 n=_tcpconn_write_nb(fd, c, buf, len);
2544         }else{
2545 #endif /* TCP_ASYNC */
2546                 /* n=tcp_blocking_write(c, fd, buf, len); */
2547                 n=tsend_stream(fd, buf, len,
2548                                                 TICKS_TO_S(cfg_get(tcp, tcp_cfg, send_timeout)) *
2549                                                 1000);
2550 #ifdef TCP_ASYNC
2551         }
2552 #else /* ! TCP_ASYNC */
2553         if (likely(!locked)) lock_release(&c->write_lock);
2554 #endif /* TCP_ASYNC */
2555         
2556         DBG("tcp_send: after real write: c= %p n=%d fd=%d\n",c, n, fd);
2557         DBG("tcp_send: buf=\n%.*s\n", (int)len, buf);
2558         if (unlikely(n<(int)len)){
2559 #ifdef TCP_ASYNC
2560                 if (cfg_get(tcp, tcp_cfg, async) &&
2561                                 ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK)){
2562                         enable_write_watch=_wbufq_empty(c);
2563                         if (n<0) n=0;
2564                         else if (unlikely(c->state==S_CONN_CONNECT ||
2565                                                 c->state==S_CONN_ACCEPT)){
2566                                 TCP_STATS_ESTABLISHED(c->state);
2567                                 c->state=S_CONN_OK; /* something was written */
2568                         }
2569                         if (unlikely(_wbufq_add(c, buf+n, len-n)<0)){
2570                                 if (likely(!locked)) lock_release(&c->write_lock);
2571                                 n=-1;
2572                                 goto error;
2573                         }
2574                         if (likely(!locked)) lock_release(&c->write_lock);
2575                         n=len;
2576                         if (likely(enable_write_watch))
2577                                 *resp=CONN_QUEUED_WRITE;
2578                         goto end;
2579                 }else{
2580                         if (likely(!locked)) lock_release(&c->write_lock);
2581                 }
2582 #endif /* TCP_ASYNC */
2583                 if (unlikely(c->state==S_CONN_CONNECT)){
2584                         switch(errno){
2585                                 case ENETUNREACH:
2586                                 case EHOSTUNREACH: /* not posix for send() */
2587 #ifdef USE_DST_BLACKLIST
2588                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2589                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2590 #endif /* USE_DST_BLACKLIST */
2591                                         TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2592                                                                         TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2593                                         break;
2594                                 case ECONNREFUSED:
2595                                 case ECONNRESET:
2596 #ifdef USE_DST_BLACKLIST
2597                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2598                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2599 #endif /* USE_DST_BLACKLIST */
2600                                         TCP_EV_CONNECT_RST(errno, TCP_LADDR(c), TCP_LPORT(c),
2601                                                                                 TCP_PSU(c), TCP_PROTO(c));
2602                                         break;
2603                                 default:
2604                                         TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c), TCP_LPORT(c),
2605                                                                                 TCP_PSU(c), TCP_PROTO(c));
2606                                 }
2607                         TCP_STATS_CONNECT_FAILED();
2608                 }else{
2609                         switch(errno){
2610                                 case ECONNREFUSED:
2611                                 case ECONNRESET:
2612                                         TCP_STATS_CON_RESET();
2613                                         /* no break */
2614                                 case ENETUNREACH:
2615                                 /*case EHOSTUNREACH: -- not posix */
2616 #ifdef USE_DST_BLACKLIST
2617                                         dst_blacklist_su(BLST_ERR_SEND, c->rcv.proto,
2618                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2619 #endif /* USE_DST_BLACKLIST */
2620                                         break;
2621                         }
2622                 }
2623                 LOG(L_ERR, "ERROR: tcp_send: failed to send on %p (%s:%d->%s): %s (%d)"
2624                                         "\n", c, ip_addr2a(&c->rcv.dst_ip), c->rcv.dst_port,
2625                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2626                                         strerror(errno), errno);
2627                 n = -1;
2628 #ifdef TCP_ASYNC
2629 error:
2630 #endif /* TCP_ASYNC */
2631                 /* error on the connection , mark it as bad and set 0 timeout */
2632                 c->state=S_CONN_BAD;
2633                 c->timeout=get_ticks_raw();
2634                 /* tell "main" it should drop this (optional it will t/o anyway?)*/
2635                 *resp=CONN_ERROR;
2636                 return n; /* error return, no tcpconn_put */
2637         }
2638         
2639 #ifdef TCP_ASYNC
2640         if (likely(!locked)) lock_release(&c->write_lock);
2641 #endif /* TCP_ASYNC */
2642         /* in non-async mode here we're either in S_CONN_OK or S_CONN_ACCEPT*/
2643         if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
2644                         TCP_STATS_ESTABLISHED(c->state);
2645                         c->state=S_CONN_OK;
2646         }
2647         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2648                 /* close after write => send EOF request to tcp_main */
2649                 c->state=S_CONN_BAD;
2650                 c->timeout=get_ticks_raw();
2651                 /* tell "main" it should drop this*/
2652                 *resp=CONN_EOF;
2653                 return n;
2654         }
2655 end:
2656         return n;
2657 }
2658
2659
2660
2661 /** low level 1st send on a new connection.
2662  * It takes care of possible write-queueing, blacklisting a.s.o.
2663  * It expects a valid just-opened tcp connection. It doesn't touch the 
2664  * ref. counters. It's used only in the async first send case.
2665  * @param fd - fd used for sending.
2666  * @param c - existing tcp connection pointer (state and flags might be
2667  *            changed). The connection must be new (no previous send on it).
2668  * @param buf - data to be sent.
2669  * @param len - data length.
2670  * @param send_flags
2671  * @param resp - filled with a fd sending cmd. for tcp_main on success. It
2672  *                      _must_ be one of the commands listed below:
2673  *                      CONN_NEW_PENDING_WRITE - new connection, first write
2674  *                                 was partially successful (or EAGAIN) and
2675  *                                 was queued (connection should be watched
2676  *                                 for write and the write queue flushed).
2677  *                                 The fd should be sent to tcp_main.
2678  *                      CONN_NEW_COMPLETE - new connection, first write
2679  *                                 completed successfully and no data is
2680  *                                 queued. The fd should be sent to tcp_main.
2681  *                      CONN_EOF - no error, but the connection should be
2682  *                                  closed (e.g. SND_F_CON_CLOSE send flag).
2683  *                      CONN_ERROR - error, _must_ return < 0.
2684  * @param locked - if set assume the connection is already locked (call from
2685  *                  tls) and do not lock/unlock the connection.
2686  * @return >=0 on success, < 0 on error (on error *resp is undefined).
2687  *
2688  */
2689 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
2690                                                         const char* buf, unsigned len,
2691                                                         snd_flags_t send_flags, long* resp,
2692                                                         int locked)
2693 {
2694         int n;
2695         
2696         n=_tcpconn_write_nb(fd, c, buf, len);
2697         if (unlikely(n<(int)len)){
2698                 if ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK){
2699                         DBG("pending write on new connection %p "
2700                                 " (%d/%d bytes written)\n", c, n, len);
2701                         if (unlikely(n<0)) n=0;
2702                         else{
2703                                 if (likely(c->state == S_CONN_CONNECT))
2704                                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2705                                 c->state=S_CONN_OK; /* partial write => connect()
2706                                                                                                 ended */
2707                         }
2708                         /* add to the write queue */
2709                         if (likely(!locked)) lock_get(&c->write_lock);
2710                                 if (unlikely(_wbufq_insert(c, buf+n, len-n)<0)){
2711                                         if (likely(!locked)) lock_release(&c->write_lock);
2712                                         n=-1;
2713                                         LOG(L_ERR, "%s: EAGAIN and"
2714                                                         " write queue full or failed for %p\n",
2715                                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
2716                                         goto error;
2717                                 }
2718                         if (likely(!locked)) lock_release(&c->write_lock);
2719                         /* send to tcp_main */
2720                         *resp=CONN_NEW_PENDING_WRITE;
2721                         n=len;
2722                         goto end;
2723                 }
2724                 /* n < 0 and not EAGAIN => write error */
2725                 /* if first write failed it's most likely a
2726                    connect error */
2727                 switch(errno){
2728                         case ENETUNREACH:
2729                         case EHOSTUNREACH:  /* not posix for send() */
2730 #ifdef USE_DST_BLACKLIST
2731                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2732                                                                         &c->rcv.src_su, &c->send_flags, 0);
2733 #endif /* USE_DST_BLACKLIST */
2734                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2735                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2736                                 break;
2737                         case ECONNREFUSED:
2738                         case ECONNRESET:
2739 #ifdef USE_DST_BLACKLIST
2740                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2741                                                                         &c->rcv.src_su, &c->send_flags, 0);
2742 #endif /* USE_DST_BLACKLIST */
2743                                 TCP_EV_CONNECT_RST(errno, TCP_LADDR(c),
2744                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2745                                 break;
2746                         default:
2747                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
2748                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2749                 }
2750                 /* error: destroy it directly */
2751                 TCP_STATS_CONNECT_FAILED();
2752                 LOG(L_ERR, "%s: connect & send  for %p failed:" " %s (%d)\n",
2753                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2754                                         c, strerror(errno), errno);
2755                 goto error;
2756         }
2757         LOG(L_INFO, "quick connect for %p\n", c);
2758         if (likely(c->state == S_CONN_CONNECT))
2759                 TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2760         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2761                 /* close after write =>  EOF => close immediately */
2762                 c->state=S_CONN_BAD;
2763                 /* tell our caller that it should drop this*/
2764                 *resp=CONN_EOF;
2765         }else{
2766                 c->state=S_CONN_OK;
2767                 /* send to tcp_main */
2768                 *resp=CONN_NEW_COMPLETE;
2769         }
2770 end:
2771         return n; /* >= 0 */
2772 error:
2773         *resp=CONN_ERROR;
2774         return -1;
2775 }
2776
2777
2778
2779 int tcp_init(struct socket_info* sock_info)
2780 {
2781         union sockaddr_union* addr;
2782         int optval;
2783 #ifdef HAVE_TCP_ACCEPT_FILTER
2784         struct accept_filter_arg afa;
2785 #endif /* HAVE_TCP_ACCEPT_FILTER */
2786 #ifdef DISABLE_NAGLE
2787         int flag;
2788         struct protoent* pe;
2789
2790         if (tcp_proto_no==-1){ /* if not already set */
2791                 pe=getprotobyname("tcp");
2792                 if (pe==0){
2793                         LOG(L_ERR, "ERROR: tcp_init: could not get TCP protocol number\n");
2794                         tcp_proto_no=-1;
2795                 }else{
2796                         tcp_proto_no=pe->p_proto;
2797                 }
2798         }
2799 #endif
2800         
2801         addr=&sock_info->su;
2802         /* sock_info->proto=PROTO_TCP; */
2803         if (init_su(addr, &sock_info->address, sock_info->port_no)<0){
2804                 LOG(L_ERR, "ERROR: tcp_init: could no init sockaddr_union\n");
2805                 goto error;
2806         }
2807         DBG("tcp_init: added %s\n", su2a(addr, sizeof(*addr)));
2808         sock_info->socket=socket(AF2PF(addr->s.sa_family), SOCK_STREAM, 0);
2809         if (sock_info->socket==-1){
2810                 LOG(L_ERR, "ERROR: tcp_init: socket: %s\n", strerror(errno));
2811                 goto error;
2812         }
2813 #ifdef DISABLE_NAGLE
2814         flag=1;
2815         if ( (tcp_proto_no!=-1) &&
2816                  (setsockopt(sock_info->socket, tcp_proto_no , TCP_NODELAY,
2817                                          &flag, sizeof(flag))<0) ){
2818                 LOG(L_ERR, "ERROR: tcp_init: could not disable Nagle: %s\n",
2819                                 strerror(errno));
2820         }
2821 #endif
2822
2823
2824 #if  !defined(TCP_DONT_REUSEADDR) 
2825         /* Stevens, "Network Programming", Section 7.5, "Generic Socket
2826      * Options": "...server started,..a child continues..on existing
2827          * connection..listening server is restarted...call to bind fails
2828          * ... ALL TCP servers should specify the SO_REUSEADDRE option 
2829          * to allow the server to be restarted in this situation
2830          *
2831          * Indeed, without this option, the server can't restart.
2832          *   -jiri
2833          */
2834         optval=1;
2835         if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEADDR,
2836                                 (void*)&optval, sizeof(optval))==-1) {
2837                 LOG(L_ERR, "ERROR: tcp_init: setsockopt %s\n",
2838                         strerror(errno));
2839                 goto error;
2840         }
2841 #endif
2842         /* tos */
2843         optval = tos;
2844         if (setsockopt(sock_info->socket, IPPROTO_IP, IP_TOS, (void*)&optval, 
2845                                 sizeof(optval)) ==-1){
2846                 LOG(L_WARN, "WARNING: tcp_init: setsockopt tos: %s\n", strerror(errno));
2847                 /* continue since this is not critical */
2848         }
2849 #ifdef HAVE_TCP_DEFER_ACCEPT
2850         /* linux only */
2851         if ((optval=cfg_get(tcp, tcp_cfg, defer_accept))){
2852                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_DEFER_ACCEPT,
2853                                         (void*)&optval, sizeof(optval)) ==-1){
2854                         LOG(L_WARN, "WARNING: tcp_init: setsockopt TCP_DEFER_ACCEPT %s\n",
2855                                                 strerror(errno));
2856                 /* continue since this is not critical */
2857                 }
2858         }
2859 #endif /* HAVE_TCP_DEFFER_ACCEPT */
2860 #ifdef HAVE_TCP_SYNCNT
2861         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
2862                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_SYNCNT, &optval,
2863                                                 sizeof(optval))<0){
2864                         LOG(L_WARN, "WARNING: tcp_init: failed to set"
2865                                                 " maximum SYN retr. count: %s\n", strerror(errno));
2866                 }
2867         }
2868 #endif
2869 #ifdef HAVE_TCP_LINGER2
2870         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
2871                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_LINGER2, &optval,
2872                                                 sizeof(optval))<0){
2873                         LOG(L_WARN, "WARNING: tcp_init: failed to set"
2874                                                 " maximum LINGER2 timeout: %s\n", strerror(errno));
2875                 }
2876         }
2877 #endif
2878         init_sock_keepalive(sock_info->socket);
2879         if (bind(sock_info->socket, &addr->s, sockaddru_len(*addr))==-1){
2880                 LOG(L_ERR, "ERROR: tcp_init: bind(%x, %p, %d) on %s:%d : %s\n",
2881                                 sock_info->socket,  &addr->s, 
2882                                 (unsigned)sockaddru_len(*addr),
2883                                 sock_info->address_str.s,
2884                                 sock_info->port_no,
2885                                 strerror(errno));
2886                 goto error;
2887         }
2888         if (listen(sock_info->socket, TCP_LISTEN_BACKLOG)==-1){
2889                 LOG(L_ERR, "ERROR: tcp_init: listen(%x, %p, %d) on %s: %s\n",
2890                                 sock_info->socket, &addr->s, 
2891                                 (unsigned)sockaddru_len(*addr),
2892                                 sock_info->address_str.s,
2893                                 strerror(errno));
2894                 goto error;
2895         }
2896 #ifdef HAVE_TCP_ACCEPT_FILTER
2897         /* freebsd */
2898         if (cfg_get(tcp, tcp_cfg, defer_accept)){
2899                 memset(&afa, 0, sizeof(afa));
2900                 strcpy(afa.af_name, "dataready");
2901                 if (setsockopt(sock_info->socket, SOL_SOCKET, SO_ACCEPTFILTER,
2902                                         (void*)&afa, sizeof(afa)) ==-1){
2903                         LOG(L_WARN, "WARNING: tcp_init: setsockopt SO_ACCEPTFILTER %s\n",
2904                                                 strerror(errno));
2905                 /* continue since this is not critical */
2906                 }
2907         }
2908 #endif /* HAVE_TCP_ACCEPT_FILTER */
2909         
2910         return 0;
2911 error:
2912         if (sock_info->socket!=-1){
2913                 tcp_safe_close(sock_info->socket);
2914                 sock_info->socket=-1;
2915         }
2916         return -1;
2917 }
2918
2919
2920
2921 /* close tcp_main's fd from a tcpconn
2922  * WARNING: call only in tcp_main context */
2923 inline static void tcpconn_close_main_fd(struct tcp_connection* tcpconn)
2924 {
2925         int fd;
2926         
2927         
2928         fd=tcpconn->s;
2929 #ifdef USE_TLS
2930         if (tcpconn->type==PROTO_TLS)
2931                 tls_close(tcpconn, fd);
2932 #endif
2933 #ifdef TCP_FD_CACHE
2934         if (likely(cfg_get(tcp, tcp_cfg, fd_cache))) shutdown(fd, SHUT_RDWR);
2935 #endif /* TCP_FD_CACHE */
2936         if (unlikely(tcp_safe_close(fd)<0))
2937                 LOG(L_ERR, "ERROR: tcpconn_close_main_fd(%p): %s "
2938                                         "close(%d) failed (flags 0x%x): %s (%d)\n", tcpconn,
2939                                         su2a(&tcpconn->rcv.src_su, sizeof(tcpconn->rcv.src_su)),
2940                                         fd, tcpconn->flags, strerror(errno), errno);
2941         tcpconn->s=-1;
2942 }
2943
2944
2945
2946 /* dec refcnt & frees the connection if refcnt==0
2947  * returns 1 if the connection is freed, 0 otherwise
2948  *
2949  * WARNING: use only from child processes */
2950 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn)
2951 {
2952         if (unlikely(atomic_dec_and_test(&tcpconn->refcnt))){
2953                 DBG("tcpconn_chld_put: destroying connection %p (%d, %d) "
2954                                 "flags %04x\n", tcpconn, tcpconn->id,
2955                                 tcpconn->s, tcpconn->flags);
2956                 /* sanity checks */
2957                 membar_read_atomic_op(); /* make sure we see the current flags */
2958                 if (unlikely(!(tcpconn->flags & F_CONN_FD_CLOSED) ||
2959                         (tcpconn->flags &
2960                                 (F_CONN_HASHED|F_CONN_MAIN_TIMER|
2961                                  F_CONN_READ_W|F_CONN_WRITE_W)) )){
2962                         LOG(L_CRIT, "BUG: tcpconn_chld_put: %p bad flags = %0x\n",
2963                                         tcpconn, tcpconn->flags);
2964                         abort();
2965                 }
2966                 _tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
2967                 return 1;
2968         }
2969         return 0;
2970 }
2971
2972
2973
2974 /* simple destroy function (the connection should be already removed
2975  * from the hashes. refcnt 0 and the fds should not be watched anymore for IO)
2976  */
2977 inline static void tcpconn_destroy(struct tcp_connection* tcpconn)
2978 {
2979                 DBG("tcpconn_destroy: destroying connection %p (%d, %d) "
2980                                 "flags %04x\n", tcpconn, tcpconn->id,
2981                                 tcpconn->s, tcpconn->flags);
2982                 if (unlikely(tcpconn->flags & F_CONN_HASHED)){
2983                         LOG(L_CRIT, "BUG: tcpconn_destroy: called with hashed"
2984                                                 " connection (%p)\n", tcpconn);
2985                         /* try to continue */
2986                         if (likely(tcpconn->flags & F_CONN_MAIN_TIMER))
2987                                 local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2988                         TCPCONN_LOCK;
2989                                 _tcpconn_detach(tcpconn);
2990                                 tcpconn->flags &= ~(F_CONN_HASHED|F_CONN_MAIN_TIMER);
2991                         TCPCONN_UNLOCK;
2992                 }
2993                 if (likely(!(tcpconn->flags & F_CONN_FD_CLOSED))){
2994                         tcpconn_close_main_fd(tcpconn);
2995                         tcpconn->flags|=F_CONN_FD_CLOSED;
2996                         (*tcp_connections_no)--;
2997                 }
2998                 _tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
2999 }
3000
3001
3002
3003 /* tries to destroy the connection: dec. refcnt and if 0 destroys the
3004  *  connection, else it will mark it as BAD and close the main fds
3005  *
3006  * returns 1 if the connection was destroyed, 0 otherwise
3007  *
3008  * WARNING: - the connection _has_ to be removed from the hash and timer
3009  *  first (use tcpconn_try_unhash() for this )
3010  *         - the fd should not be watched anymore (io_watch_del()...)
3011  *         - must be called _only_ from the tcp_main process context
3012  *          (or else the fd will remain open)
3013  */
3014 inline static int tcpconn_put_destroy(struct tcp_connection* tcpconn)
3015 {
3016         if (unlikely((tcpconn->flags &
3017                         (F_CONN_WRITE_W|F_CONN_HASHED|F_CONN_MAIN_TIMER|F_CONN_READ_W)) )){
3018                 /* sanity check */
3019                 if (unlikely(tcpconn->flags & F_CONN_HASHED)){
3020                         LOG(L_CRIT, "BUG: tcpconn_destroy: called with hashed and/or"
3021                                                 "on timer connection (%p), flags = %0x\n",
3022                                                 tcpconn, tcpconn->flags);
3023                         /* try to continue */
3024                         if (likely(tcpconn->flags & F_CONN_MAIN_TIMER))
3025                                 local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
3026                         TCPCONN_LOCK;
3027                                 _tcpconn_detach(tcpconn);
3028                                 tcpconn->flags &= ~(F_CONN_HASHED|F_CONN_MAIN_TIMER);
3029                         TCPCONN_UNLOCK;
3030                 }else{
3031                         LOG(L_CRIT, "BUG: tcpconn_put_destroy: %p flags = %0x\n",
3032                                         tcpconn, tcpconn->flags);
3033                 }
3034         }
3035         tcpconn->state=S_CONN_BAD;
3036         /* in case it's still in a reader timer */
3037         tcpconn->timeout=get_ticks_raw();
3038         /* fast close: close fds now */
3039         if (likely(!(tcpconn->flags & F_CONN_FD_CLOSED))){
3040                 tcpconn_close_main_fd(tcpconn);
3041                 tcpconn->flags|=F_CONN_FD_CLOSED;
3042                 (*tcp_connections_no)--;
3043         }
3044         /* all the flags / ops on the tcpconn must be done prior to decrementing
3045          * the refcnt. and at least a membar_write_atomic_op() mem. barrier or
3046          *  a mb_atomic_* op must * be used to make sure all the changed flags are
3047          *  written into memory prior to the new refcnt value */
3048         if (unlikely(mb_atomic_dec_and_test(&tcpconn->refcnt))){
3049                 _tcpconn_free(tcpconn);
3050                 return 1;
3051         }
3052         return 0;
3053 }
3054
3055
3056
3057 /* try to remove a connection from the hashes and timer.
3058  * returns 1 if the connection was removed, 0 if not (connection not in
3059  *  hash)
3060  *
3061  * WARNING: call it only in the  tcp_main process context or else the
3062  *  timer removal won't work.
3063  */
3064 inline static int tcpconn_try_unhash(struct tcp_connection* tcpconn)
3065 {
3066         if (likely(tcpconn->flags & F_CONN_HASHED)){
3067                 tcpconn->state=S_CONN_BAD;
3068                 if (likely(tcpconn->flags & F_CONN_MAIN_TIMER)){
3069                         local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
3070                         tcpconn->flags&=~F_CONN_MAIN_TIMER;
3071                 }else
3072                         /* in case it's still in a reader timer */
3073                         tcpconn->timeout=get_ticks_raw();
3074                 TCPCONN_LOCK;
3075                         if (tcpconn->flags & F_CONN_HASHED){
3076                                 tcpconn->flags&=~F_CONN_HASHED;
3077                                 _tcpconn_detach(tcpconn);
3078                                 TCPCONN_UNLOCK;
3079                         }else{
3080                                 /* tcp_send was faster and did unhash it itself */
3081                                 TCPCONN_UNLOCK;
3082                                 return 0;
3083                         }
3084 #ifdef TCP_ASYNC
3085                 /* empty possible write buffers (optional) */
3086                 if (unlikely(_wbufq_non_empty(tcpconn))){
3087                         lock_get(&tcpconn->write_lock);
3088                                 /* check again, while holding the lock */
3089                                 if (likely(_wbufq_non_empty(tcpconn)))
3090                                         _wbufq_destroy(&tcpconn->wbuf_q);
3091                         lock_release(&tcpconn->write_lock);
3092                 }
3093 #endif /* TCP_ASYNC */
3094                 return 1;
3095         }
3096         return 0;
3097 }
3098
3099
3100
3101 #ifdef SEND_FD_QUEUE
3102 struct send_fd_info{
3103         struct tcp_connection* tcp_conn;
3104         ticks_t expire;
3105         int unix_sock;
3106         unsigned int retries; /* debugging */
3107 };
3108
/* array-backed queue of struct send_fd_info entries */
struct tcp_send_fd_q
{
	struct send_fd_info *data; /* buffer */
	struct send_fd_info *crt;  /* pointer inside the buffer */
	struct send_fd_info *end;  /* points after the last valid position */
};


/* the queue set up by init_send_fd_queues() and released by
 * destroy_send_fd_queues() */
static struct tcp_send_fd_q send2child_q;
3117
3118
3119
3120 static int send_fd_queue_init(struct tcp_send_fd_q *q, unsigned int size)
3121 {
3122         q->data=pkg_malloc(size*sizeof(struct send_fd_info));
3123         if (q->data==0){
3124                 LOG(L_ERR, "ERROR: send_fd_queue_init: out of memory\n");
3125                 return -1;
3126         }
3127         q->crt=&q->data[0];
3128         q->end=&q->data[size];
3129         return 0;
3130 }
3131
3132 static void send_fd_queue_destroy(struct tcp_send_fd_q *q)
3133 {
3134         if (q->data){
3135                 pkg_free(q->data);
3136                 q->data=0;
3137                 q->crt=q->end=0;
3138         }
3139 }
3140
3141
3142
3143 static int init_send_fd_queues()
3144 {
3145         if (send_fd_queue_init(&send2child_q, SEND_FD_QUEUE_SIZE)!=0)
3146                 goto error;
3147         return 0;
3148 error:
3149         LOG(L_ERR, "ERROR: init_send_fd_queues: init failed\n");
3150         return -1;
3151 }
3152
3153
3154
3155 static void destroy_send_fd_queues()
3156 {
3157         send_fd_queue_destroy(&send2child_q);
3158 }
3159
3160
3161
3162
3163 inline static int send_fd_queue_add(    struct tcp_send_fd_q* q, 
3164                                                                                 int unix_sock,
3165                                                                                 struct tcp_connection *t)
3166 {
3167         struct send_fd_info* tmp;
3168         unsigned long new_size;
3169         
3170         if (q->crt>=q->end){
3171                 new_size=q->end-&q->data[0];
3172                 if (new_size< MAX_SEND_FD_QUEUE_SIZE/2){
3173                         new_size*=2;
3174                 }else new_size=MAX_SEND_FD_QUEUE_SIZE;
3175                 if (unlikely(q->crt>=&q->data[new_size])){
3176                         LOG(L_ERR, "ERROR: send_fd_queue_add: queue full: %ld/%ld\n",
3177                                         (long)(q->crt-&q->data[0]-1), new_size);
3178                         goto error;
3179                 }
3180                 LOG(L_CRIT, "INFO: send_fd_queue: queue full: %ld, extending to %ld\n",
3181                                 (long)(q->end-&q->data[0]), new_size);
3182                 tmp=pkg_realloc(q->data, new_size*sizeof(struct send_fd_info));
3183                 if (unlikely(tmp==0)){
3184                         LOG(L_ERR, "ERROR: send_fd_queue_add: out of memory\n");
3185                         goto error;
3186                 }
3187                 q->crt=(q->crt-&q->data[0])+tmp;
3188                 q->data=tmp;
3189                 q->end=&q->data[new_size];
3190         }
3191         q->crt->tcp_conn=t;
3192         q->crt->unix_sock=unix_sock;
3193         q->crt->expire=get_ticks_raw()+SEND_FD_QUEUE_TIMEOUT;
3194         q->crt->retries=0;
3195         q->crt++;
3196         return 0;
3197 error:
3198         return -1;
3199 }
3200
3201
3202
3203 inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
3204 {
3205         struct send_fd_info* p;
3206         struct send_fd_info* t;
3207         
3208         for (p=t=&q->data[0]; p<q->crt; p++){
3209                 if (unlikely(p->tcp_conn->state == S_CONN_BAD ||
3210                                          p->tcp_conn->flags & F_CONN_FD_CLOSED ||
3211                                          p->tcp_conn->s ==-1)) {
3212                         /* bad and/or already closed connection => remove */
3213                         goto rm_con;
3214                 }
3215                 if (unlikely(send_fd(p->unix_sock, &(p->tcp_conn),
3216                                         sizeof(struct tcp_connection*), p->tcp_conn->s)<=0)){
3217                         if ( ((errno==EAGAIN)||(errno==EWOULDBLOCK)) && 
3218                                                         ((s_ticks_t)(p->expire-get_ticks_raw())>0)){
3219                                 /* leave in queue for a future try */
3220                                 *t=*p;
3221                                 t->retries++;
3222                                 t++;
3223                         }else{
3224                                 LOG(L_ERR, "ERROR: run_send_fd_queue: send_fd failed"
3225                                                    " on socket %d , queue entry %ld, retries %d,"
3226                                                    " connection %p, tcp socket %d, errno=%d (%s) \n",
3227                                                    p->unix_sock, (long)(p-&q->data[0]), p->retries,
3228                        &