75feb6d505b57233b73f8e0ad49cc25515a7d86c
[kamailio] / src / core / tcp_main.c
1 /*
2  * Copyright (C) 2001-2003 FhG Fokus
3  *
4  * This file is part of Kamailio, a free SIP server.
5  *
6  * Kamailio is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version
10  *
11  * Kamailio is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19  */
20
21 /** Kamailio core: tcp main/dispatcher and tcp send functions.
22  * @file tcp_main.c
23  * @ingroup core
24  * Module: @ref core
25  */
26
27
28 #ifdef USE_TCP
29
30
31 #ifndef SHM_MEM
32 #error "shared memory support needed (add -DSHM_MEM to Makefile.defs)"
33 #endif
34
35 #define HANDLE_IO_INLINE
36 #include "io_wait.h" /* include first to make sure the needed features are
37                                                 turned on (e.g. _GNU_SOURCE for POLLRDHUP) */
38
39 #include <sys/time.h>
40 #include <sys/types.h>
41 #include <sys/select.h>
42 #include <sys/socket.h>
43 #ifdef HAVE_FILIO_H
44 #include <sys/filio.h> /* needed on solaris 2.x for FIONREAD */
45 #elif defined __OS_solaris
46 #define BSD_COMP  /* needed on older solaris for FIONREAD */
47 #endif /* HAVE_FILIO_H / __OS_solaris */
48 #include <sys/ioctl.h>  /* ioctl() used on write error */
49 #include <netinet/in.h>
50 #include <netinet/in_systm.h>
51 #include <netinet/ip.h>
52 #include <netinet/tcp.h>
53 #include <sys/uio.h>  /* writev*/
54 #include <netdb.h>
55 #include <stdlib.h> /*exit() */
56
57 #include <unistd.h>
58
59 #include <errno.h>
60 #include <string.h>
61
62 #ifdef HAVE_SELECT
63 #include <sys/select.h>
64 #endif
65 #include <poll.h>
66
67
68 #include "ip_addr.h"
69 #include "pass_fd.h"
70 #include "tcp_conn.h"
71 #include "globals.h"
72 #include "pt.h"
73 #include "locking.h"
74 #include "mem/mem.h"
75 #include "mem/shm_mem.h"
76 #include "timer.h"
77 #include "sr_module.h"
78 #include "tcp_server.h"
79 #include "tcp_init.h"
80 #include "tcp_int_send.h"
81 #include "tcp_stats.h"
82 #include "tcp_ev.h"
83 #include "tsend.h"
84 #include "timer_ticks.h"
85 #include "local_timer.h"
86 #ifdef CORE_TLS
87 #include "tls/tls_server.h"
88 #define tls_loaded() 1
89 #else
90 #include "tls_hooks_init.h"
91 #include "tls_hooks.h"
92 #endif /* CORE_TLS*/
93 #ifdef USE_DST_BLACKLIST
94 #include "dst_blacklist.h"
95 #endif /* USE_DST_BLACKLIST */
96
97 #include "tcp_info.h"
98 #include "tcp_options.h"
99 #include "ut.h"
100 #include "cfg/cfg_struct.h"
101
102 #define local_malloc pkg_malloc
103 #define local_free   pkg_free
104
105 #include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
106
107
/* Fallback for platforms built with NO_MSG_DONTWAIT that do not provide
 * MSG_DONTWAIT: define the flag as 0 (a no-op). */
#ifdef NO_MSG_DONTWAIT
#ifndef MSG_DONTWAIT
/* should work inside tcp_main */
#define MSG_DONTWAIT 0
#endif
#endif /*NO_MSG_DONTWAIT */


/* don't pass a new connection immediately to a child, wait for some data
 * on it first */
#define TCP_PASS_NEW_CONNECTION_ON_DATA
#define TCP_LISTEN_BACKLOG 1024
/* queue send fd requests on EAGAIN, instead of sending them immediately */
#define SEND_FD_QUEUE
#define TCP_CHILD_NON_BLOCKING 
/* SEND_FD_QUEUE forces TCP_CHILD_NON_BLOCKING on if it is not already
 * defined */
#ifdef SEND_FD_QUEUE
#ifndef TCP_CHILD_NON_BLOCKING
#define TCP_CHILD_NON_BLOCKING
#endif
#define MAX_SEND_FD_QUEUE_SIZE  tcp_main_max_fd_no
#define SEND_FD_QUEUE_SIZE              128  /* initial size */
#define SEND_FD_QUEUE_TIMEOUT   MS_TO_TICKS(2000)  /* 2 s */
#endif

/* minimum interval local_timer_run() is allowed to run, in ticks */
#define TCPCONN_TIMEOUT_MIN_RUN 1  /* once per tick */
#define TCPCONN_WAIT_TIMEOUT 1 /* 1 tick */
135
#ifdef TCP_ASYNC
/* pointer to the running total of bytes queued in write buffer queues;
 * it is compared against the tcp_wq_max config value and updated with
 * atomic_add_int() by the _wbufq_*() functions in this file */
static unsigned int* tcp_total_wq=0;
#endif


/* fd type tags used in this file */
enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
				F_TCPCONN, F_TCPCHILD, F_PROC };


#ifdef TCP_FD_CACHE

#define TCP_FD_CACHE_SIZE 8

/* one fd cache slot: a connection pointer together with an id and an fd
 * (NOTE(review): presumably the connection id and its local fd -- confirm
 * against the cache lookup code) */
struct fd_cache_entry{
	struct tcp_connection* con;
	int id;
	int fd;
};


static struct fd_cache_entry fd_cache[TCP_FD_CACHE_SIZE];
#endif /* TCP_FD_CACHE */

static int is_tcp_main=0;


enum poll_types tcp_poll_method=0; /* by default choose the best method */
int tcp_main_max_fd_no=0;
int tcp_max_connections=DEFAULT_TCP_MAX_CONNECTIONS;
int tls_max_connections=DEFAULT_TLS_MAX_CONNECTIONS;

/* source addresses stored by tcp_set_src_addr(); the pointers stay 0 until
 * an address of the corresponding family has been set */
static union sockaddr_union tcp_source_ipv4_addr; /* saved bind/src v4 addr. */
static union sockaddr_union* tcp_source_ipv4=0;
static union sockaddr_union tcp_source_ipv6_addr; /* saved bind/src v6 addr. */
static union sockaddr_union* tcp_source_ipv6=0;

static int* tcp_connections_no=0; /* current tcp (+tls) open connections */
static int* tls_connections_no=0; /* current tls open connections */

/* connection hash table (keyed by ip & port), also includes aliases */
struct tcp_conn_alias** tcpconn_aliases_hash=0;
/* connection hash table (keyed by connection id) */
struct tcp_connection** tcpconn_id_hash=0;
gen_lock_t* tcpconn_lock=0;

struct tcp_child* tcp_children=0;
/* unique for each connection, used for quickly finding the corresponding
 * connection for a reply */
static int* connection_id=0;
int unix_tcp_sock;

/* tcp protocol number as returned by getprotobyname; -1 means not set
 * (init_sock_opt() skips TCP_NODELAY in that case) */
static int tcp_proto_no=-1;

static io_wait_h io_h;

static struct local_timer tcp_main_ltimer;
static ticks_t tcp_main_prev_ticks;

/* tell if there are tcp workers that should handle only specific socket
 * - used to optimize the search of least loaded worker for a tcp socket
 * - 0 - no workers per tcp sockets have been set
 * - 1 + generic_workers - when there are workers per tcp sockets
 */
static int tcp_sockets_gworkers = 0;

/* forward declarations (defined later in this file) */
static ticks_t tcpconn_main_timeout(ticks_t , struct timer_ln* , void* );

inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
										struct ip_addr* l_ip, int l_port,
										int flags);
207
208
209
210 /* sets source address used when opening new sockets and no source is specified
211  *  (by default the address is choosen by the kernel)
212  * Should be used only on init.
213  * returns -1 on error */
214 int tcp_set_src_addr(struct ip_addr* ip)
215 {
216         switch (ip->af){
217                 case AF_INET:
218                         ip_addr2su(&tcp_source_ipv4_addr, ip, 0);
219                         tcp_source_ipv4=&tcp_source_ipv4_addr;
220                         break;
221                 case AF_INET6:
222                         ip_addr2su(&tcp_source_ipv6_addr, ip, 0);
223                         tcp_source_ipv6=&tcp_source_ipv6_addr;
224                         break;
225                 default:
226                         return -1;
227         }
228         return 0;
229 }
230
231
232
/* Applies the tcp keepalive configuration (keepalive, keepintvl, keepidle,
 * keepcnt) to socket s. Each option is compiled in only if the
 * corresponding HAVE_* macro is defined and is applied only if its config
 * value is non-zero.
 * returns -1 if enabling SO_KEEPALIVE fails (the remaining options are then
 * skipped), 0 otherwise (failures of the other options are only logged) */
static inline int init_sock_keepalive(int s)
{
	int optval;
	
#ifdef HAVE_SO_KEEPALIVE
	optval=cfg_get(tcp, tcp_cfg, keepalive)?1:0;
	if (optval && (setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &optval,
								sizeof(optval))<0)){
		LM_WARN("failed to enable SO_KEEPALIVE: %s\n", strerror(errno));
		return -1;
	}
#endif
#ifdef HAVE_TCP_KEEPINTVL
	optval=cfg_get(tcp, tcp_cfg, keepintvl);
	if (optval && (setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &optval,
								sizeof(optval))<0)){
		LM_WARN("failed to set keepalive probes interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPIDLE
	optval=cfg_get(tcp, tcp_cfg, keepidle);
	if (optval && (setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &optval,
								sizeof(optval))<0)){
		LM_WARN("failed to set keepalive idle interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPCNT
	optval=cfg_get(tcp, tcp_cfg, keepcnt);
	if (optval && (setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &optval,
								sizeof(optval))<0)){
		LM_WARN("failed to set maximum keepalive count: %s\n", strerror(errno));
	}
#endif
	return 0;
}
273
274
275
276 /* set all socket/fd options for new sockets (e.g. before connect): 
277  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
278  *
279  * return -1 on error */
280 static int init_sock_opt(int s, int af)
281 {
282         int flags;
283         int optval;
284         
285 #ifdef DISABLE_NAGLE
286         flags=1;
287         if ( (tcp_proto_no!=-1) && (setsockopt(s, tcp_proto_no , TCP_NODELAY,
288                                         &flags, sizeof(flags))<0) ){
289                 LM_WARN("could not disable Nagle: %s\n", strerror(errno));
290         }
291 #endif
292         /* tos*/
293         optval = tos;
294         if(af==AF_INET){
295                 if (setsockopt(s, IPPROTO_IP, IP_TOS, (void*)&optval,
296                                         sizeof(optval)) ==-1){
297                         LM_WARN("setsockopt tos: %s\n", strerror(errno));
298                         /* continue since this is not critical */
299                 }
300         } else if(af==AF_INET6){
301                 if (setsockopt(s, IPPROTO_IPV6, IPV6_TCLASS,
302                                         (void*)&optval, sizeof(optval)) ==-1) {
303                         LM_WARN("setsockopt v6 tos: %s\n", strerror(errno));
304                         /* continue since this is not critical */
305                 }
306         }
307
308 #if  !defined(TCP_DONT_REUSEADDR) 
309         optval=1;
310         if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
311                                                 (void*)&optval, sizeof(optval))==-1){
312                 LM_ERR("setsockopt SO_REUSEADDR %s\n", strerror(errno));
313                 /* continue, not critical */
314         }
315 #endif /* !TCP_DONT_REUSEADDR */
316
317 #ifdef SO_REUSEPORT
318         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
319                 if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT,
320                                 (void*)&optval, sizeof(optval))==-1) {
321                         LM_ERR("setsockopt %s\n", strerror(errno));
322                 }
323         }
324 #endif
325
326 #ifdef HAVE_TCP_SYNCNT
327         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
328                 if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
329                                                 sizeof(optval))<0){
330                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
331                 }
332         }
333 #endif
334 #ifdef HAVE_TCP_LINGER2
335         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
336                 if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
337                                                 sizeof(optval))<0){
338                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
339                 }
340         }
341 #endif
342 #ifdef HAVE_TCP_QUICKACK
343         if (cfg_get(tcp, tcp_cfg, delayed_ack)){
344                 optval=0; /* reset quick ack => delayed ack */
345                 if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
346                                                 sizeof(optval))<0){
347                         LM_WARN("failed to reset TCP_QUICKACK: %s\n", strerror(errno));
348                 }
349         }
350 #endif /* HAVE_TCP_QUICKACK */
351         init_sock_keepalive(s);
352         
353         /* non-blocking */
354         flags=fcntl(s, F_GETFL);
355         if (flags==-1){
356                 LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
357                 goto error;
358         }
359         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
360                 LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
361                 goto error;
362         }
363         return 0;
364 error:
365         return -1;
366 }
367
368
369
/* set all socket/fd options for "accepted" sockets 
 *  only nonblocking is set since the rest is inherited from the
 *  "parent" (listening) socket
 *  Note: setting O_NONBLOCK is required on linux but it's not needed on
 *        BSD and possibly solaris (where the flag is inherited from the 
 *        parent socket). However since there is no standard document 
 *        requiring a specific behaviour in this case it's safer to always set
 *        it (at least for now)  --andrei
 *  TODO: check on which OSes  O_NONBLOCK is inherited and make this 
 *        function a nop.
 *
 * s - accepted socket
 *
 * return 0 on success, -1 on error (the failing fcntl() call is logged) */
static int init_sock_opt_accept(int s)
{
	int flags;
	
	/* non-blocking: read the current flags and add O_NONBLOCK to them */
	flags=fcntl(s, F_GETFL);
	if (flags==-1){
		LM_ERR("fcntl failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
		LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	return 0;
}
400
401
402
/** close a socket, handling errno.
 * On EINTR, repeat the close().
 * Whether the descriptor is still open after close() fails with EINTR is
 * system dependent: some systems (e.g. linux) always release it, so the
 * repeated close() fails with EBADF. An EBADF that follows an EINTR is
 * therefore treated as success (the first attempt already closed the fd).
 * Note: the retry assumes nothing else in the process can reuse the fd
 * number between the two close() calls (e.g. no other thread opening fds).
 * Filter expected errors (return success if close() failed because
 * EPIPE, ECONNRESET a.s.o). Note that this happens on *BSDs (on linux close()
 * does not fail for socket level errors).
 * @param s - open valid socket.
 * @return - 0 on success, < 0 on error (whatever close() returns). On error
 *           errno is set.
 */
static int tcp_safe_close(int s)
{
	int ret;
	int interrupted; /* set once a close() attempt failed with EINTR */

	interrupted = 0;
	while ((ret = close(s)) < 0) {
		switch(errno) {
			case EINTR:
				interrupted = 1;
				continue; /* retry the close() */
			case EBADF:
				/* after an interrupted close() this means the fd was
				 * already released by the interrupted attempt */
				if (interrupted)
					ret = 0;
				break;
			case EPIPE:
			case ENOTCONN:
			case ECONNRESET:
			case ECONNREFUSED:
			case ENETUNREACH:
			case EHOSTUNREACH:
				/* on *BSD we really get these errors at close() time 
				   => ignore them */
				ret = 0;
				break;
			default:
				break;
		}
		break; /* only EINTR loops */
	}
	return ret;
}
436
437
438
439 /* blocking connect on a non-blocking fd; it will timeout after
440  * tcp_connect_timeout 
441  * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
442  * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
443  */
444 static int tcp_blocking_connect(int fd, int type, snd_flags_t* send_flags,
445                                                                 const struct sockaddr *servaddr,
446                                                                 socklen_t addrlen)
447 {
448         int n;
449 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
450         fd_set sel_set;
451         fd_set orig_set;
452         struct timeval timeout;
453 #else
454         struct pollfd pf;
455 #endif
456         int elapsed;
457         int to;
458         int ticks;
459         int err;
460         unsigned int err_len;
461         int poll_err;
462         
463         poll_err=0;
464         to=cfg_get(tcp, tcp_cfg, connect_timeout_s);
465         ticks=get_ticks();
466 again:
467         n=connect(fd, servaddr, addrlen);
468         if (n==-1){
469                 if (errno==EINTR){
470                         elapsed=(get_ticks()-ticks)*TIMER_TICK;
471                         if (elapsed<to)         goto again;
472                         else goto error_timeout;
473                 }
474                 if (errno!=EINPROGRESS && errno!=EALREADY){
475                         goto error_errno;
476                 }
477         }else goto end;
478         
479         /* poll/select loop */
480 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
481                 FD_ZERO(&orig_set);
482                 FD_SET(fd, &orig_set);
483 #else
484                 pf.fd=fd;
485                 pf.events=POLLOUT;
486 #endif
487         while(1){
488                 elapsed=(get_ticks()-ticks)*TIMER_TICK;
489                 if (elapsed>=to)
490                         goto error_timeout;
491 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
492                 sel_set=orig_set;
493                 timeout.tv_sec=to-elapsed;
494                 timeout.tv_usec=0;
495                 n=select(fd+1, 0, &sel_set, 0, &timeout);
496 #else
497                 n=poll(&pf, 1, (to-elapsed)*1000);
498 #endif
499                 if (n<0){
500                         if (errno==EINTR) continue;
501                         LM_ERR("%s: poll/select failed: (%d) %s\n",
502                                         su2a((union sockaddr_union*)servaddr, addrlen),
503                                         errno, strerror(errno));
504                         goto error;
505                 }else if (n==0) /* timeout */ continue;
506 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
507                 if (FD_ISSET(fd, &sel_set))
508 #else
509                 if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
510                         LM_ERR("%s: poll error: flags %x\n",
511                                         su2a((union sockaddr_union*)servaddr, addrlen),
512                                         pf.revents);
513                         poll_err=1;
514                 }
515 #endif
516                 {
517                         err_len=sizeof(err);
518                         getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
519                         if ((err==0) && (poll_err==0)) goto end;
520                         if (err!=EINPROGRESS && err!=EALREADY){
521                                 LM_ERR("%s: SO_ERROR (%d) %s\n",
522                                                 su2a((union sockaddr_union*)servaddr, addrlen),
523                                                 err, strerror(err));
524                                 errno=err;
525                                 goto error_errno;
526                         }
527                 }
528         }
529 error_errno:
530         switch(errno){
531                 case ENETUNREACH:
532                 case EHOSTUNREACH:
533 #ifdef USE_DST_BLACKLIST
534                         dst_blacklist_su(BLST_ERR_CONNECT, type,
535                                                          (union sockaddr_union*)servaddr, send_flags, 0);
536 #endif /* USE_DST_BLACKLIST */
537                         TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0,
538                                                         (union sockaddr_union*)servaddr, type);
539                         break;
540                 case ETIMEDOUT:
541 #ifdef USE_DST_BLACKLIST
542                         dst_blacklist_su(BLST_ERR_CONNECT, type,
543                                                          (union sockaddr_union*)servaddr, send_flags, 0);
544 #endif /* USE_DST_BLACKLIST */
545                         TCP_EV_CONNECT_TIMEOUT(errno, 0, 0,
546                                                         (union sockaddr_union*)servaddr, type);
547                         break;
548                 case ECONNREFUSED:
549                 case ECONNRESET:
550 #ifdef USE_DST_BLACKLIST
551                         dst_blacklist_su(BLST_ERR_CONNECT, type,
552                                                          (union sockaddr_union*)servaddr, send_flags, 0);
553 #endif /* USE_DST_BLACKLIST */
554                         TCP_EV_CONNECT_RST(errno, 0, 0,
555                                                         (union sockaddr_union*)servaddr, type);
556                         break;
557                 case EAGAIN: /* not posix, but supported on linux and bsd */
558                         TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0,
559                                                         (union sockaddr_union*)servaddr, type);
560                         break;
561                 default:
562                         TCP_EV_CONNECT_ERR(errno, 0, 0,
563                                                                 (union sockaddr_union*)servaddr, type);
564         }
565         LM_ERR("%s: (%d) %s\n",
566                         su2a((union sockaddr_union*)servaddr, addrlen),
567                         errno, strerror(errno));
568         goto error;
569 error_timeout:
570         /* timeout */
571 #ifdef USE_DST_BLACKLIST
572         dst_blacklist_su(BLST_ERR_CONNECT, type,
573                                                 (union sockaddr_union*)servaddr, send_flags, 0);
574 #endif /* USE_DST_BLACKLIST */
575         TCP_EV_CONNECT_TIMEOUT(0, 0, 0, (union sockaddr_union*)servaddr, type);
576         LM_ERR("%s: timeout %d s elapsed from %d s\n",
577                                 su2a((union sockaddr_union*)servaddr, addrlen),
578                                 elapsed, cfg_get(tcp, tcp_cfg, connect_timeout_s));
579 error:
580         TCP_STATS_CONNECT_FAILED();
581         return -1;
582 end:
583         return 0;
584 }
585
586
587
588 #ifdef TCP_ASYNC
589
590
/* true if the write buffer queue of con has no buffer (wbuf_q.first is 0);
 * unsafe version (the macro itself does no locking) */
#define _wbufq_empty(con) ((con)->wbuf_q.first==0)
/* true if the write buffer queue of con has at least one buffer;
 * unsafe version (the macro itself does no locking) */
#define _wbufq_non_empty(con) ((con)->wbuf_q.first!=0)
595
596
597 /* unsafe version, call while holding the connection write lock */
598 inline static int _wbufq_add(struct  tcp_connection* c, const char* data, 
599                                                         unsigned int size)
600 {
601         struct tcp_wbuffer_queue* q;
602         struct tcp_wbuffer* wb;
603         unsigned int last_free;
604         unsigned int wb_size;
605         unsigned int crt_size;
606         ticks_t t;
607         
608         q=&c->wbuf_q;
609         t=get_ticks_raw();
610         if (unlikely(   ((q->queued+size)>cfg_get(tcp, tcp_cfg, tcpconn_wq_max)) ||
611                                         ((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max)) ||
612                                         (q->first &&
613                                         TICKS_LT(q->wr_timeout, t)) )){
614                 LM_ERR("(%d bytes): write queue full or timeout "
615                                         " (%d, total %d, last write %d s ago)\n",
616                                         size, q->queued, *tcp_total_wq,
617                                         TICKS_TO_S(t-(q->wr_timeout-
618                                                                 cfg_get(tcp, tcp_cfg, send_timeout))));
619                 if (q->first && TICKS_LT(q->wr_timeout, t)){
620                         if (unlikely(c->state==S_CONN_CONNECT)){
621 #ifdef USE_DST_BLACKLIST
622                                 (void)dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
623                                                                                 &c->rcv.src_su, &c->send_flags, 0);
624 #endif /* USE_DST_BLACKLIST */
625                                 TCP_EV_CONNECT_TIMEOUT(0, TCP_LADDR(c), TCP_LPORT(c),
626                                                                                         TCP_PSU(c), TCP_PROTO(c));
627                                 TCP_STATS_CONNECT_FAILED();
628                         }else{
629 #ifdef USE_DST_BLACKLIST
630                                 (void)dst_blacklist_su( BLST_ERR_SEND, c->rcv.proto,
631                                                                         &c->rcv.src_su, &c->send_flags, 0);
632 #endif /* USE_DST_BLACKLIST */
633                                 TCP_EV_SEND_TIMEOUT(0, &c->rcv);
634                                 TCP_STATS_SEND_TIMEOUT();
635                         }
636                 }else{
637                         /* if it's not a timeout => queue full */
638                         TCP_EV_SENDQ_FULL(0, &c->rcv);
639                         TCP_STATS_SENDQ_FULL();
640                 }
641                 goto error;
642         }
643         
644         if (unlikely(q->last==0)){
645                 wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
646                 wb=shm_malloc(sizeof(*wb)+wb_size-1);
647                 if (unlikely(wb==0))
648                         goto error;
649                 wb->b_size=wb_size;
650                 wb->next=0;
651                 q->last=wb;
652                 q->first=wb;
653                 q->last_used=0;
654                 q->offset=0;
655                 q->wr_timeout=get_ticks_raw()+
656                         ((c->state==S_CONN_CONNECT)?
657                                         S_TO_TICKS(cfg_get(tcp, tcp_cfg, connect_timeout_s)):
658                                         cfg_get(tcp, tcp_cfg, send_timeout));
659         }else{
660                 wb=q->last;
661         }
662         
663         while(size){
664                 last_free=wb->b_size-q->last_used;
665                 if (last_free==0){
666                         wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
667                         wb=shm_malloc(sizeof(*wb)+wb_size-1);
668                         if (unlikely(wb==0))
669                                 goto error;
670                         wb->b_size=wb_size;
671                         wb->next=0;
672                         q->last->next=wb;
673                         q->last=wb;
674                         q->last_used=0;
675                         last_free=wb->b_size;
676                 }
677                 crt_size=MIN_unsigned(last_free, size);
678                 memcpy(wb->buf+q->last_used, data, crt_size);
679                 q->last_used+=crt_size;
680                 size-=crt_size;
681                 data+=crt_size;
682                 q->queued+=crt_size;
683                 atomic_add_int((int*)tcp_total_wq, crt_size);
684         }
685         return 0;
686 error:
687         return -1;
688 }
689
690
691
692 /* unsafe version, call while holding the connection write lock
693  * inserts data at the beginning, it ignores the max queue size checks and
694  * the timeout (use sparingly)
695  * Note: it should never be called on a write buffer after wbufq_run() */
696 inline static int _wbufq_insert(struct  tcp_connection* c, const char* data, 
697                                                         unsigned int size)
698 {
699         struct tcp_wbuffer_queue* q;
700         struct tcp_wbuffer* wb;
701         
702         q=&c->wbuf_q;
703         if (likely(q->first==0)) /* if empty, use wbufq_add */
704                 return _wbufq_add(c, data, size);
705         
706         if (unlikely((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max))){
707                 LM_ERR("(%d bytes): write queue full"
708                                         " (%d, total %d, last write %d s ago)\n",
709                                         size, q->queued, *tcp_total_wq,
710                                         TICKS_TO_S(get_ticks_raw()-q->wr_timeout-
711                                                                         cfg_get(tcp, tcp_cfg, send_timeout)));
712                 goto error;
713         }
714         if (unlikely(q->offset)){
715                 LM_CRIT("non-null offset %d (bad call, should"
716                                 "never be called after the wbufq_run())\n", q->offset);
717                 goto error;
718         }
719         if ((q->first==q->last) && ((q->last->b_size-q->last_used)>=size)){
720                 /* one block with enough space in it for size bytes */
721                 memmove(q->first->buf+size, q->first->buf, q->last_used);
722                 memcpy(q->first->buf, data, size);
723                 q->last_used+=size;
724         }else{
725                 /* create a size bytes block directly */
726                 wb=shm_malloc(sizeof(*wb)+size-1);
727                 if (unlikely(wb==0))
728                         goto error;
729                 wb->b_size=size;
730                 /* insert it */
731                 wb->next=q->first;
732                 q->first=wb;
733                 memcpy(wb->buf, data, size);
734         }
735         
736         q->queued+=size;
737         atomic_add_int((int*)tcp_total_wq, size);
738         return 0;
739 error:
740         return -1;
741 }
742
743
744
745 /* unsafe version, call while holding the connection write lock */
746 inline static void _wbufq_destroy( struct  tcp_wbuffer_queue* q)
747 {
748         struct tcp_wbuffer* wb;
749         struct tcp_wbuffer* next_wb;
750         int unqueued;
751         
752         unqueued=0;
753         if (likely(q->first)){
754                 wb=q->first;
755                 do{
756                         next_wb=wb->next;
757                         unqueued+=(wb==q->last)?q->last_used:wb->b_size;
758                         if (wb==q->first)
759                                 unqueued-=q->offset;
760                         shm_free(wb);
761                         wb=next_wb;
762                 }while(wb);
763         }
764         memset(q, 0, sizeof(*q));
765         atomic_add_int((int*)tcp_total_wq, -unqueued);
766 }
767
768
769
770 /* tries to empty the queue  (safe version, c->write_lock must not be hold)
771  * returns -1 on error, bytes written on success (>=0) 
772  * if the whole queue is emptied => sets *empty*/
773 inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
774 {
775         struct tcp_wbuffer_queue* q;
776         struct tcp_wbuffer* wb;
777         int n;
778         int ret;
779         int block_size;
780         char* buf;
781         
782         *empty=0;
783         ret=0;
784         lock_get(&c->write_lock);
785         q=&c->wbuf_q;
786         while(q->first){
787                 block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
788                                                 q->offset;
789                 buf=q->first->buf+q->offset;
790                 n=_tcpconn_write_nb(fd, c, buf, block_size);
791                 if (likely(n>0)){
792                         ret+=n;
793                         if (likely(n==block_size)){
794                                 wb=q->first;
795                                 q->first=q->first->next; 
796                                 shm_free(wb);
797                                 q->offset=0;
798                                 q->queued-=block_size;
799                                 atomic_add_int((int*)tcp_total_wq, -block_size);
800                         }else{
801                                 q->offset+=n;
802                                 q->queued-=n;
803                                 atomic_add_int((int*)tcp_total_wq, -n);
804                                 break;
805                         }
806                 }else{
807                         if (n<0){
808                                 /* EINTR is handled inside _tcpconn_write_nb */
809                                 if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
810                                         if (unlikely(c->state==S_CONN_CONNECT)){
811                                                 switch(errno){
812                                                         case ENETUNREACH:
813                                                         case EHOSTUNREACH: /* not posix for send() */
814 #ifdef USE_DST_BLACKLIST
815                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
816                                                                                                         c->rcv.proto,
817                                                                                                         &c->rcv.src_su,
818                                                                                                         &c->send_flags, 0);
819 #endif /* USE_DST_BLACKLIST */
820                                                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
821                                                                                                         TCP_LPORT(c), TCP_PSU(c),
822                                                                                                         TCP_PROTO(c));
823                                                                 break;
824                                                         case ECONNREFUSED:
825                                                         case ECONNRESET:
826 #ifdef USE_DST_BLACKLIST
827                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
828                                                                                                         c->rcv.proto,
829                                                                                                         &c->rcv.src_su,
830                                                                                                         &c->send_flags, 0);
831 #endif /* USE_DST_BLACKLIST */
832                                                                 TCP_EV_CONNECT_RST(0, TCP_LADDR(c),
833                                                                                                         TCP_LPORT(c), TCP_PSU(c),
834                                                                                                         TCP_PROTO(c));
835                                                                 break;
836                                                         default:
837                                                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
838                                                                                                         TCP_LPORT(c), TCP_PSU(c),
839                                                                                                         TCP_PROTO(c));
840                                                 }
841                                                 TCP_STATS_CONNECT_FAILED();
842                                         }else{
843                                                 switch(errno){
844                                                         case ECONNREFUSED:
845                                                         case ECONNRESET:
846                                                                 TCP_STATS_CON_RESET();
847                                                                 /* no break */
848                                                         case ENETUNREACH:
849                                                         case EHOSTUNREACH: /* not posix for send() */
850 #ifdef USE_DST_BLACKLIST
851                                                                 dst_blacklist_su(BLST_ERR_SEND,
852                                                                                                         c->rcv.proto,
853                                                                                                         &c->rcv.src_su,
854                                                                                                         &c->send_flags, 0);
855 #endif /* USE_DST_BLACKLIST */
856                                                                 break;
857                                                 }
858                                         }
859                                         ret=-1;
860                                         LM_ERR("%s [%d]\n", strerror(errno), errno);
861                                 }
862                         }
863                         break;
864                 }
865         }
866         if (likely(q->first==0)){
867                 q->last=0;
868                 q->last_used=0;
869                 q->offset=0;
870                 *empty=1;
871         }
872         lock_release(&c->write_lock);
873         if (likely(ret>0)){
874                 q->wr_timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, send_timeout);
875                 if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
876                         TCP_STATS_ESTABLISHED(c->state);
877                         c->state=S_CONN_OK;
878                 }
879         }
880         return ret;
881 }
882
883 #endif /* TCP_ASYNC */
884
885
886
#if 0
/* Blocking write, usable even on non-blocking sockets: keeps sending until
 * all len bytes of buf are written, waiting with select() for the socket to
 * become writable in between.
 * Returns the initial length on success and -1 on error (send or select
 * failure, or no progress within tcp_send_timeout). */
static int tcp_blocking_write(struct tcp_connection* c, int fd, char* buf,
								unsigned int len)
{
	struct timeval tv;
	fd_set wr_set;
	int total_len;
	int start_ticks;
	int send_flags;
	int rc;

#ifdef HAVE_MSG_NOSIGNAL
	send_flags=MSG_NOSIGNAL;
#else
	send_flags=0;
#endif
	total_len=len;
	for(;;){
		rc=send(fd, buf, len, send_flags);
		if (rc<0){
			if (errno==EINTR)
				continue; /* interrupted, retry the send right away */
			if (errno!=EAGAIN && errno!=EWOULDBLOCK){
				LM_ERR("failed to send: (%d) %s\n", errno, strerror(errno));
				TCP_EV_SEND_TIMEOUT(errno, &c->rcv);
				TCP_STATS_SEND_TIMEOUT();
				return -1;
			}
			/* would block => wait below for the socket to be writable */
		}else if (rc<len){
			/* partial write: skip over what was sent and wait for more room */
			buf+=rc;
			len-=rc;
		}else{
			/* success: full write */
			return total_len;
		}
		/* wait until the socket is writable again or the timeout expires */
		for(;;){
			FD_ZERO(&wr_set);
			FD_SET(fd, &wr_set);
			tv.tv_sec=tcp_send_timeout;
			tv.tv_usec=0;
			start_ticks=get_ticks();
			rc=select(fd+1, 0, &wr_set, 0, &tv);
			if (rc<0){
				if (errno==EINTR)
					continue; /* signal, ignore */
				LM_ERR("select failed: (%d) %s\n", errno, strerror(errno));
				return -1;
			}
			if (rc==0){
				/* select timed out: give up only if enough ticks elapsed */
				if (get_ticks()-start_ticks>=tcp_send_timeout){
					LM_ERR("send timeout (%d)\n", tcp_send_timeout);
					return -1;
				}
				continue;
			}
			if (FD_ISSET(fd, &wr_set))
				break; /* we can write again */
		}
	}
}
#endif
955
956
957
958 struct tcp_connection* tcpconn_new(int sock, union sockaddr_union* su,
959                                                                         union sockaddr_union* local_addr,
960                                                                         struct socket_info* ba, int type, 
961                                                                         int state)
962 {
963         struct tcp_connection *c;
964         int rd_b_size;
965         
966         rd_b_size=cfg_get(tcp, tcp_cfg, rd_buf_size);
967         c=shm_malloc(sizeof(struct tcp_connection) + rd_b_size);
968         if (c==0){
969                 LM_ERR("mem. allocation failure\n");
970                 goto error;
971         }
972         memset(c, 0, sizeof(struct tcp_connection)); /* zero init (skip rd buf)*/
973         c->s=sock;
974         c->fd=-1; /* not initialized */
975         if (lock_init(&c->write_lock)==0){
976                 LM_ERR("init lock failed\n");
977                 goto error;
978         }
979         
980         c->rcv.src_su=*su;
981         
982         atomic_set(&c->refcnt, 0);
983         local_timer_init(&c->timer, tcpconn_main_timeout, c, 0);
984         su2ip_addr(&c->rcv.src_ip, su);
985         c->rcv.src_port=su_getport(su);
986         c->rcv.bind_address=ba;
987         if (likely(local_addr)){
988                 su2ip_addr(&c->rcv.dst_ip, local_addr);
989                 c->rcv.dst_port=su_getport(local_addr);
990         }else if (ba){
991                 c->rcv.dst_ip=ba->address;
992                 c->rcv.dst_port=ba->port_no;
993         }
994         print_ip("tcpconn_new: new tcp connection: ", &c->rcv.src_ip, "\n");
995         LM_DBG("on port %d, type %d\n", c->rcv.src_port, type);
996         init_tcp_req(&c->req, (char*)c+sizeof(struct tcp_connection), rd_b_size);
997         c->id=(*connection_id)++;
998         c->rcv.proto_reserved1=0; /* this will be filled before receive_message*/
999         c->rcv.proto_reserved2=0;
1000         c->state=state;
1001         c->extra_data=0;
1002 #ifdef USE_TLS
1003         if (type==PROTO_TLS){
1004                 if (tls_tcpconn_init(c, sock)==-1) goto error;
1005         }else
1006 #endif /* USE_TLS*/
1007         {
1008                 c->type=PROTO_TCP;
1009                 c->rcv.proto=PROTO_TCP;
1010                 c->timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, con_lifetime);
1011                 c->lifetime = cfg_get(tcp, tcp_cfg, con_lifetime);
1012         }
1013         
1014         return c;
1015         
1016 error:
1017         if (c) shm_free(c);
1018         return 0;
1019 }
1020
1021
1022
1023 /* do the actual connect, set sock. options a.s.o
1024  * returns socket on success, -1 on error
1025  * sets also *res_local_addr, res_si and state (S_CONN_CONNECT for an
1026  * unfinished connect and S_CONN_OK for a finished one)*/
1027 inline static int tcp_do_connect(       union sockaddr_union* server,
1028                                                                         union sockaddr_union* from,
1029                                                                         int type,
1030                                                                         snd_flags_t* send_flags,
1031                                                                         union sockaddr_union* res_local_addr,
1032                                                                         struct socket_info** res_si,
1033                                                                         enum tcp_conn_states *state
1034                                                                         )
1035 {
1036         int s;
1037         union sockaddr_union my_name;
1038         socklen_t my_name_len;
1039         struct ip_addr ip;
1040 #ifdef TCP_ASYNC
1041         int n;
1042 #endif /* TCP_ASYNC */
1043
1044         s=socket(AF2PF(server->s.sa_family), SOCK_STREAM, 0);
1045         if (unlikely(s==-1)){
1046                 LM_ERR("%s: socket: (%d) %s\n",
1047                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1048                 goto error;
1049         }
1050         if (init_sock_opt(s, server->s.sa_family)<0){
1051                 LM_ERR("%s: init_sock_opt failed\n",
1052                                         su2a(server, sizeof(*server)));
1053                 goto error;
1054         }
1055         
1056         if (unlikely(from && bind(s, &from->s, sockaddru_len(*from)) != 0)){
1057                 LM_WARN("binding to source address %s failed: %s [%d]\n",
1058                                         su2a(from, sizeof(*from)),
1059                                         strerror(errno), errno);
1060         }
1061         *state=S_CONN_OK;
1062 #ifdef TCP_ASYNC
1063         if (likely(cfg_get(tcp, tcp_cfg, async))){
1064 again:
1065                 n=connect(s, &server->s, sockaddru_len(*server));
1066                 if (likely(n==-1)){ /*non-blocking => most probable EINPROGRESS*/
1067                         if (likely(errno==EINPROGRESS))
1068                                 *state=S_CONN_CONNECT;
1069                         else if (errno==EINTR) goto again;
1070                         else if (errno!=EALREADY){
1071                                 switch(errno){
1072                                         case ENETUNREACH:
1073                                         case EHOSTUNREACH:
1074 #ifdef USE_DST_BLACKLIST
1075                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1076                                                                                         send_flags, 0);
1077 #endif /* USE_DST_BLACKLIST */
1078                                                 TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0, server, type);
1079                                                 break;
1080                                         case ETIMEDOUT:
1081 #ifdef USE_DST_BLACKLIST
1082                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1083                                                                                         send_flags, 0);
1084 #endif /* USE_DST_BLACKLIST */
1085                                                 TCP_EV_CONNECT_TIMEOUT(errno, 0, 0, server, type);
1086                                                 break;
1087                                         case ECONNREFUSED:
1088                                         case ECONNRESET:
1089 #ifdef USE_DST_BLACKLIST
1090                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1091                                                                                         send_flags, 0);
1092 #endif /* USE_DST_BLACKLIST */
1093                                                 TCP_EV_CONNECT_RST(errno, 0, 0, server, type);
1094                                                 break;
1095                                         case EAGAIN:/* not posix, but supported on linux and bsd */
1096                                                 TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0, server,type);
1097                                                 break;
1098                                         default:
1099                                                 TCP_EV_CONNECT_ERR(errno, 0, 0, server, type);
1100                                 }
1101                                 TCP_STATS_CONNECT_FAILED();
1102                                 LM_ERR("connect %s: (%d) %s\n",
1103                                                         su2a(server, sizeof(*server)),
1104                                                         errno, strerror(errno));
1105                                 goto error;
1106                         }
1107                 }
1108         }else{
1109 #endif /* TCP_ASYNC */
1110                 if (tcp_blocking_connect(s, type,  send_flags, &server->s,
1111                                                                         sockaddru_len(*server))<0){
1112                         LM_ERR("tcp_blocking_connect %s failed\n",
1113                                                 su2a(server, sizeof(*server)));
1114                         goto error;
1115                 }
1116 #ifdef TCP_ASYNC
1117         }
1118 #endif /* TCP_ASYNC */
1119         if (from){
1120                 su2ip_addr(&ip, from);
1121                 if (!ip_addr_any(&ip))
1122                         /* we already know the source ip, skip the sys. call */
1123                         goto find_socket;
1124         }
1125         my_name_len=sizeof(my_name);
1126         if (unlikely(getsockname(s, &my_name.s, &my_name_len)!=0)){
1127                 LM_ERR("getsockname failed: %s(%d)\n", strerror(errno), errno);
1128                 *res_si=0;
1129                 goto error;
1130         }
1131         from=&my_name; /* update from with the real "from" address */
1132         su2ip_addr(&ip, &my_name);
1133 find_socket:
1134 #ifdef USE_TLS
1135         if (unlikely(type==PROTO_TLS))
1136                 *res_si=find_si(&ip, 0, PROTO_TLS);
1137         else
1138 #endif
1139                 *res_si=find_si(&ip, 0, PROTO_TCP);
1140         
1141         if (unlikely(*res_si==0)){
1142                 LM_WARN("%s: could not find corresponding"
1143                                 " listening socket for %s, using default...\n",
1144                                         su2a(server, sizeof(*server)), ip_addr2a(&ip));
1145                 if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1146                 else *res_si=sendipv6_tcp;
1147         }
1148         *res_local_addr=*from;
1149         return s;
1150 error:
1151         if (s!=-1) tcp_safe_close(s);
1152         return -1;
1153 }
1154
1155
1156
1157 struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
1158                                                                                 union sockaddr_union* from,
1159                                                                                 int type, snd_flags_t* send_flags)
1160 {
1161         int s;
1162         struct socket_info* si;
1163         union sockaddr_union my_name;
1164         struct tcp_connection* con;
1165         enum tcp_conn_states state;
1166
1167         s=-1;
1168         
1169         if (*tcp_connections_no >= cfg_get(tcp, tcp_cfg, max_connections)){
1170                 LM_ERR("maximum number of connections exceeded (%d/%d)\n",
1171                                         *tcp_connections_no,
1172                                         cfg_get(tcp, tcp_cfg, max_connections));
1173                 goto error;
1174         }
1175         if (unlikely(type==PROTO_TLS)) {
1176                 if (*tls_connections_no >= cfg_get(tcp, tcp_cfg, max_tls_connections)){
1177                         LM_ERR("maximum number of tls connections"
1178                                                 " exceeded (%d/%d)\n",
1179                                                 *tls_connections_no,
1180                                                 cfg_get(tcp, tcp_cfg, max_tls_connections));
1181                         goto error;
1182                 }
1183         }
1184
1185         s=tcp_do_connect(server, from, type,  send_flags, &my_name, &si, &state);
1186         if (s==-1){
1187                 LM_ERR("tcp_do_connect %s: failed (%d) %s\n",
1188                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1189                 goto error;
1190         }
1191         con=tcpconn_new(s, server, &my_name, si, type, state);
1192         if (con==0){
1193                 LM_ERR("%s: tcpconn_new failed, closing the "
1194                                  " socket\n", su2a(server, sizeof(*server)));
1195                 goto error;
1196         }
1197         tcpconn_set_send_flags(con, *send_flags);
1198         return con;
1199 error:
1200         if (s!=-1) tcp_safe_close(s); /* close the opened socket */
1201         return 0;
1202 }
1203
1204
1205
1206 #ifdef TCP_CONNECT_WAIT
1207 int tcpconn_finish_connect( struct tcp_connection* c,
1208                                                                                                 union sockaddr_union* from)
1209 {
1210         int s;
1211         int r;
1212         union sockaddr_union local_addr;
1213         struct socket_info* si;
1214         enum tcp_conn_states state;
1215         struct tcp_conn_alias* a;
1216         int new_conn_alias_flags;
1217         
1218         s=tcp_do_connect(&c->rcv.src_su, from, c->type, &c->send_flags,
1219                                                 &local_addr, &si, &state);
1220         if (unlikely(s==-1)){
1221                 LM_ERR("%s: tcp_do_connect for %p failed\n",
1222                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
1223                 return -1;
1224         }
1225         c->rcv.bind_address=si;
1226         su2ip_addr(&c->rcv.dst_ip, &local_addr);
1227         c->rcv.dst_port=su_getport(&local_addr);
1228         /* update aliases if needed */
1229         if (likely(from==0)){
1230                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1231                 /* add aliases */
1232                 TCPCONN_LOCK;
1233                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1234                                                                                                         new_conn_alias_flags);
1235                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1236                                                                         c->rcv.dst_port, new_conn_alias_flags);
1237                 TCPCONN_UNLOCK;
1238         }else if (su_cmp(from, &local_addr)!=1){
1239                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1240                 TCPCONN_LOCK;
1241                         /* remove all the aliases except the first one and re-add them
1242                          * (there shouldn't be more then the 3 default aliases at this 
1243                          * stage) */
1244                         if (c->aliases > 1) {
1245                                 for (r=1; r<c->aliases; r++){
1246                                         a=&c->con_aliases[r];
1247                                         tcpconn_listrm(tcpconn_aliases_hash[a->hash],
1248                                                                         a, next, prev);
1249                                 }
1250                                 c->aliases=1;
1251                         }
1252                         /* add the local_ip:0 and local_ip:local_port aliases */
1253                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1254                                                                                                 0, new_conn_alias_flags);
1255                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1256                                                                         c->rcv.dst_port, new_conn_alias_flags);
1257                 TCPCONN_UNLOCK;
1258         }
1259         
1260         return s;
1261 }
1262 #endif /* TCP_CONNECT_WAIT */
1263
1264
1265
1266 /* adds a tcp connection to the tcpconn hashes
1267  * Note: it's called _only_ from the tcp_main process */
1268 inline static struct tcp_connection*  tcpconn_add(struct tcp_connection *c)
1269 {
1270         struct ip_addr zero_ip;
1271         int new_conn_alias_flags;
1272
1273         if (likely(c)){
1274                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1275                 c->id_hash=tcp_id_hash(c->id);
1276                 c->aliases=0;
1277                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1278                 TCPCONN_LOCK;
1279                 c->flags|=F_CONN_HASHED;
1280                 /* add it at the begining of the list*/
1281                 tcpconn_listadd(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1282                 /* set the aliases */
1283                 /* first alias is for (peer_ip, peer_port, 0 ,0) -- for finding
1284                  *  any connection to peer_ip, peer_port
1285                  * the second alias is for (peer_ip, peer_port, local_addr, 0) -- for
1286                  *  finding any conenction to peer_ip, peer_port from local_addr 
1287                  * the third alias is for (peer_ip, peer_port, local_addr, local_port) 
1288                  *   -- for finding if a fully specified connection exists */
1289                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &zero_ip, 0,
1290                                                                                                         new_conn_alias_flags);
1291                 if (likely(c->rcv.dst_ip.af && ! ip_addr_any(&c->rcv.dst_ip))){
1292                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1293                                                                                                         new_conn_alias_flags);
1294                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1295                                                                         c->rcv.dst_port, new_conn_alias_flags);
1296                 }
1297                 /* ignore add_alias errors, there are some valid cases when one
1298                  *  of the add_alias would fail (e.g. first add_alias for 2 connections
1299                  *   with the same destination but different src. ip*/
1300                 TCPCONN_UNLOCK;
1301                 LM_DBG("hashes: %d:%d:%d, %d\n",
1302                                                                                                 c->con_aliases[0].hash,
1303                                                                                                 c->con_aliases[1].hash,
1304                                                                                                 c->con_aliases[2].hash,
1305                                                                                                 c->id_hash);
1306                 return c;
1307         }else{
1308                 LM_CRIT("null connection pointer\n");
1309                 return 0;
1310         }
1311 }
1312
1313
1314 static inline void _tcpconn_detach(struct tcp_connection *c)
1315 {
1316         int r;
1317         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1318         /* remove all the aliases */
1319         for (r=0; r<c->aliases; r++)
1320                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1321                                                 &c->con_aliases[r], next, prev);
1322         c->aliases = 0;
1323 }
1324
1325
1326
1327 static inline void _tcpconn_free(struct tcp_connection* c)
1328 {
1329 #ifdef TCP_ASYNC
1330         if (unlikely(_wbufq_non_empty(c)))
1331                 _wbufq_destroy(&c->wbuf_q);
1332 #endif
1333         lock_destroy(&c->write_lock);
1334 #ifdef USE_TLS
1335         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) tls_tcpconn_clean(c);
1336 #endif
1337         shm_free(c);
1338 }
1339
1340
1341
/* Unsafe tcpconn_rm version (no locks): removes connection c from the
 * hashes and then frees it. */
void _tcpconn_rm(struct tcp_connection* c)
{
	_tcpconn_detach(c); /* unlink from the id and aliases hashes */
	_tcpconn_free(c);   /* release its resources and the connection */
}
1348
1349
1350
1351 void tcpconn_rm(struct tcp_connection* c)
1352 {
1353         int r;
1354         TCPCONN_LOCK;
1355         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1356         /* remove all the aliases */
1357         for (r=0; r<c->aliases; r++)
1358                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1359                                                 &c->con_aliases[r], next, prev);
1360         c->aliases = 0;
1361         TCPCONN_UNLOCK;
1362         lock_destroy(&c->write_lock);
1363 #ifdef USE_TLS
1364         if ((c->type==PROTO_TLS || c->type==PROTO_WSS)&&(c->extra_data)) tls_tcpconn_clean(c);
1365 #endif
1366         shm_free(c);
1367 }
1368
1369
1370 /* finds a connection, if id=0 uses the ip addr, port, local_ip and local port
1371  *  (host byte order) and tries to find the connection that matches all of
1372  *   them. Wild cards can be used for local_ip and local_port (a 0 filled
1373  *   ip address and/or a 0 local port).
1374  * WARNING: unprotected (locks) use tcpconn_get unless you really
1375  * know what you are doing */
1376 struct tcp_connection* _tcpconn_find(int id, struct ip_addr* ip, int port,
1377                                                                                 struct ip_addr* l_ip, int l_port)
1378 {
1379
1380         struct tcp_connection *c;
1381         struct tcp_conn_alias* a;
1382         unsigned hash;
1383         int is_local_ip_any;
1384         
1385 #ifdef EXTRA_DEBUG
1386         LM_DBG("%d  port %d\n",id, port);
1387         if (ip) print_ip("tcpconn_find: ip ", ip, "\n");
1388 #endif
1389         if (likely(id)){
1390                 hash=tcp_id_hash(id);
1391                 for (c=tcpconn_id_hash[hash]; c; c=c->id_next){
1392 #ifdef EXTRA_DEBUG
1393                         LM_DBG("c=%p, c->id=%d, port=%d\n", c, c->id, c->rcv.src_port);
1394                         print_ip("ip=", &c->rcv.src_ip, "\n");
1395 #endif
1396                         if ((id==c->id)&&(c->state!=S_CONN_BAD)) return c;
1397                 }
1398         }else if (likely(ip)){
1399                 hash=tcp_addr_hash(ip, port, l_ip, l_port);
1400                 is_local_ip_any=ip_addr_any(l_ip);
1401                 for (a=tcpconn_aliases_hash[hash]; a; a=a->next){
1402 #ifdef EXTRA_DEBUG
1403                         LM_DBG("a=%p, c=%p, c->id=%d, alias port= %d port=%d\n", a, a->parent,
1404                                         a->parent->id, a->port, a->parent->rcv.src_port);
1405                         print_ip("ip=",&a->parent->rcv.src_ip,"\n");
1406 #endif
1407                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1408                                         ((l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1409                                         (ip_addr_cmp(ip, &a->parent->rcv.src_ip)) &&
1410                                         (is_local_ip_any ||
1411                                                 ip_addr_cmp(l_ip, &a->parent->rcv.dst_ip))
1412                                 )
1413                                 return a->parent;
1414                 }
1415         }
1416         return 0;
1417 }
1418
1419
1420
1421 /* _tcpconn_find with locks and timeout
1422  * local_addr contains the desired local ip:port. If null any local address 
1423  * will be used.  IN*ADDR_ANY or 0 port are wild cards.
1424  * If found, the connection's reference counter will be incremented, you might
1425  * want to decrement it after use.
1426  */
1427 struct tcp_connection* tcpconn_get(int id, struct ip_addr* ip, int port,
1428                                                                         union sockaddr_union* local_addr,
1429                                                                         ticks_t timeout)
1430 {
1431         struct tcp_connection* c;
1432         struct ip_addr local_ip;
1433         int local_port;
1434         
1435         local_port=0;
1436         if (likely(ip)){
1437                 if (unlikely(local_addr)){
1438                         su2ip_addr(&local_ip, local_addr);
1439                         local_port=su_getport(local_addr);
1440                 }else{
1441                         ip_addr_mk_any(ip->af, &local_ip);
1442                         local_port=0;
1443                 }
1444         }
1445         TCPCONN_LOCK;
1446         c=_tcpconn_find(id, ip, port, &local_ip, local_port);
1447         if (likely(c)){ 
1448                         atomic_inc(&c->refcnt);
1449                         /* update the timeout only if the connection is not handled
1450                          * by a tcp reader _and_the timeout is non-zero  (the tcp
1451                          * reader process uses c->timeout for its own internal
1452                          * timeout and c->timeout will be overwritten * anyway on
1453                          * return to tcp_main) */
1454                         if (likely(c->reader_pid==0 && timeout != 0))
1455                                 c->timeout=get_ticks_raw()+timeout;
1456         }
1457         TCPCONN_UNLOCK;
1458         return c;
1459 }
1460
1461
1462
1463 /* add c->dst:port, local_addr as an alias for the "id" connection, 
1464  * flags: TCP_ALIAS_FORCE_ADD  - add an alias even if a previous one exists
1465  *        TCP_ALIAS_REPLACE    - if a prev. alias exists, replace it with the
1466  *                                new one
1467  * returns 0 on success, <0 on failure ( -1  - null c, -2 too many aliases,
1468  *  -3 alias already present and pointing to another connection)
1469  * WARNING: must be called with TCPCONN_LOCK held */
1470 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
1471                                                                                 struct ip_addr* l_ip, int l_port,
1472                                                                                 int flags)
1473 {
1474         unsigned hash;
1475         struct tcp_conn_alias* a;
1476         struct tcp_conn_alias* nxt;
1477         struct tcp_connection* p;
1478         int is_local_ip_any;
1479         int i;
1480         int r;
1481         
1482         a=0;
1483         is_local_ip_any=ip_addr_any(l_ip);
1484         if (likely(c)){
1485                 hash=tcp_addr_hash(&c->rcv.src_ip, port, l_ip, l_port);
1486                 /* search the aliases for an already existing one */
1487                 for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
1488                         nxt=a->next;
1489                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1490                                         ( (l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1491                                         (ip_addr_cmp(&c->rcv.src_ip, &a->parent->rcv.src_ip)) &&
1492                                         ( is_local_ip_any || 
1493                                           ip_addr_cmp(&a->parent->rcv.dst_ip, l_ip))
1494                                         ){
1495                                 /* found */
1496                                 if (unlikely(a->parent!=c)){
1497                                         if (flags & TCP_ALIAS_FORCE_ADD)
1498                                                 /* still have to walk the whole list to check if
1499                                                  * the alias was not already added */
1500                                                 continue;
1501                                         else if (flags & TCP_ALIAS_REPLACE){
1502                                                 /* remove the alias =>
1503                                                  * remove the current alias and all the following
1504                                                  *  ones from the corresponding connection, shift the 
1505                                                  *  connection aliases array and re-add the other 
1506                                                  *  aliases (!= current one) */
1507                                                 p=a->parent;
1508                                                 for (i=0; (i<p->aliases) && (&(p->con_aliases[i])!=a);
1509                                                                 i++);
1510                                                 if (unlikely(i==p->aliases)){
1511                                                         LM_CRIT("alias %p not found in con %p (id %d)\n",
1512                                                                         a, p, p->id);
1513                                                         goto error_not_found;
1514                                                 }
1515                                                 for (r=i; r<p->aliases; r++){
1516                                                         tcpconn_listrm(
1517                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash],
1518                                                                 &p->con_aliases[r], next, prev);
1519                                                 }
1520                                                 if (likely((i+1)<p->aliases)){
1521                                                         memmove(&p->con_aliases[i], &p->con_aliases[i+1],
1522                                                                                         (p->aliases-i-1)*
1523                                                                                                 sizeof(p->con_aliases[0]));
1524                                                 }
1525                                                 p->aliases--;
1526                                                 /* re-add the remaining aliases */
1527                                                 for (r=i; r<p->aliases; r++){
1528                                                         tcpconn_listadd(
1529                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash], 
1530                                                                 &p->con_aliases[r], next, prev);
1531                                                 }
1532                                         }else
1533                                                 goto error_sec;
1534                                 }else goto ok;
1535                         }
1536                 }
1537                 if (unlikely(c->aliases>=TCP_CON_MAX_ALIASES)) goto error_aliases;
1538                 c->con_aliases[c->aliases].parent=c;
1539                 c->con_aliases[c->aliases].port=port;
1540                 c->con_aliases[c->aliases].hash=hash;
1541                 tcpconn_listadd(tcpconn_aliases_hash[hash], 
1542                                                                 &c->con_aliases[c->aliases], next, prev);
1543                 c->aliases++;
1544         }else goto error_not_found;
1545 ok:
1546 #ifdef EXTRA_DEBUG
1547         if (a) LM_DBG("alias already present\n");
1548         else   LM_DBG("alias port %d for hash %d, id %d\n",
1549                         port, hash, c->id);
1550 #endif
1551         return 0;
1552 error_aliases:
1553         /* too many aliases */
1554         return -2;
1555 error_not_found:
1556         /* null connection */
1557         return -1;
1558 error_sec:
1559         /* alias already present and pointing to a different connection
1560          * (hijack attempt?) */
1561         return -3;
1562 }
1563
1564
1565
1566 /* add port as an alias for the "id" connection, 
1567  * returns 0 on success,-1 on failure */
1568 int tcpconn_add_alias(int id, int port, int proto)
1569 {
1570         struct tcp_connection* c;
1571         int ret;
1572         struct ip_addr zero_ip;
1573         int r;
1574         int alias_flags;
1575         
1576         /* fix the port */
1577         port=port?port:((proto==PROTO_TLS)?SIPS_PORT:SIP_PORT);
1578         TCPCONN_LOCK;
1579         /* check if alias already exists */
1580         c=_tcpconn_find(id, 0, 0, 0, 0);
1581         if (likely(c)){
1582                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1583                 alias_flags=cfg_get(tcp, tcp_cfg, alias_flags);
1584                 /* alias src_ip:port, 0, 0 */
1585                 ret=_tcpconn_add_alias_unsafe(c, port,  &zero_ip, 0, 
1586                                                                                 alias_flags);
1587                 if (ret<0 && ret!=-3) goto error;
1588                 /* alias src_ip:port, local_ip, 0 */
1589                 ret=_tcpconn_add_alias_unsafe(c, port,  &c->rcv.dst_ip, 0, 
1590                                                                                 alias_flags);
1591                 if (ret<0 && ret!=-3) goto error;
1592                 /* alias src_ip:port, local_ip, local_port */
1593                 ret=_tcpconn_add_alias_unsafe(c, port, &c->rcv.dst_ip, c->rcv.dst_port,
1594                                                                                 alias_flags);
1595                 if (unlikely(ret<0)) goto error;
1596         }else goto error_not_found;
1597         TCPCONN_UNLOCK;
1598         return 0;
1599 error_not_found:
1600         TCPCONN_UNLOCK;
1601         LM_ERR("no connection found for id %d\n",id);
1602         return -1;
1603 error:
1604         TCPCONN_UNLOCK;
1605         switch(ret){
1606                 case -2:
1607                         LM_ERR("too many aliases (%d) for connection %p (id %d) %s:%d <- %d\n",
1608                                         c->aliases, c, c->id, ip_addr2a(&c->rcv.src_ip),
1609                                         c->rcv.src_port, port);
1610                         for (r=0; r<c->aliases; r++){
1611                                 LM_ERR("alias %d: for %p (%d) %s:%d <-%d hash %x\n",  r, c, c->id, 
1612                                                 ip_addr2a(&c->rcv.src_ip), c->rcv.src_port, 
1613                                                 c->con_aliases[r].port, c->con_aliases[r].hash);
1614                         }
1615                         break;
1616                 case -3:
1617                         LM_ERR("possible port hijack attempt\n");
1618                         LM_ERR("alias for %d port %d already"
1619                                                 " present and points to another connection \n",
1620                                                 c->id, port);
1621                         break;
1622                 default:
1623                         LM_ERR("unknown error %d\n", ret);
1624         }
1625         return -1;
1626 }
1627
1628
1629
1630 #ifdef TCP_FD_CACHE
1631
1632 static void tcp_fd_cache_init(void)
1633 {
1634         int r;
1635         for (r=0; r<TCP_FD_CACHE_SIZE; r++)
1636                 fd_cache[r].fd=-1;
1637 }
1638
1639
1640 inline static struct fd_cache_entry* tcp_fd_cache_get(struct tcp_connection *c)
1641 {
1642         int h;
1643         
1644         h=c->id%TCP_FD_CACHE_SIZE;
1645         if ((fd_cache[h].fd>0) && (fd_cache[h].id==c->id) && (fd_cache[h].con==c))
1646                 return &fd_cache[h];
1647         return 0;
1648 }
1649
1650
1651 inline static void tcp_fd_cache_rm(struct fd_cache_entry* e)
1652 {
1653         e->fd=-1;
1654 }
1655
1656
1657 inline static void tcp_fd_cache_add(struct tcp_connection *c, int fd)
1658 {
1659         int h;
1660         
1661         h=c->id%TCP_FD_CACHE_SIZE;
1662         if (likely(fd_cache[h].fd>0))
1663                 tcp_safe_close(fd_cache[h].fd);
1664         fd_cache[h].fd=fd;
1665         fd_cache[h].id=c->id;
1666         fd_cache[h].con=c;
1667 }
1668
1669 #endif /* TCP_FD_CACHE */
1670
1671
1672
1673 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn);
1674
1675 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
1676                                                         unsigned len, snd_flags_t send_flags);
1677 static int tcpconn_do_send(int fd, struct tcp_connection* c,
1678                                                         const char* buf, unsigned len,
1679                                                         snd_flags_t send_flags, long* resp, int locked);
1680
1681 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
1682                                                         const char* buf, unsigned len,
1683                                                         snd_flags_t send_flags, long* resp, int locked);
1684
1685 /* finds a tcpconn & sends on it
1686  * uses the dst members to, proto (TCP|TLS) and id and tries to send
1687  *  from the "from" address (if non null and id==0)
1688  * returns: number of bytes written (>=0) on success
1689  *          <0 on error */
1690 int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1691                                         const char* buf, unsigned len)
1692 {
1693         struct tcp_connection *c;
1694         struct ip_addr ip;
1695         int port;
1696         int fd;
1697         long response[2];
1698         int n;
1699         ticks_t con_lifetime;
1700 #ifdef USE_TLS
1701         const char* rest_buf;
1702         const char* t_buf;
1703         unsigned rest_len, t_len;
1704         long resp;
1705         snd_flags_t t_send_flags;
1706 #endif /* USE_TLS */
1707         
1708         port=su_getport(&dst->to);
1709         con_lifetime=cfg_get(tcp, tcp_cfg, con_lifetime);
1710         if (likely(port)){
1711                 su2ip_addr(&ip, &dst->to);
1712                 c=tcpconn_get(dst->id, &ip, port, from, con_lifetime); 
1713         }else if (likely(dst->id)){
1714                 c=tcpconn_get(dst->id, 0, 0, 0, con_lifetime);
1715         }else{
1716                 LM_CRIT("null id & to\n");
1717                 return -1;
1718         }
1719         
1720         if (likely(dst->id)){
1721                 if (unlikely(c==0)) {
1722                         if (likely(port)){
1723                                 /* try again w/o id */
1724                                 c=tcpconn_get(0, &ip, port, from, con_lifetime);
1725                         }else{
1726                                 LM_ERR("id %d not found, dropping\n", dst->id);
1727                                 return -1;
1728                         }
1729                 }
1730         }
1731         /* connection not found or unusable => open a new one and send on it */
1732         if (unlikely((c==0) || tcpconn_close_after_send(c))){
1733                 if (unlikely(c)){
1734                         /* can't use c if it's marked as close-after-send  =>
1735                            release it and try opening new one */
1736                         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
1737                         c=0;
1738                 }
1739                 /* check if connect() is disabled */
1740                 if (unlikely((dst->send_flags.f & SND_F_FORCE_CON_REUSE) ||
1741                                                 cfg_get(tcp, tcp_cfg, no_connect)))
1742                         return -1;
1743                 LM_DBG("no open tcp connection found, opening new one\n");
1744                 /* create tcp connection */
1745                 if (likely(from==0)){
1746                         /* check to see if we have to use a specific source addr. */
1747                         switch (dst->to.s.sa_family) {
1748                                 case AF_INET:
1749                                                 from = tcp_source_ipv4;
1750                                         break;
1751                                 case AF_INET6:
1752                                                 from = tcp_source_ipv6;
1753                                         break;
1754                                 default:
1755                                         /* error, bad af, ignore ... */
1756                                         break;
1757                         }
1758                 }
1759 #if defined(TCP_CONNECT_WAIT) && defined(TCP_ASYNC)
1760                 if (likely(cfg_get(tcp, tcp_cfg, tcp_connect_wait) && 
1761                                         cfg_get(tcp, tcp_cfg, async) )){
1762                         if (unlikely(*tcp_connections_no >=
1763                                                         cfg_get(tcp, tcp_cfg, max_connections))){
1764                                 LM_ERR("%s: maximum number of connections exceeded (%d/%d)\n",
1765                                                         su2a(&dst->to, sizeof(dst->to)),
1766                                                         *tcp_connections_no,
1767                                                         cfg_get(tcp, tcp_cfg, max_connections));
1768                                 return -1;
1769                         }
1770                         if (unlikely(dst->proto==PROTO_TLS)) {
1771                                 if (unlikely(*tls_connections_no >=
1772                                                         cfg_get(tcp, tcp_cfg, max_tls_connections))){
1773                                         LM_ERR("%s: maximum number of tls connections exceeded (%d/%d)\n",
1774                                                         su2a(&dst->to, sizeof(dst->to)),
1775                                                         *tls_connections_no,
1776                                                         cfg_get(tcp, tcp_cfg, max_tls_connections));
1777                                         return -1;
1778                                 }
1779                         }
1780                         c=tcpconn_new(-1, &dst->to, from, 0, dst->proto,
1781                                                         S_CONN_CONNECT);
1782                         if (unlikely(c==0)){
1783                                 LM_ERR("%s: could not create new connection\n",
1784                                                 su2a(&dst->to, sizeof(dst->to)));
1785                                 return -1;
1786                         }
1787                         c->flags|=F_CONN_PENDING|F_CONN_FD_CLOSED;
1788                         tcpconn_set_send_flags(c, dst->send_flags);
1789                         atomic_set(&c->refcnt, 2); /* ref from here and from main hash
1790                                                                                  table */
1791                         /* add it to id hash and aliases */
1792                         if (unlikely(tcpconn_add(c)==0)){
1793                                 LM_ERR("%s: could not add connection %p\n",
1794                                                 su2a(&dst->to, sizeof(dst->to)), c);
1795                                 _tcpconn_free(c);
1796                                 n=-1;
1797                                 goto end_no_conn;
1798                         }
1799                         /* do connect and if src ip or port changed, update the 
1800                          * aliases */
1801                         if (unlikely((fd=tcpconn_finish_connect(c, from))<0)){
1802                                 /* tcpconn_finish_connect will automatically blacklist
1803                                    on error => no need to do it here */
1804                                 LM_ERR("%s: tcpconn_finish_connect(%p) failed\n",
1805                                                 su2a(&dst->to, sizeof(dst->to)), c);
1806                                 goto conn_wait_error;
1807                         }
1808                         /* ? TODO: it might be faster just to queue the write directly
1809                          *  and send to main CONN_NEW_PENDING_WRITE */
1810                         /* delay sending the fd to main after the send */
1811                         
1812                         /* NOTE: no lock here, because the connection is marked as
1813                          * pending and nobody else will try to write on it. However
1814                          * this might produce out-of-order writes. If this is not
1815                          * desired either lock before the write or use 
1816                          * _wbufq_insert(...)
1817                          * NOTE2: _wbufq_insert() is used now (no out-of-order).
1818                          */
1819 #ifdef USE_TLS
1820                         if (unlikely(c->type==PROTO_TLS)) {
1821                         /* for TLS the TLS processing and the send must happen
1822                            atomically w/ respect to other sends on the same connection
1823                            (otherwise reordering might occur which would break TLS) =>
1824                            lock. However in this case this send will always be the first.
1825                            We can have the send() outside the lock only if this is the
1826                            first and only send (tls_encode is not called again), or
1827                            this is the last send for a tls_encode() loop and all the
1828                            previous ones did return CONN_NEW_COMPLETE or CONN_EOF.
1829                         */
1830                                 response[1] = CONN_NOP;
1831                                 t_buf = buf;
1832                                 t_len = len;
1833                                 lock_get(&c->write_lock);
1834 redo_tls_encode:
1835                                         t_send_flags = dst->send_flags;
1836                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
1837                                                                         &t_send_flags);
1838                                         /* There are 4 cases:
1839                                            1. entire buffer consumed from the first try
1840                                              (rest_len == rest_buf == 0)
1841                                            2. rest_buf & first call
1842                                            3. rest_buf & not first call
1843                                                   3a. CONN_NEW_COMPLETE or CONN_EOF
1844                                                   3b. CONN_NEW_PENDING_WRITE
1845                                            4. entire buffer consumed, but not first call
1846                                                4a. CONN_NEW_COMPLETE or CONN_EOF
1847                                                    4b. CONN_NEW_PENDING_WRITE
1848                                                 We misuse response[1] == CONN_NOP to test for the
1849                                                 first call.
1850                                         */
1851                                         if (unlikely(n < 0)) {
1852                                                 lock_release(&c->write_lock);
1853                                                 goto conn_wait_error;
1854                                         }
1855                                         if (likely(rest_len == 0)) {
1856                                                 /* 1 or 4*: CONN_NEW_COMPLETE, CONN_EOF,  CONN_NOP
1857                                                     or CONN_NEW_PENDING_WRITE (*rest_len == 0) */
1858                                                 if (likely(response[1] != CONN_NEW_PENDING_WRITE)) {
1859                                                         /* 1 or 4a => it's safe to do the send outside the
1860                                                            lock (it will either send directly or
1861                                                            wbufq_insert())
1862                                                         */
1863                                                         lock_release(&c->write_lock);
1864                                                         if (likely(t_len != 0)) {
1865                                                                 n=tcpconn_1st_send(fd, c, t_buf, t_len,
1866                                                                                                         t_send_flags,
1867                                                                                                         &response[1], 0);
1868                                                         } else { /* t_len == 0 */
1869                                                                 if (response[1] == CONN_NOP) {
1870                                                                         /* nothing to send (e.g  parallel send
1871                                                                            tls_encode queues some data and then
1872                                                                            WANT_READ => this tls_encode will queue
1873                                                                            the cleartext too and will have nothing
1874                                                                            to send right now) and initial send =>
1875                                                                            behave as if the send was successful
1876                                                                            (but never return EOF here) */
1877                                                                         response[1] = CONN_NEW_COMPLETE;
1878                                                                 }
1879                                                         }
1880                                                         /* exit */
1881                                                 } else {
1882                                                         /* CONN_NEW_PENDING_WRITE:  4b: it was a
1883                                                            repeated tls_encode() (or otherwise we would
1884                                                            have here CONN_NOP) => add to the queue */
1885                                                         if (unlikely(t_len &&
1886                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
1887                                                                 response[1] = CONN_ERROR;
1888                                                                 n = -1;
1889                                                         }
1890                                                         lock_release(&c->write_lock);
1891                                                         /* exit (no send) */
1892                                                 }
1893                                         } else {  /* rest_len != 0 */
1894                                                 /* 2 or 3*: if tls_encode hasn't finished, we have to
1895                                                    call tcpconn_1st_send() under lock (otherwise if it
1896                                                    returns CONN_NEW_PENDING_WRITE, there is no way
1897                                                    to find the right place to add the new queued
1898                                                    data from the 2nd tls_encode()) */
1899                                                 if (likely((response[1] == CONN_NOP /*2*/ ||
1900                                                                         response[1] == CONN_NEW_COMPLETE /*3a*/ ||
1901                                                                         response[1] == CONN_EOF /*3a*/) && t_len))
1902                                                         n = tcpconn_1st_send(fd, c, t_buf, t_len,
1903                                                                                                         t_send_flags,
1904                                                                                                         &response[1], 1);
1905                                                 else if (unlikely(t_len &&
1906                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
1907                                                         /*3b: CONN_NEW_PENDING_WRITE*/
1908                                                         response[1] = CONN_ERROR;
1909                                                         n = -1;
1910                                                 }
1911                                                 if (likely(n >= 0)) {
1912                                                         /* if t_len == 0 => nothing was sent => previous
1913                                                            response will be kept */
1914                                                         t_buf = rest_buf;
1915                                                         t_len = rest_len;
1916                                                         goto redo_tls_encode;
1917                                                 } else {
1918                                                         lock_release(&c->write_lock);
1919                                                         /* error exit */
1920                                                 }
1921                                         }
1922                         } else
1923 #endif /* USE_TLS */
1924                                 n=tcpconn_1st_send(fd, c, buf, len, dst->send_flags,
1925                                                                         &response[1], 0);
1926                         if (unlikely(n<0)) /* this will catch CONN_ERROR too */
1927                                 goto conn_wait_error;
1928                         if (unlikely(response[1]==CONN_EOF)){
1929                                 /* if close-after-send requested, don't bother
1930                                    sending the fd back to tcp_main, try closing it
1931                                    immediately (no other tcp_send should use it,
1932                                    because it is marked as close-after-send before
1933                                    being added to the hash) */
1934                                 goto conn_wait_close;
1935                         }
1936                         /* send to tcp_main */
1937                         response[0]=(long)c;
1938                         if (unlikely(send_fd(unix_tcp_sock, response,
1939                                                                         sizeof(response), fd) <= 0)){
1940                                 LM_ERR("%s: %ld for %p failed:" " %s (%d)\n",
1941                                                         su2a(&dst->to, sizeof(dst->to)),
1942                                                         response[1], c, strerror(errno), errno);
1943                                 goto conn_wait_error;
1944                         }
1945                         goto conn_wait_success;
1946                 }
1947 #endif /* TCP_CONNECT_WAIT  && TCP_ASYNC */
1948                 if (unlikely((c=tcpconn_connect(&dst->to, from, dst->proto,
1949                                                                                 &dst->send_flags))==0)){
1950                         LM_ERR("%s: connect failed\n", su2a(&dst->to, sizeof(dst->to)));
1951                         return -1;
1952                 }
1953                 tcpconn_set_send_flags(c, dst->send_flags);
1954                 if (likely(c->state==S_CONN_OK))
1955                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
1956                 atomic_set(&c->refcnt, 2); /* ref. from here and it will also
1957                                                                           be added in the tcp_main hash */
1958                 fd=c->s;
1959                 c->flags|=F_CONN_FD_CLOSED; /* not yet opened in main */
1960                 /* ? TODO: it might be faster just to queue the write and
1961                  * send to main a CONN_NEW_PENDING_WRITE */
1962                 
1963                 /* send the new tcpconn to "tcp main" */
1964                 response[0]=(long)c;
1965                 response[1]=CONN_NEW;
1966                 n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
1967                 if (unlikely(n<=0)){
1968                         LM_ERR("%s: failed send_fd: %s (%d)\n",
1969                                         su2a(&dst->to, sizeof(dst->to)),
1970                                         strerror(errno), errno);
1971                         /* we can safely delete it, it's not referenced by anybody */
1972                         _tcpconn_free(c);
1973                         n=-1;
1974                         goto end_no_conn;
1975                 }
1976                 /* new connection => send on it directly */
1977 #ifdef USE_TLS
1978                 if (unlikely(c->type==PROTO_TLS)) {
1979                         /* for TLS the TLS processing and the send must happen
1980                            atomically w/ respect to other sends on the same connection
1981                            (otherwise reordering might occur which would break TLS) =>
1982                            lock.
1983                         */
1984                         response[1] = CONN_NOP;
1985                         t_buf = buf;
1986                         t_len = len;
1987                         lock_get(&c->write_lock);
1988                                 do {
1989                                         t_send_flags = dst->send_flags;
1990                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
1991                                                                         &t_send_flags);
1992                                         if (likely(n > 0)) {
1993                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
1994                                                                                                 &resp, 1);
1995                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
1996                                                                         resp == CONN_ERROR))
1997                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
1998                                                            unless error */
1999                                                         response[1] = resp;
2000                                         } else  if (unlikely(n < 0)) {
2001                                                 response[1] = CONN_ERROR;
2002                                                 break;
2003                                         }
2004                                         /* else do nothing for n (t_len) == 0, keep
2005                                            the last reponse */
2006                                         t_buf = rest_buf;
2007                                         t_len = rest_len;
2008                                 } while(unlikely(rest_len && n > 0));
2009                         lock_release(&c->write_lock);
2010                 } else
2011 #endif /* USE_TLS */
2012                         n = tcpconn_do_send(fd, c, buf, len, dst->send_flags,
2013                                                                         &response[1], 0);
2014                 if (unlikely(response[1] != CONN_NOP)) {
2015                         response[0]=(long)c;
2016                         if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2017                                 BUG("tcp_main command %ld sending failed (write):"
2018                                                 "%s (%d)\n", response[1], strerror(errno), errno);
2019                                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2020                                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec
2021                                    refcnt => if sending the command fails we have to
2022                                    dec. refcnt by hand */
2023                                 tcpconn_chld_put(c); /* deref. it manually */
2024                                 n=-1;
2025                         }
2026                         /* here refcnt for c is already decremented => c contents can
2027                            no longer be used and refcnt _must_ _not_ be decremented
2028                            again on exit */
2029                         if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2030                                 /* on error or eof, close fd */
2031                                 tcp_safe_close(fd);
2032                         } else if (response[1] == CONN_QUEUED_WRITE) {
2033 #ifdef TCP_FD_CACHE
2034                                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2035                                         tcp_fd_cache_add(c, fd);
2036                                 } else
2037 #endif /* TCP_FD_CACHE */
2038                                         tcp_safe_close(fd);
2039                         } else {
2040                                 BUG("unexpected tcpconn_do_send() return & response:"
2041                                                 " %d, %ld\n", n, response[1]);
2042                         }
2043                         goto end_no_deref;
2044                 }
2045 #ifdef TCP_FD_CACHE
2046                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2047                         tcp_fd_cache_add(c, fd);
2048                 }else
2049 #endif /* TCP_FD_CACHE */
2050                         tcp_safe_close(fd);
2051         /* here we can have only commands that _do_ _not_ dec refcnt.
2052            (CONN_EOF, CON_ERROR, CON_QUEUED_WRITE are all treated above) */
2053                 goto release_c;
2054         } /* if (c==0 or unusable) new connection */
2055         /* existing connection, send on it */
2056         n = tcpconn_send_put(c, buf, len, dst->send_flags);
2057         /* no deref needed (automatically done inside tcpconn_send_put() */
2058         return n;
2059 #ifdef TCP_CONNECT_WAIT
2060 conn_wait_success:
2061 #ifdef TCP_FD_CACHE
2062         if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2063                 tcp_fd_cache_add(c, fd);
2064         } else
2065 #endif /* TCP_FD_CACHE */
2066                 if (unlikely (tcp_safe_close(fd) < 0))
2067                         LM_ERR("closing temporary send fd for %p: %s: "
2068                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2069                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2070                                         fd, c->flags, strerror(errno), errno);
2071         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2072         return n;
2073 conn_wait_error:
2074         n=-1;
2075 conn_wait_close:
2076         /* connect or send failed or immediate close-after-send was requested on
2077          * newly created connection which was not yet sent to tcp_main (but was
2078          * already hashed) => don't send to main, unhash and destroy directly
2079          * (if refcnt>2 it will be destroyed when the last sender releases the
2080          * connection (tcpconn_chld_put(c))) or when tcp_main receives a
2081          * CONN_ERROR it*/
2082         c->state=S_CONN_BAD;
2083         /* we are here only if we opened a new fd (and not reused a cached or
2084            a reader one) => if the connect was successful close the fd */
2085         if (fd>=0) {
2086                 if (unlikely(tcp_safe_close(fd) < 0 ))
2087                         LM_ERR("closing temporary send fd for %p: %s: "
2088                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2089                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2090                                         fd, c->flags, strerror(errno), errno);
2091         }
2092         /* here the connection is for sure in the hash (tcp_main will not
2093            remove it because it's marked as PENDing) and the refcnt is at least
2094            2
2095          */
2096         TCPCONN_LOCK;
2097                 _tcpconn_detach(c);
2098                 c->flags&=~F_CONN_HASHED;
2099                 tcpconn_put(c);
2100         TCPCONN_UNLOCK;
2101         /* dec refcnt -> mark it for destruction */
2102         tcpconn_chld_put(c);
2103         return n;
2104 #endif /* TCP_CONNECT_WAIT */
2105 release_c:
2106         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2107 end_no_deref:
2108 end_no_conn:
2109         return n;
2110 }
2111
2112
2113
2114 /** sends on an existing tcpconn and auto-dec. con. ref counter.
2115  * As opposed to tcp_send(), this function requires an existing
2116  * tcp connection.
2117  * WARNING: the tcp_connection will be de-referenced.
      *
      * Order of operations, as implemented below:
      *  - TCP_ASYNC builds: if async mode is enabled and the connection
      *    already has queued data (or, with TCP_CONNECT_WAIT, has
      *    F_CONN_PENDING set), the data is appended to the write queue under
      *    c->write_lock and no fd is used at all.
      *  - otherwise an fd is picked: c->fd when the caller is the reader
      *    process of c, a cached fd (TCP_FD_CACHE), or one requested with a
      *    CONN_GET_FD command over unix_tcp_sock.
      *  - the data is sent with tcpconn_do_send(); for PROTO_TLS/PROTO_WSS
      *    it first goes through tls_encode(), with c->write_lock held.
      *  - if the resulting command is not CONN_NOP it is written to
      *    unix_tcp_sock and the function returns without calling
      *    tcpconn_chld_put() (unless that write itself fails).
      *  - on every other path the reference on c is released with
      *    tcpconn_chld_put() before returning.
      *
2118  * @param c - existing tcp connection pointer.
2119  * @param buf - data to be sent.
2120  * @param len - data length,
      * @param send_flags - send flags, passed on to tls_encode() and
      *                     tcpconn_do_send().
2121  * @return >=0 on success, -1 on error.
2122  */
2123 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
2124                                                                 unsigned len, snd_flags_t send_flags)
2125 {
2126         struct tcp_connection *tmp; /* conn. pointer read back by receive_fd() */
2127         int fd; /* not assigned on the queue-only (TCP_ASYNC) path below */
2128         long response[2]; /* [0] = (long)c, [1] = command for unix_tcp_sock */
2129         int n;
2130         int do_close_fd; /* if set, fd is closed before returning */
2131 #ifdef USE_TLS
2132         const char* rest_buf;
2133         const char* t_buf;
2134         unsigned rest_len, t_len;
2135         long resp;
2136         snd_flags_t t_send_flags;
2137 #endif /* USE_TLS */
2138 #ifdef TCP_FD_CACHE
2139         struct fd_cache_entry* fd_cache_e; /* non-zero only if fd came from the cache */
2140         int use_fd_cache;
2141         
2142         use_fd_cache=cfg_get(tcp, tcp_cfg, fd_cache);
2143         fd_cache_e=0;
2144 #endif /* TCP_FD_CACHE */
2145         do_close_fd=1; /* close the fd on exit */
2146         response[1] = CONN_NOP;
2147 #ifdef TCP_ASYNC
2148         /* if data is already queued, we don't need the fd */
             /* the condition is tested here without the lock and tested again
                below with c->write_lock held, before anything is queued */
2149 #ifdef TCP_CONNECT_WAIT
2150                 if (unlikely(cfg_get(tcp, tcp_cfg, async) &&
2151                                                 (_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)) ))
2152 #else /* ! TCP_CONNECT_WAIT */
2153                 if (unlikely(cfg_get(tcp, tcp_cfg, async) && (_wbufq_non_empty(c)) ))
2154 #endif /* TCP_CONNECT_WAIT */
2155                 {
2156                         lock_get(&c->write_lock);
2157 #ifdef TCP_CONNECT_WAIT
2158                                 if (likely(_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)))
2159 #else /* ! TCP_CONNECT_WAIT */
2160                                 if (likely(_wbufq_non_empty(c)))
2161 #endif /* TCP_CONNECT_WAIT */
2162                                 {
2163                                         do_close_fd=0; /* no fd is used on this path */
2164 #ifdef USE_TLS
2165                                         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
                                                     /* run the data through tls_encode() and queue
                                                        every produced chunk (t_buf, t_len); the next
                                                        pass continues with rest_buf/rest_len */
2166                                                 t_buf = buf;
2167                                                 t_len = len;
2168                                                 do {
2169                                                         t_send_flags = send_flags;
2170                                                         n = tls_encode(c, &t_buf, &t_len,
2171                                                                                         &rest_buf, &rest_len,
2172                                                                                         &t_send_flags);
2173                                                         if (unlikely((n < 0) || (t_len &&
2174                                                                          (_wbufq_add(c, t_buf, t_len) < 0)))) {
                                                                     /* encode or queue failure: drop the lock,
                                                                        mark the connection bad and let the
                                                                        error: code send CONN_ERROR */
2175                                                                 lock_release(&c->write_lock);
2176                                                                 n=-1;
2177                                                                 response[1] = CONN_ERROR;
2178                                                                 c->state=S_CONN_BAD;
2179                                                                 c->timeout=get_ticks_raw(); /* force timeout */
2180                                                                 goto error;
2181                                                         }
2182                                                         t_buf = rest_buf;
2183                                                         t_len = rest_len;
2184                                                 } while(unlikely(rest_len && n > 0));
2185                                         } else
2186 #endif /* USE_TLS */
2187                                                 if (unlikely(len && (_wbufq_add(c, buf, len)<0))){
                                                             /* same failure handling as in the TLS branch */
2188                                                         lock_release(&c->write_lock);
2189                                                         n=-1;
2190                                                         response[1] = CONN_ERROR;
2191                                                         c->state=S_CONN_BAD;
2192                                                         c->timeout=get_ticks_raw(); /* force timeout */
2193                                                         goto error;
2194                                                 }
2195                                         n=len; /* everything was queued: report len */
2196                                         lock_release(&c->write_lock);
                                             /* nothing is written to unix_tcp_sock here, only the
                                                reference is released */
2197                                         goto release_c;
2198                                 }
                             /* under the lock the condition no longer held:
                                fall through to a direct send */
2199                         lock_release(&c->write_lock);
2200                 }
2201 #endif /* TCP_ASYNC */
2202                 /* check if this is not the same reader process holding
2203                  *  c  and if so send directly on c->fd */
2204                 if (c->reader_pid==my_pid()){
2205                         LM_DBG("send from reader (%d (%d)), reusing fd\n",
2206                                         my_pid(), process_no);
2207                         fd=c->fd;
2208                         do_close_fd=0; /* don't close the fd on exit, it's in use */
2209 #ifdef TCP_FD_CACHE
2210                         use_fd_cache=0; /* don't cache: problems would arise due to the
2211                                                            close() on cache eviction (if the fd is still
2212                                                            used). If it has to be cached then dup() _must_
2213                                                            be used */
2214                 }else if (likely(use_fd_cache && 
2215                                                         ((fd_cache_e=tcp_fd_cache_get(c))!=0))){
                             /* cache hit: the fd stays owned by the cache entry, so it
                                is not closed on exit */
2216                         fd=fd_cache_e->fd;
2217                         do_close_fd=0;
2218                         LM_DBG("found fd in cache (%d, %p, %d)\n", fd, c, fd_cache_e->id);
2219 #endif /* TCP_FD_CACHE */
2220                 }else{
2221                         LM_DBG("tcp connection found (%p), acquiring fd\n", c);
2222                         /* get the fd */
                             /* request/reply over unix_tcp_sock: write {c, CONN_GET_FD},
                                then read back a connection pointer (tmp) and the fd */
2223                         response[0]=(long)c;
2224                         response[1]=CONN_GET_FD;
2225                         n=send_all(unix_tcp_sock, response, sizeof(response));
2226                         if (unlikely(n<=0)){
2227                                 LM_ERR("failed to get fd(write):%s (%d)\n", strerror(errno), errno);
2228                                 n=-1;
2229                                 goto release_c;
2230                         }
2231                         LM_DBG("c=%p, n=%d\n", c, n);
2232                         n=receive_fd(unix_tcp_sock, &tmp, sizeof(tmp), &fd, MSG_WAITALL);
2233                         if (unlikely(n<=0)){
2234                                 LM_ERR("failed to get fd(receive_fd): %s (%d)\n",
2235                                                 strerror(errno), errno);
2236                                 n=-1;
2237                                 do_close_fd=0;
2238                                 goto release_c;
2239                         }
2240                         /* handle fd closed or bad connection/error
2241                                 (it's possible that this happened in the time between
2242                                 we found the initial connection and the time when we get
2243                                 the fd)
2244                          */
2245                         if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2246                                 if (unlikely(c!=tmp && tmp!=0))
2247                                         BUG("tcp_send: get_fd: got different connection:"
2248                                                 "  %p (id= %d, refcnt=%d state=%d) != "
2249                                                 "  %p (n=%d)\n",
2250                                                   c,   c->id,   atomic_get(&c->refcnt),   c->state,
2251                                                   tmp, n
2252                                                 );
2253                                 n=-1; /* fail */
2254                                 /* don't cache fd & close it */
2255                                 do_close_fd = (fd==-1)?0:1;
2256 #ifdef TCP_FD_CACHE
2257                                 use_fd_cache = 0;
2258 #endif /* TCP_FD_CACHE */
                                     /* response[1] is still CONN_GET_FD here, but end: does
                                        not write anything to unix_tcp_sock */
2259                                 goto end;
2260                         }
2261                         LM_DBG("after receive_fd: c= %p n=%d fd=%d\n",c, n, fd);
2262                 }
2263         
2264 #ifdef USE_TLS
2265                 if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2266                         /* for TLS the TLS processing and the send must happen
2267                            atomically w/ respect to other sends on the same connection
2268                            (otherwise reordering might occur which would break TLS) =>
2269                            lock.
2270                         */
2271                         response[1] = CONN_NOP;
2272                         t_buf = buf;
2273                         t_len = len;
2274                         lock_get(&c->write_lock);
2275                                 do {
2276                                         t_send_flags = send_flags;
2277                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2278                                                                         &t_send_flags);
2279                                         if (likely(n > 0)) {
                                                     /* last argument 1 (locked): write_lock is
                                                        already held by this function */
2280                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2281                                                                                                 &resp, 1);
2282                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2283                                                                         resp == CONN_ERROR))
2284                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2285                                                            unless error */
2286                                                         response[1] = resp;
2287                                         } else if (unlikely(n < 0)) {
2288                                                 response[1] = CONN_ERROR;
2289                                                 break;
2290                                         }
2291                                         /* else do nothing for n (t_len) == 0, keep
2292                                            the last response */
2293                                         t_buf = rest_buf;
2294                                         t_len = rest_len;
2295                                 } while(unlikely(rest_len && n > 0));
2296                         lock_release(&c->write_lock);
2297                 } else
2298 #endif
                             /* plain send; last argument 0 (not locked), so
                                tcpconn_do_send() takes c->write_lock itself */
2299                         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 0);
2300         if (unlikely(response[1] != CONN_NOP)) {
                     /* error: is also entered by goto from the TCP_ASYNC queueing
                        failures above, with n==-1, response[1]==CONN_ERROR and
                        do_close_fd==0 (fd was never assigned on that path and is
                        not closed below) */
2301 error:
2302                 response[0]=(long)c;
2303                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2304                         BUG("tcp_main command %ld sending failed (write):%s (%d)\n",
2305                                         response[1], strerror(errno), errno);
2306                         /* all commands != CONN_NOP returned by tcpconn_do_send()
2307                            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2308                            => if sending the command fails we have to dec. refcnt by hand
2309                          */
2310                         tcpconn_chld_put(c); /* deref. it manually */
2311                         n=-1;
2312                 }
2313                 /* here refcnt for c is already decremented => c contents can no
2314                    longer be used and refcnt _must_ _not_ be decremented again
2315                    on exit */
2316                 if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2317                         /* on error or eof, remove from cache or close fd */
2318 #ifdef TCP_FD_CACHE
2319                         if (unlikely(fd_cache_e)){
2320                                 tcp_fd_cache_rm(fd_cache_e);
2321                                 fd_cache_e = 0;
2322                                 tcp_safe_close(fd);
2323                         }else
2324 #endif /* TCP_FD_CACHE */
2325                                 if (do_close_fd) tcp_safe_close(fd);
2326                 } else if (response[1] == CONN_QUEUED_WRITE) {
                             /* write was queued without error: an fd that did not come
                                from the cache is added to it (if caching is on),
                                otherwise it is closed when do_close_fd is set */
2327 #ifdef TCP_FD_CACHE
2328                         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2329                                 tcp_fd_cache_add(c, fd);
2330                         }else
2331 #endif /* TCP_FD_CACHE */
2332                                 if (do_close_fd) tcp_safe_close(fd);
2333                 } else {
2334                         BUG("unexpected tcpconn_do_send() return & response: %d, %ld\n",
2335                                         n, response[1]);
2336                 }
2337                 return n; /* no tcpconn_put */
2338         }
             /* end: is reached when the send left response[1]==CONN_NOP, or by goto
                from the fd request when the reply did not match c / had no fd /
                the connection was bad (n==-1, caching disabled for this call) */
2339 end:
2340 #ifdef TCP_FD_CACHE
2341         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2342                 tcp_fd_cache_add(c, fd);
2343         }else
2344 #endif /* TCP_FD_CACHE */
2345         if (do_close_fd) {
2346                 if (unlikely(tcp_safe_close(fd) < 0))
2347                         LM_ERR("closing temporary send fd for %p: %s: "
2348                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2349                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2350                                         fd, c->flags, strerror(errno), errno);
2351         }
2352         /* here we can have only commands that _do_ _not_ dec refcnt.
2353            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE are all treated above) */
2354 release_c:
2355         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2356         return n;
2357 }
2358
2359
2360
2361 /* unsafe send on a known tcp connection.
2362  * Directly send on a known tcp connection with a given fd.
2363  * It is assumed that the connection locks are already held.
2364  * Side effects: if needed it will send state update commands to
2365  *  tcp_main (e.g. CON_EOF, CON_ERROR, CON_QUEUED_WRITE).
2366  * @param fd - fd used for sending.
2367  * @param c - existing tcp connection pointer (state and flags might be
2368  *            changed).
2369  * @param buf - data to be sent.
2370  * @param len - data length.
2371  * @param send_flags
2372  * @return <0 on error, number of bytes sent on success.
2373  */
2374 int tcpconn_send_unsafe(int fd, struct tcp_connection *c,
2375                                                 const char* buf, unsigned len, snd_flags_t send_flags)
2376 {
2377         int n;
2378         long response[2];
2379         
2380         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 1);
2381         if (unlikely(response[1] != CONN_NOP)) {
2382                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2383                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2384                    => increment it (we don't want the connection to be destroyed
2385                    from under us)
2386                  */
2387                 atomic_inc(&c->refcnt);
2388                 response[0]=(long)c;
2389                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2390                         BUG("connection %p command %ld sending failed (write):%s (%d)\n",
2391                                         c, response[1], strerror(errno), errno);
2392                         /* send failed => deref. it back by hand */
2393                         tcpconn_chld_put(c); 
2394                         n=-1;
2395                 }
2396                 /* here refcnt for c is already decremented => c contents can no
2397                    longer be used and refcnt _must_ _not_ be decremented again
2398                    on exit */
2399                 return n;
2400         }
2401         return n;
2402 }
2403
2404
2405
2406 /** lower level send (connection and fd should be known).
2407  * It takes care of possible write-queueing, blacklisting a.s.o.
2408  * It expects a valid tcp connection. It doesn't touch the ref. cnts.
2409  * It will also set the connection flags from send_flags (it's better
2410  * to do it here, because it's guaranteed to be under lock).
2411  * @param fd - fd used for sending.
2412  * @param c - existing tcp connection pointer (state and flags might be
2413  *            changed).
2414  * @param buf - data to be sent.
2415  * @param len - data length.
2416  * @param send_flags
2417  * @param resp - filled with a cmd. for tcp_main:
2418  *                      CONN_NOP - nothing needs to be done (do not send
2419  *                                 anything to tcp_main).
2420  *                      CONN_ERROR - error, connection should be closed.
2421  *                      CONN_EOF - no error, but connection should be closed.
2422  *                      CONN_QUEUED_WRITE - new write queue (connection
2423  *                                 should be watched for write and the wr.
2424  *                                 queue flushed).
2425  * @param locked - if set assume the connection is already locked (call from
2426  *                  tls) and do not lock/unlock the connection.
2427  * @return >=0 on success, < 0 on error && *resp == CONN_ERROR.
2428  *
2429  */
2430 static int tcpconn_do_send(int fd, struct tcp_connection* c,
2431                                                         const char* buf, unsigned len,
2432                                                         snd_flags_t send_flags, long* resp,
2433                                                         int locked)
2434 {
2435         int  n;
2436 #ifdef TCP_ASYNC
2437         int enable_write_watch;
2438 #endif /* TCP_ASYNC */
2439
2440         LM_DBG("sending...\n");
2441         *resp = CONN_NOP;
2442         if (likely(!locked)) lock_get(&c->write_lock);
2443         /* update connection send flags with the current ones */
2444         tcpconn_set_send_flags(c, send_flags);
2445 #ifdef TCP_ASYNC
2446         if (likely(cfg_get(tcp, tcp_cfg, async))){
2447                 if (_wbufq_non_empty(c)
2448 #ifdef TCP_CONNECT_WAIT
2449                         || (c->flags&F_CONN_PENDING) 
2450 #endif /* TCP_CONNECT_WAIT */
2451                         ){
2452                         if (unlikely(_wbufq_add(c, buf, len)<0)){
2453                                 if (likely(!locked)) lock_release(&c->write_lock);
2454                                 n=-1;
2455                                 goto error;
2456                         }
2457                         if (likely(!locked)) lock_release(&c->write_lock);
2458                         n=len;
2459                         goto end;
2460                 }
2461                 n=_tcpconn_write_nb(fd, c, buf, len);
2462         }else{
2463 #endif /* TCP_ASYNC */
2464                 /* n=tcp_blocking_write(c, fd, buf, len); */
2465                 n=tsend_stream(fd, buf, len,
2466                                                 TICKS_TO_S(cfg_get(tcp, tcp_cfg, send_timeout)) *
2467                                                 1000);
2468 #ifdef TCP_ASYNC
2469         }
2470 #else /* ! TCP_ASYNC */
2471         if (likely(!locked)) lock_release(&c->write_lock);
2472 #endif /* TCP_ASYNC */
2473         
2474         LM_DBG("after real write: c= %p n=%d fd=%d\n",c, n, fd);
2475         LM_DBG("buf=\n%.*s\n", (int)len, buf);
2476         if (unlikely(n<(int)len)){
2477 #ifdef TCP_ASYNC
2478                 if (cfg_get(tcp, tcp_cfg, async) &&
2479                                 ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK)){
2480                         enable_write_watch=_wbufq_empty(c);
2481                         if (n<0) n=0;
2482                         else if (unlikely(c->state==S_CONN_CONNECT ||
2483                                                 c->state==S_CONN_ACCEPT)){
2484                                 TCP_STATS_ESTABLISHED(c->state);
2485                                 c->state=S_CONN_OK; /* something was written */
2486                         }
2487                         if (unlikely(_wbufq_add(c, buf+n, len-n)<0)){
2488                                 if (likely(!locked)) lock_release(&c->write_lock);
2489                                 n=-1;
2490                                 goto error;
2491                         }
2492                         if (likely(!locked)) lock_release(&c->write_lock);
2493                         n=len;
2494                         if (likely(enable_write_watch))
2495                                 *resp=CONN_QUEUED_WRITE;
2496                         goto end;
2497                 }else{
2498                         if (likely(!locked)) lock_release(&c->write_lock);
2499                 }
2500 #endif /* TCP_ASYNC */
2501                 if (unlikely(c->state==S_CONN_CONNECT)){
2502                         switch(errno){
2503                                 case ENETUNREACH:
2504                                 case EHOSTUNREACH: /* not posix for send() */
2505 #ifdef USE_DST_BLACKLIST
2506                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2507                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2508 #endif /* USE_DST_BLACKLIST */
2509                                         TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2510                                                                         TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2511                                         break;
2512                                 case ECONNREFUSED:
2513                                 case ECONNRESET:
2514 #ifdef USE_DST_BLACKLIST
2515                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2516                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2517 #endif /* USE_DST_BLACKLIST */
2518                                         TCP_EV_CONNECT_RST(errno, TCP_LADDR(c), TCP_LPORT(c),
2519                                                                                 TCP_PSU(c), TCP_PROTO(c));
2520                                         break;
2521                                 default:
2522                                         TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c), TCP_LPORT(c),
2523                                                                                 TCP_PSU(c), TCP_PROTO(c));
2524                                 }
2525                         TCP_STATS_CONNECT_FAILED();
2526                 }else{
2527                         switch(errno){
2528                                 case ECONNREFUSED:
2529                                 case ECONNRESET:
2530                                         TCP_STATS_CON_RESET();
2531                                         /* no break */
2532                                 case ENETUNREACH:
2533                                 /*case EHOSTUNREACH: -- not posix */
2534 #ifdef USE_DST_BLACKLIST
2535                                         dst_blacklist_su(BLST_ERR_SEND, c->rcv.proto,
2536                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2537 #endif /* USE_DST_BLACKLIST */
2538                                         break;
2539                         }
2540                 }
2541                 LM_ERR("failed to send on %p (%s:%d->%s): %s (%d)\n",
2542                                         c, ip_addr2a(&c->rcv.dst_ip), c->rcv.dst_port,
2543                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2544                                         strerror(errno), errno);
2545                 n = -1;
2546 #ifdef TCP_ASYNC
2547 error:
2548 #endif /* TCP_ASYNC */
2549                 /* error on the connection , mark it as bad and set 0 timeout */
2550                 c->state=S_CONN_BAD;
2551                 c->timeout=get_ticks_raw();
2552                 /* tell "main" it should drop this (optional it will t/o anyway?)*/
2553                 *resp=CONN_ERROR;
2554                 return n; /* error return, no tcpconn_put */
2555         }
2556         
2557 #ifdef TCP_ASYNC
2558         if (likely(!locked)) lock_release(&c->write_lock);
2559 #endif /* TCP_ASYNC */
2560         /* in non-async mode here we're either in S_CONN_OK or S_CONN_ACCEPT*/
2561         if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
2562                         TCP_STATS_ESTABLISHED(c->state);
2563                         c->state=S_CONN_OK;
2564         }
2565         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2566                 /* close after write => send EOF request to tcp_main */
2567                 c->state=S_CONN_BAD;
2568                 c->timeout=get_ticks_raw();
2569                 /* tell "main" it should drop this*/
2570                 *resp=CONN_EOF;
2571                 return n;
2572         }
2573 end:
2574         return n;
2575 }
2576
2577
2578
2579 /** low level 1st send on a new connection.
2580  * It takes care of possible write-queueing, blacklisting a.s.o.
2581  * It expects a valid just-opened tcp connection. It doesn't touch the 
2582  * ref. counters. It's used only in the async first send case.
2583  * @param fd - fd used for sending.
2584  * @param c - existing tcp connection pointer (state and flags might be
2585  *            changed). The connection must be new (no previous send on it).
2586  * @param buf - data to be sent.
2587  * @param len - data length.
2588  * @param send_flags
2589  * @param resp - filled with a fd sending cmd. for tcp_main on success. It
2590  *                      _must_ be one of the commands listed below:
2591  *                      CONN_NEW_PENDING_WRITE - new connection, first write
2592  *                                 was partially successful (or EAGAIN) and
2593  *                                 was queued (connection should be watched
2594  *                                 for write and the write queue flushed).
2595  *                                 The fd should be sent to tcp_main.
2596  *                      CONN_NEW_COMPLETE - new connection, first write
2597  *                                 completed successfully and no data is
2598  *                                 queued. The fd should be sent to tcp_main.
2599  *                      CONN_EOF - no error, but the connection should be
2600  *                                  closed (e.g. SND_F_CON_CLOSE send flag).
2601  *                      CONN_ERROR - error, _must_ return < 0.
2602  * @param locked - if set assume the connection is already locked (call from
2603  *                  tls) and do not lock/unlock the connection.
2604  * @return >=0 on success, < 0 on error (on error *resp is undefined).
2605  *
2606  */
2607 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
2608                                                         const char* buf, unsigned len,
2609                                                         snd_flags_t send_flags, long* resp,
2610                                                         int locked)
2611 {
2612         int n;
2613         
2614         n=_tcpconn_write_nb(fd, c, buf, len);
2615         if (unlikely(n<(int)len)){
2616                 /* on EAGAIN or ENOTCONN return success.
2617                    ENOTCONN appears on newer FreeBSD versions (non-blocking socket,
2618                    connect() & send immediately) */
2619                 if ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK || errno==ENOTCONN){
2620                         LM_DBG("pending write on new connection %p "
2621                                 "(%d/%d bytes written)\n", c, n, len);
2622                         if (unlikely(n<0)) n=0;
2623                         else{
2624                                 if (likely(c->state == S_CONN_CONNECT))
2625                                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2626                                 c->state=S_CONN_OK; /* partial write => connect()
2627                                                                                                 ended */
2628                         }
2629                         /* add to the write queue */
2630                         if (likely(!locked)) lock_get(&c->write_lock);
2631                                 if (unlikely(_wbufq_insert(c, buf+n, len-n)<0)){
2632                                         if (likely(!locked)) lock_release(&c->write_lock);
2633                                         n=-1;
2634                                         LM_ERR("%s: EAGAIN and write queue full or failed for %p\n",
2635                                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
2636                                         goto error;
2637                                 }
2638                         if (likely(!locked)) lock_release(&c->write_lock);
2639                         /* send to tcp_main */
2640                         *resp=CONN_NEW_PENDING_WRITE;
2641                         n=len;
2642                         goto end;
2643                 }
2644                 /* n < 0 and not EAGAIN => write error */
2645                 /* if first write failed it's most likely a
2646                    connect error */
2647                 switch(errno){
2648                         case ENETUNREACH:
2649                         case EHOSTUNREACH:  /* not posix for send() */
2650 #ifdef USE_DST_BLACKLIST
2651                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2652                                                                         &c->rcv.src_su, &c->send_flags, 0);
2653 #endif /* USE_DST_BLACKLIST */
2654                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2655                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2656                                 break;
2657                         case ECONNREFUSED:
2658                         case ECONNRESET:
2659 #ifdef USE_DST_BLACKLIST
2660                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2661                                                                         &c->rcv.src_su, &c->send_flags, 0);
2662 #endif /* USE_DST_BLACKLIST */
2663                                 TCP_EV_CONNECT_RST(errno, TCP_LADDR(c),
2664                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2665                                 break;
2666                         default:
2667                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
2668                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2669                 }
2670                 /* error: destroy it directly */
2671                 TCP_STATS_CONNECT_FAILED();
2672                 LM_ERR("%s: connect & send  for %p failed:" " %s (%d)\n",
2673                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2674                                         c, strerror(errno), errno);
2675                 goto error;
2676         }
2677         LM_INFO("quick connect for %p\n", c);
2678         if (likely(c->state == S_CONN_CONNECT))
2679                 TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2680         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2681                 /* close after write =>  EOF => close immediately */
2682                 c->state=S_CONN_BAD;
2683                 /* tell our caller that it should drop this*/
2684                 *resp=CONN_EOF;
2685         }else{
2686                 c->state=S_CONN_OK;
2687                 /* send to tcp_main */
2688                 *resp=CONN_NEW_COMPLETE;
2689         }
2690 end:
2691         return n; /* >= 0 */
2692 error:
2693         *resp=CONN_ERROR;
2694         return -1;
2695 }
2696
2697
2698
2699 int tcp_init(struct socket_info* sock_info)
2700 {
2701         union sockaddr_union* addr;
2702         int optval;
2703 #ifdef HAVE_TCP_ACCEPT_FILTER
2704         struct accept_filter_arg afa;
2705 #endif /* HAVE_TCP_ACCEPT_FILTER */
2706 #ifdef DISABLE_NAGLE
2707         int flag;
2708         struct protoent* pe;
2709
2710         if (tcp_proto_no==-1){ /* if not already set */
2711                 pe=getprotobyname("tcp");
2712                 if (pe==0){
2713                         LM_ERR("could not get TCP protocol number\n");
2714                         tcp_proto_no=-1;
2715                 }else{
2716                         tcp_proto_no=pe->p_proto;
2717                 }
2718         }
2719 #endif
2720
2721         addr=&sock_info->su;
2722         /* sock_info->proto=PROTO_TCP; */
2723         if (init_su(addr, &sock_info->address, sock_info->port_no)<0){
2724                 LM_ERR("could no init sockaddr_union\n");
2725                 goto error;
2726         }
2727         LM_DBG("added %s\n", su2a(addr, sizeof(*addr)));
2728         sock_info->socket=socket(AF2PF(addr->s.sa_family), SOCK_STREAM, 0);
2729         if (sock_info->socket==-1){
2730                 LM_ERR("tcp_init: socket: %s\n", strerror(errno));
2731                 goto error;
2732         }
2733 #ifdef DISABLE_NAGLE
2734         flag=1;
2735         if ( (tcp_proto_no!=-1) &&
2736                  (setsockopt(sock_info->socket, tcp_proto_no , TCP_NODELAY,
2737                                          &flag, sizeof(flag))<0) ){
2738                 LM_ERR("could not disable Nagle: %s\n", strerror(errno));
2739         }
2740 #endif
2741
2742
2743 #if  !defined(TCP_DONT_REUSEADDR) 
2744         /* Stevens, "Network Programming", Section 7.5, "Generic Socket
2745      * Options": "...server started,..a child continues..on existing
2746          * connection..listening server is restarted...call to bind fails
2747          * ... ALL TCP servers should specify the SO_REUSEADDRE option 
2748          * to allow the server to be restarted in this situation
2749          *
2750          * Indeed, without this option, the server can't restart.
2751          *   -jiri
2752          */
2753         optval=1;
2754         if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEADDR,
2755                                 (void*)&optval, sizeof(optval))==-1) {
2756                 LM_ERR("setsockopt %s\n", strerror(errno));
2757                 goto error;
2758         }
2759 #endif
2760
2761 #ifdef SO_REUSEPORT
2762         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
2763                 if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEPORT,
2764                                 (void*)&optval, sizeof(optval))==-1) {
2765                         LM_ERR("setsockopt %s\n", strerror(errno));
2766                 }
2767         }
2768 #endif
2769
2770         /* tos */
2771         optval = tos;
2772         if(sock_info->address.af==AF_INET){
2773                 if (setsockopt(sock_info->socket, IPPROTO_IP, IP_TOS, (void*)&optval,
2774                                         sizeof(optval)) ==-1){
2775                         LM_WARN("setsockopt tos: %s (%d)\n", strerror(errno), tos);
2776                         /* continue since this is not critical */
2777                 }
2778         } else if(sock_info->address.af==AF_INET6){
2779                 if (setsockopt(sock_info->socket, IPPROTO_IPV6, IPV6_TCLASS,
2780                                         (void*)&optval, sizeof(optval)) ==-1) {
2781                         LM_WARN("setsockopt v6 tos: %s (%d)\n", strerror(errno), tos);
2782                         /* continue since this is not critical */
2783                 }
2784         }
2785
2786 #if defined(IP_FREEBIND)
2787         /* allow bind to non local address.
2788          * useful when daemon started before network initialized */
2789         if (_sr_ip_free_bind && setsockopt(sock_info->socket, IPPROTO_IP,
2790                                 IP_FREEBIND, (void*)&optval, sizeof(optval)) ==-1) {
2791                 LM_WARN("setsockopt freebind failed: %s\n", strerror(errno));
2792                 /* continue since this is not critical */
2793         }
2794 #endif
2795
2796 #ifdef HAVE_TCP_DEFER_ACCEPT
2797         /* linux only */
2798         if ((optval=cfg_get(tcp, tcp_cfg, defer_accept))){
2799                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_DEFER_ACCEPT,
2800                                         (void*)&optval, sizeof(optval)) ==-1){
2801                         LM_WARN("setsockopt TCP_DEFER_ACCEPT %s\n", strerror(errno));
2802                 /* continue since this is not critical */
2803                 }
2804         }
2805 #endif /* HAVE_TCP_DEFFER_ACCEPT */
2806 #ifdef HAVE_TCP_SYNCNT
2807         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
2808                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_SYNCNT, &optval,
2809                                                 sizeof(optval))<0){
2810                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
2811                 }
2812         }
2813 #endif
2814 #ifdef HAVE_TCP_LINGER2
2815         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
2816                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_LINGER2, &optval,
2817                                                 sizeof(optval))<0){
2818                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
2819                 }
2820         }
2821 #endif
2822         init_sock_keepalive(sock_info->socket);
2823         if (bind(sock_info->socket, &addr->s, sockaddru_len(*addr))==-1){
2824                 LM_ERR("bind(%x, %p, %d) on %s:%d : %s\n",
2825                                 sock_info->socket,  &addr->s, 
2826                                 (unsigned)sockaddru_len(*addr),
2827                                 sock_info->address_str.s,
2828                                 sock_info->port_no,
2829                                 strerror(errno));
2830                 goto error;
2831         }
2832         if (listen(sock_info->socket, TCP_LISTEN_BACKLOG)==-1){
2833                 LM_ERR("listen(%x, %p, %d) on %s: %s\n",
2834                                 sock_info->socket, &addr->s, 
2835                                 (unsigned)sockaddru_len(*addr),
2836                                 sock_info->address_str.s,
2837                                 strerror(errno));
2838                 goto error;
2839         }
2840 #ifdef HAVE_TCP_ACCEPT_FILTER
2841         /* freebsd */
2842         if (cfg_get(tcp, tcp_cfg, defer_accept)){
2843                 memset(&afa, 0, sizeof(afa));
2844                 strcpy(afa.af_name, "dataready");
2845                 if (setsockopt(sock_info->socket, SOL_SOCKET, SO_ACCEPTFILTER,
2846                                         (void*)&afa, sizeof(afa)) ==-1){
2847                         LM_WARN("setsockopt SO_ACCEPTFILTER %s\n",
2848                                                 strerror(errno));
2849                 /* continue since this is not critical */
2850                 }
2851         }
2852 #endif /* HAVE_TCP_ACCEPT_FILTER */
2853         
2854         return 0;
2855 error:
2856         if (sock_info->socket!=-1){
2857                 tcp_safe_close(sock_info->socket);
2858                 sock_info->socket=-1;
2859         }
2860         return -1;
2861 }
2862
2863
2864
2865 /* close tcp_main's fd from a tcpconn
2866  * WARNING: call only in tcp_main context */
2867 inline static void tcpconn_close_main_fd(struct tcp_connection* tcpconn)
2868 {
2869         int fd;
2870         
2871         
2872         fd=tcpconn->s;
2873 #ifdef USE_TLS
2874         if (tcpconn->type==PROTO_TLS || tcpconn->type==PROTO_WSS)
2875                 tls_close(tcpconn, fd);
2876 #endif
2877 #ifdef TCP_FD_CACHE
2878         if (likely(cfg_get(tcp, tcp_cfg, fd_cache))) shutdown(fd, SHUT_RDWR);
2879 #endif /* TCP_FD_CACHE */
2880         if (unlikely(tcp_safe_close(fd)<0))
2881                 LM_ERR("(%p): %s close(%d) failed (flags 0x%x): %s (%d)\n", tcpconn,
2882                                         su2a(&tcpconn->rcv.src_su, sizeof(tcpconn->rcv.src_su)),
2883                                         fd, tcpconn->flags, strerror(errno), errno);
2884         tcpconn->s=-1;
2885 }
2886
2887
2888
2889 /* dec refcnt & frees the connection if refcnt==0
2890  * returns 1 if the connection is freed, 0 otherwise
2891  *
2892  * WARNING: use only from child processes */
2893 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn)
2894 {
2895         if (unlikely(atomic_dec_and_test(&tcpconn->refcnt))){
2896                 LM_DBG("destroying connection %p (%d, %d) flags %04x\n",
2897                                 tcpconn, tcpconn->id, tcpconn->s, tcpconn->flags);
2898                 /* sanity checks */
2899                 membar_read_atomic_op(); /* make sure we see the current flags */
2900                 if (unlikely(!(tcpconn->flags & F_CONN_FD_CLOSED) ||
2901                         (tcpconn->flags &
2902                                 (F_CONN_HASHED|F_CONN_MAIN_TIMER|
2903                                  F_CONN_READ_W|F_CONN_WRITE_W)) )){
2904                         LM_CRIT("%p bad flags = %0x\n", tcpconn, tcpconn->flags);
2905                         abort();
2906                 }
2907                 _tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
2908                 return 1;
2909         }
2910         return 0;
2911 }
2912
2913
2914
2915 /* simple destroy function (the connection should be already removed
2916  * from the hashes. refcnt 0 and the fds should not be watched anymore for IO)
2917  */
2918 inline static void tcpconn_destroy(struct tcp_connection* tcpconn)
2919 {
2920                 LM_DBG("destroying connection %p (%d, %d) flags %04x\n",
2921                                 tcpconn, tcpconn->id, tcpconn->s, tcpconn->flags);
2922                 if (unlikely(tcpconn->flags & F_CONN_HASHED)){
2923                         LM_CRIT("called with hashed connection (%p)\n", tcpconn);
2924                         /* try to continue */
2925                         if (likely(tcpconn->flags & F_CONN_MAIN_TIMER))
2926                                 local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2927                         TCPCONN_LOCK;
2928                                 _tcpconn_detach(tcpconn);
2929                                 tcpconn->flags &= ~(F_CONN_HASHED|F_CONN_MAIN_TIMER);
2930                         TCPCONN_UNLOCK;
2931                 }
2932                 if (likely(!(tcpconn->flags & F_CONN_FD_CLOSED))){
2933                         tcpconn_close_main_fd(tcpconn);
2934                         tcpconn->flags|=F_CONN_FD_CLOSED;
2935                         (*tcp_connections_no)--;
2936                         if (unlikely(tcpconn->type==PROTO_TLS || tcpconn->type==PROTO_WSS))
2937                                 (*tls_connections_no)--;
2938                 }
2939                 _tcpconn_free(tcpconn); /* destroys also the wbuf_q if still present*/
2940 }
2941
2942
2943
2944 /* tries to destroy the connection: dec. refcnt and if 0 destroys the
2945  *  connection, else it will mark it as BAD and close the main fds
2946  *
2947  * returns 1 if the connection was destroyed, 0 otherwise
2948  *
2949  * WARNING: - the connection _has_ to be removed from the hash and timer
2950  *  first (use tcpconn_try_unhash() for this )
2951  *         - the fd should not be watched anymore (io_watch_del()...)
2952  *         - must be called _only_ from the tcp_main process context
2953  *          (or else the fd will remain open)
2954  */
2955 inline static int tcpconn_put_destroy(struct tcp_connection* tcpconn)
2956 {
2957         if (unlikely((tcpconn->flags &
2958                         (F_CONN_WRITE_W|F_CONN_HASHED|F_CONN_MAIN_TIMER|F_CONN_READ_W)) )){
2959                 /* sanity check */
2960                 if (unlikely(tcpconn->flags & F_CONN_HASHED)){
2961                         LM_CRIT("called with hashed and/or"
2962                                                 "on timer connection (%p), flags = %0x\n",
2963                                                 tcpconn, tcpconn->flags);
2964                         /* try to continue */
2965                         if (likely(tcpconn->flags & F_CONN_MAIN_TIMER))
2966                                 local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
2967                         TCPCONN_LOCK;
2968                                 _tcpconn_detach(tcpconn);
2969                                 tcpconn->flags &= ~(F_CONN_HASHED|F_CONN_MAIN_TIMER);
2970                         TCPCONN_UNLOCK;
2971                 }else{
2972                         LM_CRIT("%p flags = %0x\n", tcpconn, tcpconn->flags);
2973                 }
2974         }
2975         tcpconn->state=S_CONN_BAD;
2976         /* in case it's still in a reader timer */
2977         tcpconn->timeout=get_ticks_raw();
2978         /* fast close: close fds now */
2979         if (likely(!(tcpconn->flags & F_CONN_FD_CLOSED))){
2980                 tcpconn_close_main_fd(tcpconn);
2981                 tcpconn->flags|=F_CONN_FD_CLOSED;
2982                 (*tcp_connections_no)--;
2983                 if (unlikely(tcpconn->type==PROTO_TLS || tcpconn->type==PROTO_WSS))
2984                                 (*tls_connections_no)--;
2985         }
2986         /* all the flags / ops on the tcpconn must be done prior to decrementing
2987          * the refcnt. and at least a membar_write_atomic_op() mem. barrier or
2988          *  a mb_atomic_* op must * be used to make sure all the changed flags are
2989          *  written into memory prior to the new refcnt value */
2990         if (unlikely(mb_atomic_dec_and_test(&tcpconn->refcnt))){
2991                 _tcpconn_free(tcpconn);
2992                 return 1;
2993         }
2994         return 0;
2995 }
2996
2997
2998
2999 /* try to remove a connection from the hashes and timer.
3000  * returns 1 if the connection was removed, 0 if not (connection not in
3001  *  hash)
3002  *
3003  * WARNING: call it only in the  tcp_main process context or else the
3004  *  timer removal won't work.
3005  */
3006 inline static int tcpconn_try_unhash(struct tcp_connection* tcpconn)
3007 {
3008         if (likely(tcpconn->flags & F_CONN_HASHED)){
3009                 tcpconn->state=S_CONN_BAD;
3010                 if (likely(tcpconn->flags & F_CONN_MAIN_TIMER)){
3011                         local_timer_del(&tcp_main_ltimer, &tcpconn->timer);
3012                         tcpconn->flags&=~F_CONN_MAIN_TIMER;
3013                 }else
3014                         /* in case it's still in a reader timer */
3015                         tcpconn->timeout=get_ticks_raw();
3016                 TCPCONN_LOCK;
3017                         if (tcpconn->flags & F_CONN_HASHED){
3018                                 tcpconn->flags&=~F_CONN_HASHED;
3019                                 _tcpconn_detach(tcpconn);
3020                                 TCPCONN_UNLOCK;
3021                         }else{
3022                                 /* tcp_send was faster and did unhash it itself */
3023                                 TCPCONN_UNLOCK;
3024                                 return 0;
3025                         }
3026 #ifdef TCP_ASYNC
3027                 /* empty possible write buffers (optional) */
3028                 if (unlikely(_wbufq_non_empty(tcpconn))){
3029                         lock_get(&tcpconn->write_lock);
3030                                 /* check again, while holding the lock */
3031                                 if (likely(_wbufq_non_empty(tcpconn)))
3032                                         _wbufq_destroy(&tcpconn->wbuf_q);
3033                         lock_release(&tcpconn->write_lock);
3034                 }
3035 #endif /* TCP_ASYNC */
3036                 return 1;
3037         }
3038         return 0;
3039 }
3040
3041
3042
3043 #ifdef SEND_FD_QUEUE
3044 struct send_fd_info{
3045         struct tcp_connection* tcp_conn;
3046         ticks_t expire;
3047         int unix_sock;
3048         unsigned int retries; /* debugging */
3049 };
3050
/* array based queue of pending send_fd requests */
struct tcp_send_fd_q{
	struct send_fd_info *data; /* start of the buffer */
	struct send_fd_info *crt;  /* next free slot inside the buffer */
	struct send_fd_info *end;  /* one past the last allocated slot */
};


/* queue instance set up by init_send_fd_queues() */
static struct tcp_send_fd_q send2child_q;
3059
3060
3061
3062 static int send_fd_queue_init(struct tcp_send_fd_q *q, unsigned int size)
3063 {
3064         q->data=pkg_malloc(size*sizeof(struct send_fd_info));
3065         if (q->data==0){
3066                 LM_ERR("out of memory\n");
3067                 return -1;
3068         }
3069         q->crt=&q->data[0];
3070         q->end=&q->data[size];
3071         return 0;
3072 }
3073
3074 static void send_fd_queue_destroy(struct tcp_send_fd_q *q)
3075 {
3076         if (q->data){
3077                 pkg_free(q->data);
3078                 q->data=0;
3079                 q->crt=q->end=0;
3080         }
3081 }
3082
3083
3084
3085 static int init_send_fd_queues(void)
3086 {
3087         if (send_fd_queue_init(&send2child_q, SEND_FD_QUEUE_SIZE)!=0)
3088                 goto error;
3089         return 0;
3090 error:
3091         LM_ERR("init failed\n");
3092         return -1;
3093 }
3094
3095
3096
3097 static void destroy_send_fd_queues(void)
3098 {
3099         send_fd_queue_destroy(&send2child_q);
3100 }
3101
3102
3103
3104
3105 inline static int send_fd_queue_add(    struct tcp_send_fd_q* q, 
3106                                                                                 int unix_sock,
3107                                                                                 struct tcp_connection *t)
3108 {
3109         struct send_fd_info* tmp;
3110         unsigned long new_size;
3111         
3112         if (q->crt>=q->end){
3113                 new_size=q->end-&q->data[0];
3114                 if (new_size< MAX_SEND_FD_QUEUE_SIZE/2){
3115                         new_size*=2;
3116                 }else new_size=MAX_SEND_FD_QUEUE_SIZE;
3117                 if (unlikely(q->crt>=&q->data[new_size])){
3118                         LM_ERR("queue full: %ld/%ld\n",
3119                                         (long)(q->crt-&q->data[0]-1), new_size);
3120                         goto error;
3121                 }
3122                 LM_CRIT("queue full: %ld, extending to %ld\n",
3123                                 (long)(q->end-&q->data[0]), new_size);
3124                 tmp=pkg_realloc(q->data, new_size*sizeof(struct send_fd_info));
3125                 if (unlikely(tmp==0)){
3126                         LM_ERR("out of memory\n");
3127                         goto error;
3128                 }
3129                 q->crt=(q->crt-&q->data[0])+tmp;
3130                 q->data=tmp;
3131                 q->end=&q->data[new_size];
3132         }
3133         q->crt->tcp_conn=t;
3134         q->crt->unix_sock=unix_sock;
3135         q->crt->expire=get_ticks_raw()+SEND_FD_QUEUE_TIMEOUT;
3136         q->crt->retries=0;
3137         q->crt++;
3138         return 0;
3139 error:
3140         return -1;
3141 }
3142
3143
3144
3145 inline static void send_fd_queue_run(struct tcp_send_fd_q* q)
3146 {
3147         struct send_fd_info* p;
3148         struct send_fd_info* t;
3149         
3150         for (p=t=&q->data[0]; p<q->crt; p++){
3151                 if (unlikely(p->tcp_conn->state == S_CONN_BAD ||
3152                                          p->tcp_conn->flags & F_CONN_FD_CLOSED ||
3153                                          p->tcp_conn->s ==-1)) {
3154                         /* bad and/or already closed connection => remove */
3155                         goto rm_con;
3156                 }
3157                 if (unlikely(send_fd(p->unix_sock, &(p->tcp_conn),
3158                                         sizeof(struct tcp_connection*), p->tcp_conn->s)<=0)){
3159                         if ( ((errno==EAGAIN)||(errno==EWOULDBLOCK)) && 
3160                                                         ((s_ticks_t)(p->expire-get_ticks_raw())>0)){
3161                                 /* leave in queue for a future try */
3162                                 *t=*p;
3163                                 t->retries++;
3164                                 t++;
3165                         }else{
3166                                 LM_ERR("send_fd failed on socket %d , queue entry %ld, retries %d,"
3167                                                    " connection %p, tcp socket %d, errno=%d (%s) \n",
3168                                                    p->unix_sock, (long)(p-&q->data[0]), p->retries,
3169                                                    p->tcp_conn, p->tcp_conn->s, errno,
3170                                                    strerror(errno));
3171 rm_con:
3172 #ifdef TCP_ASYNC
3173                                 /* if a connection is on the send_fd queue it means it's
3174                                    not watched for read anymore => could be watched only for
3175               &n