core: adjustments for cleaner compiler analyzing reports
[kamailio] / src / core / tcp_main.c
1 /*
2  * Copyright (C) 2001-2003 FhG Fokus
3  *
4  * This file is part of Kamailio, a free SIP server.
5  *
6  * Kamailio is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version
10  *
11  * Kamailio is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19  */
20
21 /** Kamailio core: tcp main/dispatcher and tcp send functions.
22  * @file tcp_main.c
23  * @ingroup core
24  * Module: @ref core
25  */
26
27
28 #ifdef USE_TCP
29
30
31 #ifndef SHM_MEM
32 #error "shared memory support needed (add -DSHM_MEM to Makefile.defs)"
33 #endif
34
35 #define HANDLE_IO_INLINE
36 #include "io_wait.h" /* include first to make sure the needed features are
37                                                 turned on (e.g. _GNU_SOURCE for POLLRDHUP) */
38
39 #include <sys/time.h>
40 #include <sys/types.h>
41 #include <sys/select.h>
42 #include <sys/socket.h>
43 #ifdef HAVE_FILIO_H
44 #include <sys/filio.h> /* needed on solaris 2.x for FIONREAD */
45 #elif defined __OS_solaris
46 #define BSD_COMP  /* needed on older solaris for FIONREAD */
47 #endif /* HAVE_FILIO_H / __OS_solaris */
48 #include <sys/ioctl.h>  /* ioctl() used on write error */
49 #include <netinet/in.h>
50 #include <netinet/in_systm.h>
51 #include <netinet/ip.h>
52 #include <netinet/tcp.h>
53 #include <sys/uio.h>  /* writev*/
54 #include <netdb.h>
55 #include <stdlib.h> /*exit() */
56
57 #include <unistd.h>
58
59 #include <errno.h>
60 #include <string.h>
61
62 #ifdef HAVE_SELECT
63 #include <sys/select.h>
64 #endif
65 #include <poll.h>
66
67
68 #include "ip_addr.h"
69 #include "pass_fd.h"
70 #include "tcp_conn.h"
71 #include "globals.h"
72 #include "pt.h"
73 #include "locking.h"
74 #include "mem/mem.h"
75 #include "mem/shm_mem.h"
76 #include "timer.h"
77 #include "sr_module.h"
78 #include "tcp_server.h"
79 #include "tcp_init.h"
80 #include "tcp_int_send.h"
81 #include "tcp_stats.h"
82 #include "tcp_ev.h"
83 #include "tsend.h"
84 #include "timer_ticks.h"
85 #include "local_timer.h"
86 #ifdef CORE_TLS
87 #include "tls/tls_server.h"
88 #define tls_loaded() 1
89 #else
90 #include "tls_hooks_init.h"
91 #include "tls_hooks.h"
92 #endif /* CORE_TLS*/
93 #ifdef USE_DST_BLACKLIST
94 #include "dst_blacklist.h"
95 #endif /* USE_DST_BLACKLIST */
96
97 #include "tcp_info.h"
98 #include "tcp_options.h"
99 #include "ut.h"
100 #include "cfg/cfg_struct.h"
101
102 #define local_malloc pkg_malloc
103 #define local_free   pkg_free
104
105 #include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
106
107
108 #ifdef NO_MSG_DONTWAIT
109 #ifndef MSG_DONTWAIT
110 /* should work inside tcp_main */
111 #define MSG_DONTWAIT 0
112 #endif
113 #endif /*NO_MSG_DONTWAIT */
114
115
/* don't pass a new connection immediately to a child, wait for some data
 * on it first */
#define TCP_PASS_NEW_CONNECTION_ON_DATA
#define TCP_LISTEN_BACKLOG 1024
/* queue send fd requests on EAGAIN, instead of sending them immediately */
#define SEND_FD_QUEUE
#define TCP_CHILD_NON_BLOCKING
#ifdef SEND_FD_QUEUE
/* the send fd queue needs TCP_CHILD_NON_BLOCKING */
#ifndef TCP_CHILD_NON_BLOCKING
#define TCP_CHILD_NON_BLOCKING
#endif
#define MAX_SEND_FD_QUEUE_SIZE tcp_main_max_fd_no
/* initial size */
#define SEND_FD_QUEUE_SIZE 128
/* 2 s */
#define SEND_FD_QUEUE_TIMEOUT MS_TO_TICKS(2000)
#endif

/* minimum interval local_timer_run() is allowed to run, in ticks
 * (once per tick) */
#define TCPCONN_TIMEOUT_MIN_RUN 1
/* 1 tick */
#define TCPCONN_WAIT_TIMEOUT 1
135
#ifdef TCP_ASYNC
/* total number of bytes currently held in the write buffer queues; the
 * wbufq functions in this file update it with atomic_add_int() */
static unsigned int *tcp_total_wq = 0;
#endif
139
140
/* fd type tags */
enum fd_types {
	F_NONE,
	F_SOCKINFO, /* a tcp_listen fd */
	F_TCPCONN,
	F_TCPCHILD,
	F_PROC
};
143
144
#ifdef TCP_FD_CACHE

/* number of slots in the fd cache */
#define TCP_FD_CACHE_SIZE 8

/* one fd cache slot: a connection pointer together with an id and an fd */
struct fd_cache_entry {
	struct tcp_connection *con;
	int id;
	int fd;
};


static struct fd_cache_entry fd_cache[TCP_FD_CACHE_SIZE];
#endif /* TCP_FD_CACHE */
158
159 static int is_tcp_main=0;
160
161
162 enum poll_types tcp_poll_method=0; /* by default choose the best method */
163 int tcp_main_max_fd_no=0;
164 int tcp_max_connections=DEFAULT_TCP_MAX_CONNECTIONS;
165 int tls_max_connections=DEFAULT_TLS_MAX_CONNECTIONS;
166
167 static union sockaddr_union tcp_source_ipv4_addr; /* saved bind/srv v4 addr. */
168 static union sockaddr_union* tcp_source_ipv4=0;
169 static union sockaddr_union tcp_source_ipv6_addr; /* saved bind/src v6 addr. */
170 static union sockaddr_union* tcp_source_ipv6=0;
171
172 static int* tcp_connections_no=0; /* current tcp (+tls) open connections */
173 static int* tls_connections_no=0; /* current tls open connections */
174
175 /* connection hash table (after ip&port) , includes also aliases */
176 struct tcp_conn_alias** tcpconn_aliases_hash=0;
177 /* connection hash table (after connection id) */
178 struct tcp_connection** tcpconn_id_hash=0;
179 gen_lock_t* tcpconn_lock=0;
180
181 struct tcp_child* tcp_children=0;
182 static int* connection_id=0; /*  unique for each connection, used for 
183                                                                 quickly finding the corresponding connection
184                                                                 for a reply */
185 int unix_tcp_sock;
186
187 static int tcp_proto_no=-1; /* tcp protocol number as returned by
188                                                            getprotobyname */
189
190 static io_wait_h io_h;
191
192 static struct local_timer tcp_main_ltimer;
193 static ticks_t tcp_main_prev_ticks;
194
195 /* tell if there are tcp workers that should handle only specific socket
196  * - used to optimize the search of least loaded worker for a tcp socket
197  * - 0 - no workers per tcp sockets have been set
198  * - 1 + generic_workers - when there are workers per tcp sockets
199  */
200 static int tcp_sockets_gworkers = 0;
201
202 static ticks_t tcpconn_main_timeout(ticks_t , struct timer_ln* , void* );
203
204 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
205                                                                                 struct ip_addr* l_ip, int l_port,
206                                                                                 int flags);
207
208
209
210 /* sets source address used when opening new sockets and no source is specified
211  *  (by default the address is choosen by the kernel)
212  * Should be used only on init.
213  * returns -1 on error */
214 int tcp_set_src_addr(struct ip_addr* ip)
215 {
216         switch (ip->af){
217                 case AF_INET:
218                         ip_addr2su(&tcp_source_ipv4_addr, ip, 0);
219                         tcp_source_ipv4=&tcp_source_ipv4_addr;
220                         break;
221                 case AF_INET6:
222                         ip_addr2su(&tcp_source_ipv6_addr, ip, 0);
223                         tcp_source_ipv6=&tcp_source_ipv6_addr;
224                         break;
225                 default:
226                         return -1;
227         }
228         return 0;
229 }
230
231
232
/* Applies the configured keepalive options to socket s.
 * Each option is compiled in only if the matching HAVE_* macro is defined
 * and is set only if its tcp config value is non-zero.
 * returns -1 if enabling SO_KEEPALIVE fails (the remaining options are
 * then skipped), 0 otherwise; failures of the other options are only
 * logged */
static inline int init_sock_keepalive(int s)
{
	int optval;

#ifdef HAVE_SO_KEEPALIVE
	if (cfg_get(tcp, tcp_cfg, keepalive)){
		optval=1;
		if (setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &optval,
					sizeof(optval))<0){
			LM_WARN("failed to enable SO_KEEPALIVE: %s\n", strerror(errno));
			return -1;
		}
	}
#endif
#ifdef HAVE_TCP_KEEPINTVL
	optval=cfg_get(tcp, tcp_cfg, keepintvl);
	if (optval && setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &optval,
				sizeof(optval))<0){
		LM_WARN("failed to set keepalive probes interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPIDLE
	optval=cfg_get(tcp, tcp_cfg, keepidle);
	if (optval && setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &optval,
				sizeof(optval))<0){
		LM_WARN("failed to set keepalive idle interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPCNT
	optval=cfg_get(tcp, tcp_cfg, keepcnt);
	if (optval && setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &optval,
				sizeof(optval))<0){
		LM_WARN("failed to set maximum keepalive count: %s\n", strerror(errno));
	}
#endif
	return 0;
}
273
274
275
276 /* set all socket/fd options for new sockets (e.g. before connect): 
277  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
278  *
279  * return -1 on error */
280 static int init_sock_opt(int s, int af)
281 {
282         int flags;
283         int optval;
284         
285 #ifdef DISABLE_NAGLE
286         flags=1;
287         if ( (tcp_proto_no!=-1) && (setsockopt(s, tcp_proto_no , TCP_NODELAY,
288                                         &flags, sizeof(flags))<0) ){
289                 LM_WARN("could not disable Nagle: %s\n", strerror(errno));
290         }
291 #endif
292         /* tos*/
293         optval = tos;
294         if(af==AF_INET){
295                 if (setsockopt(s, IPPROTO_IP, IP_TOS, (void*)&optval,
296                                         sizeof(optval)) ==-1){
297                         LM_WARN("setsockopt tos: %s\n", strerror(errno));
298                         /* continue since this is not critical */
299                 }
300         } else if(af==AF_INET6){
301                 if (setsockopt(s, IPPROTO_IPV6, IPV6_TCLASS,
302                                         (void*)&optval, sizeof(optval)) ==-1) {
303                         LM_WARN("setsockopt v6 tos: %s\n", strerror(errno));
304                         /* continue since this is not critical */
305                 }
306         }
307
308 #if  !defined(TCP_DONT_REUSEADDR) 
309         optval=1;
310         if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
311                                                 (void*)&optval, sizeof(optval))==-1){
312                 LM_ERR("setsockopt SO_REUSEADDR %s\n", strerror(errno));
313                 /* continue, not critical */
314         }
315 #endif /* !TCP_DONT_REUSEADDR */
316
317 #ifdef SO_REUSEPORT
318         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
319                 if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT,
320                                 (void*)&optval, sizeof(optval))==-1) {
321                         LM_ERR("setsockopt %s\n", strerror(errno));
322                 }
323         }
324 #endif
325
326 #ifdef HAVE_TCP_SYNCNT
327         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
328                 if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
329                                                 sizeof(optval))<0){
330                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
331                 }
332         }
333 #endif
334 #ifdef HAVE_TCP_LINGER2
335         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
336                 if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
337                                                 sizeof(optval))<0){
338                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
339                 }
340         }
341 #endif
342 #ifdef HAVE_TCP_QUICKACK
343         if (cfg_get(tcp, tcp_cfg, delayed_ack)){
344                 optval=0; /* reset quick ack => delayed ack */
345                 if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
346                                                 sizeof(optval))<0){
347                         LM_WARN("failed to reset TCP_QUICKACK: %s\n", strerror(errno));
348                 }
349         }
350 #endif /* HAVE_TCP_QUICKACK */
351         init_sock_keepalive(s);
352         
353         /* non-blocking */
354         flags=fcntl(s, F_GETFL);
355         if (flags==-1){
356                 LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
357                 goto error;
358         }
359         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
360                 LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
361                 goto error;
362         }
363         return 0;
364 error:
365         return -1;
366 }
367
368
369
/* Sets all the socket/fd options for "accepted" sockets.
 *  Only non-blocking mode is set, since the rest is inherited from the
 *  "parent" (listening) socket.
 *  Note: setting O_NONBLOCK is required on linux but it's not needed on
 *        BSD and possibly solaris (where the flag is inherited from the
 *        parent socket). However since there is no standard document
 *        requiring a specific behaviour in this case it's safer to always
 *        set it (at least for now)  --andrei
 *  TODO: check on which OSes O_NONBLOCK is inherited and make this
 *        function a nop.
 *
 * return 0 on success, -1 on error */
static int init_sock_opt_accept(int s)
{
	int fl;

	/* non-blocking: read the current flags and add O_NONBLOCK */
	fl=fcntl(s, F_GETFL);
	if (fl==-1){
		LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	if (fcntl(s, F_SETFL, fl|O_NONBLOCK)==-1){
		LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	return 0;
}
400
401
402
/** close a socket, handling errno.
 * On EINTR, repeat the close().
 * On some systems (e.g. linux) the descriptor is already released when
 * close() returns EINTR, so the repeated close() fails with EBADF; an EBADF
 * seen only after an EINTR retry is therefore treated as success instead of
 * being reported as a close failure.
 * Filter expected errors (return success if close() failed because
 * EPIPE, ECONNRST a.s.o). Note that this happens on *BSDs (on linux close()
 * does not fail for socket level errors).
 * @param s - open valid socket (a negative value is ignored and 0 is
 *            returned).
 * @return - 0 on success, < 0 on error (whatever close() returns). On error
 *           errno is set.
 */
static int tcp_safe_close(int s)
{
	int ret;
	int interrupted; /* set if at least one close() returned EINTR */

	if(s<0)
		return 0;

	interrupted = 0;
	for(;;) {
		ret = close(s);
		if (ret >= 0)
			return ret;
		if (errno != EINTR)
			break;
		interrupted = 1;
	}
	switch(errno) {
		case EBADF:
			/* the interrupted close() already released the fd */
			if (interrupted)
				ret = 0;
			break;
		case EPIPE:
		case ENOTCONN:
		case ECONNRESET:
		case ECONNREFUSED:
		case ENETUNREACH:
		case EHOSTUNREACH:
			/* on *BSD we really get these errors at close() time
			   => ignore them */
			ret = 0;
			break;
		default:
			break;
	}
	return ret;
}
440
441
442
443 /* blocking connect on a non-blocking fd; it will timeout after
444  * tcp_connect_timeout 
445  * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
446  * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
447  */
448 static int tcp_blocking_connect(int fd, int type, snd_flags_t* send_flags,
449                                                                 const struct sockaddr *servaddr,
450                                                                 socklen_t addrlen)
451 {
452         int n;
453 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
454         fd_set sel_set;
455         fd_set orig_set;
456         struct timeval timeout;
457 #else
458         struct pollfd pf;
459 #endif
460         int elapsed;
461         int to;
462         int ticks;
463         int err;
464         unsigned int err_len;
465         int poll_err;
466         
467         poll_err=0;
468         to=cfg_get(tcp, tcp_cfg, connect_timeout_s);
469         ticks=get_ticks();
470 again:
471         n=connect(fd, servaddr, addrlen);
472         if (n==-1){
473                 if (errno==EINTR){
474                         elapsed=(get_ticks()-ticks)*TIMER_TICK;
475                         if (elapsed<to)         goto again;
476                         else goto error_timeout;
477                 }
478                 if (errno!=EINPROGRESS && errno!=EALREADY){
479                         goto error_errno;
480                 }
481         }else goto end;
482         
483         /* poll/select loop */
484 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
485                 FD_ZERO(&orig_set);
486                 FD_SET(fd, &orig_set);
487 #else
488                 pf.fd=fd;
489                 pf.events=POLLOUT;
490 #endif
491         while(1){
492                 elapsed=(get_ticks()-ticks)*TIMER_TICK;
493                 if (elapsed>=to)
494                         goto error_timeout;
495 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
496                 sel_set=orig_set;
497                 timeout.tv_sec=to-elapsed;
498                 timeout.tv_usec=0;
499                 n=select(fd+1, 0, &sel_set, 0, &timeout);
500 #else
501                 n=poll(&pf, 1, (to-elapsed)*1000);
502 #endif
503                 if (n<0){
504                         if (errno==EINTR) continue;
505                         LM_ERR("%s: poll/select failed: (%d) %s\n",
506                                         su2a((union sockaddr_union*)servaddr, addrlen),
507                                         errno, strerror(errno));
508                         goto error;
509                 }else if (n==0) /* timeout */ continue;
510 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
511                 if (FD_ISSET(fd, &sel_set))
512 #else
513                 if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
514                         LM_ERR("%s: poll error: flags %x\n",
515                                         su2a((union sockaddr_union*)servaddr, addrlen),
516                                         pf.revents);
517                         poll_err=1;
518                 }
519 #endif
520                 {
521                         err_len=sizeof(err);
522                         getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
523                         if ((err==0) && (poll_err==0)) goto end;
524                         if (err!=EINPROGRESS && err!=EALREADY){
525                                 LM_ERR("%s: SO_ERROR (%d) %s\n",
526                                                 su2a((union sockaddr_union*)servaddr, addrlen),
527                                                 err, strerror(err));
528                                 errno=err;
529                                 goto error_errno;
530                         }
531                 }
532         }
533 error_errno:
534         switch(errno){
535                 case ENETUNREACH:
536                 case EHOSTUNREACH:
537 #ifdef USE_DST_BLACKLIST
538                         dst_blacklist_su(BLST_ERR_CONNECT, type,
539                                                          (union sockaddr_union*)servaddr, send_flags, 0);
540 #endif /* USE_DST_BLACKLIST */
541                         TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0,
542                                                         (union sockaddr_union*)servaddr, type);
543                         break;
544                 case ETIMEDOUT:
545 #ifdef USE_DST_BLACKLIST
546                         dst_blacklist_su(BLST_ERR_CONNECT, type,
547                                                          (union sockaddr_union*)servaddr, send_flags, 0);
548 #endif /* USE_DST_BLACKLIST */
549                         TCP_EV_CONNECT_TIMEOUT(errno, 0, 0,
550                                                         (union sockaddr_union*)servaddr, type);
551                         break;
552                 case ECONNREFUSED:
553                 case ECONNRESET:
554 #ifdef USE_DST_BLACKLIST
555                         dst_blacklist_su(BLST_ERR_CONNECT, type,
556                                                          (union sockaddr_union*)servaddr, send_flags, 0);
557 #endif /* USE_DST_BLACKLIST */
558                         TCP_EV_CONNECT_RST(errno, 0, 0,
559                                                         (union sockaddr_union*)servaddr, type);
560                         break;
561                 case EAGAIN: /* not posix, but supported on linux and bsd */
562                         TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0,
563                                                         (union sockaddr_union*)servaddr, type);
564                         break;
565                 default:
566                         TCP_EV_CONNECT_ERR(errno, 0, 0,
567                                                                 (union sockaddr_union*)servaddr, type);
568         }
569         LM_ERR("%s: (%d) %s\n",
570                         su2a((union sockaddr_union*)servaddr, addrlen),
571                         errno, strerror(errno));
572         goto error;
573 error_timeout:
574         /* timeout */
575 #ifdef USE_DST_BLACKLIST
576         dst_blacklist_su(BLST_ERR_CONNECT, type,
577                                                 (union sockaddr_union*)servaddr, send_flags, 0);
578 #endif /* USE_DST_BLACKLIST */
579         TCP_EV_CONNECT_TIMEOUT(0, 0, 0, (union sockaddr_union*)servaddr, type);
580         LM_ERR("%s: timeout %d s elapsed from %d s\n",
581                                 su2a((union sockaddr_union*)servaddr, addrlen),
582                                 elapsed, cfg_get(tcp, tcp_cfg, connect_timeout_s));
583 error:
584         TCP_STATS_CONNECT_FAILED();
585         return -1;
586 end:
587         return 0;
588 }
589
590
591
592 #ifdef TCP_ASYNC
593
594
/* write buffer queue emptiness tests; unsafe versions (the macros
 * themselves do no locking) */
#define _wbufq_empty(con)     (!(con)->wbuf_q.first)
#define _wbufq_non_empty(con) (!_wbufq_empty(con))
599
600
601 /* unsafe version, call while holding the connection write lock */
602 inline static int _wbufq_add(struct  tcp_connection* c, const char* data, 
603                                                         unsigned int size)
604 {
605         struct tcp_wbuffer_queue* q;
606         struct tcp_wbuffer* wb;
607         unsigned int last_free;
608         unsigned int wb_size;
609         unsigned int crt_size;
610         ticks_t t;
611         
612         q=&c->wbuf_q;
613         t=get_ticks_raw();
614         if (unlikely(   ((q->queued+size)>cfg_get(tcp, tcp_cfg, tcpconn_wq_max)) ||
615                                         ((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max)) ||
616                                         (q->first &&
617                                         TICKS_LT(q->wr_timeout, t)) )){
618                 LM_ERR("(%d bytes): write queue full or timeout "
619                                         " (%d, total %d, last write %d s ago)\n",
620                                         size, q->queued, *tcp_total_wq,
621                                         TICKS_TO_S(t-(q->wr_timeout-
622                                                                 cfg_get(tcp, tcp_cfg, send_timeout))));
623                 if (q->first && TICKS_LT(q->wr_timeout, t)){
624                         if (unlikely(c->state==S_CONN_CONNECT)){
625 #ifdef USE_DST_BLACKLIST
626                                 (void)dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
627                                                                                 &c->rcv.src_su, &c->send_flags, 0);
628 #endif /* USE_DST_BLACKLIST */
629                                 TCP_EV_CONNECT_TIMEOUT(0, TCP_LADDR(c), TCP_LPORT(c),
630                                                                                         TCP_PSU(c), TCP_PROTO(c));
631                                 TCP_STATS_CONNECT_FAILED();
632                         }else{
633 #ifdef USE_DST_BLACKLIST
634                                 (void)dst_blacklist_su( BLST_ERR_SEND, c->rcv.proto,
635                                                                         &c->rcv.src_su, &c->send_flags, 0);
636 #endif /* USE_DST_BLACKLIST */
637                                 TCP_EV_SEND_TIMEOUT(0, &c->rcv);
638                                 TCP_STATS_SEND_TIMEOUT();
639                         }
640                 }else{
641                         /* if it's not a timeout => queue full */
642                         TCP_EV_SENDQ_FULL(0, &c->rcv);
643                         TCP_STATS_SENDQ_FULL();
644                 }
645                 goto error;
646         }
647         
648         if (unlikely(q->last==0)){
649                 wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
650                 wb=shm_malloc(sizeof(*wb)+wb_size-1);
651                 if (unlikely(wb==0))
652                         goto error;
653                 wb->b_size=wb_size;
654                 wb->next=0;
655                 q->last=wb;
656                 q->first=wb;
657                 q->last_used=0;
658                 q->offset=0;
659                 q->wr_timeout=get_ticks_raw()+
660                         ((c->state==S_CONN_CONNECT)?
661                                         S_TO_TICKS(cfg_get(tcp, tcp_cfg, connect_timeout_s)):
662                                         cfg_get(tcp, tcp_cfg, send_timeout));
663         }else{
664                 wb=q->last;
665         }
666         
667         while(size){
668                 last_free=wb->b_size-q->last_used;
669                 if (last_free==0){
670                         wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
671                         wb=shm_malloc(sizeof(*wb)+wb_size-1);
672                         if (unlikely(wb==0))
673                                 goto error;
674                         wb->b_size=wb_size;
675                         wb->next=0;
676                         q->last->next=wb;
677                         q->last=wb;
678                         q->last_used=0;
679                         last_free=wb->b_size;
680                 }
681                 crt_size=MIN_unsigned(last_free, size);
682                 memcpy(wb->buf+q->last_used, data, crt_size);
683                 q->last_used+=crt_size;
684                 size-=crt_size;
685                 data+=crt_size;
686                 q->queued+=crt_size;
687                 atomic_add_int((int*)tcp_total_wq, crt_size);
688         }
689         return 0;
690 error:
691         return -1;
692 }
693
694
695
696 /* unsafe version, call while holding the connection write lock
697  * inserts data at the beginning, it ignores the max queue size checks and
698  * the timeout (use sparingly)
699  * Note: it should never be called on a write buffer after wbufq_run() */
700 inline static int _wbufq_insert(struct  tcp_connection* c, const char* data, 
701                                                         unsigned int size)
702 {
703         struct tcp_wbuffer_queue* q;
704         struct tcp_wbuffer* wb;
705         
706         q=&c->wbuf_q;
707         if (likely(q->first==0)) /* if empty, use wbufq_add */
708                 return _wbufq_add(c, data, size);
709         
710         if (unlikely((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max))){
711                 LM_ERR("(%d bytes): write queue full"
712                                         " (%d, total %d, last write %d s ago)\n",
713                                         size, q->queued, *tcp_total_wq,
714                                         TICKS_TO_S(get_ticks_raw()-q->wr_timeout-
715                                                                         cfg_get(tcp, tcp_cfg, send_timeout)));
716                 goto error;
717         }
718         if (unlikely(q->offset)){
719                 LM_CRIT("non-null offset %d (bad call, should"
720                                 "never be called after the wbufq_run())\n", q->offset);
721                 goto error;
722         }
723         if ((q->first==q->last) && ((q->last->b_size-q->last_used)>=size)){
724                 /* one block with enough space in it for size bytes */
725                 memmove(q->first->buf+size, q->first->buf, q->last_used);
726                 memcpy(q->first->buf, data, size);
727                 q->last_used+=size;
728         }else{
729                 /* create a size bytes block directly */
730                 wb=shm_malloc(sizeof(*wb)+size-1);
731                 if (unlikely(wb==0))
732                         goto error;
733                 wb->b_size=size;
734                 /* insert it */
735                 wb->next=q->first;
736                 q->first=wb;
737                 memcpy(wb->buf, data, size);
738         }
739         
740         q->queued+=size;
741         atomic_add_int((int*)tcp_total_wq, size);
742         return 0;
743 error:
744         return -1;
745 }
746
747
748
749 /* unsafe version, call while holding the connection write lock */
750 inline static void _wbufq_destroy( struct  tcp_wbuffer_queue* q)
751 {
752         struct tcp_wbuffer* wb;
753         struct tcp_wbuffer* next_wb;
754         int unqueued;
755         
756         unqueued=0;
757         if (likely(q->first)){
758                 wb=q->first;
759                 do{
760                         next_wb=wb->next;
761                         unqueued+=(wb==q->last)?q->last_used:wb->b_size;
762                         if (wb==q->first)
763                                 unqueued-=q->offset;
764                         shm_free(wb);
765                         wb=next_wb;
766                 }while(wb);
767         }
768         memset(q, 0, sizeof(*q));
769         atomic_add_int((int*)tcp_total_wq, -unqueued);
770 }
771
772
773
774 /* tries to empty the queue  (safe version, c->write_lock must not be hold)
775  * returns -1 on error, bytes written on success (>=0) 
776  * if the whole queue is emptied => sets *empty*/
777 inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
778 {
779         struct tcp_wbuffer_queue* q;
780         struct tcp_wbuffer* wb;
781         int n;
782         int ret;
783         int block_size;
784         char* buf;
785         
786         *empty=0;
787         ret=0;
788         lock_get(&c->write_lock);
789         q=&c->wbuf_q;
790         while(q->first){
791                 block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
792                                                 q->offset;
793                 buf=q->first->buf+q->offset;
794                 n=_tcpconn_write_nb(fd, c, buf, block_size);
795                 if (likely(n>0)){
796                         ret+=n;
797                         if (likely(n==block_size)){
798                                 wb=q->first;
799                                 q->first=q->first->next; 
800                                 shm_free(wb);
801                                 q->offset=0;
802                                 q->queued-=block_size;
803                                 atomic_add_int((int*)tcp_total_wq, -block_size);
804                         }else{
805                                 q->offset+=n;
806                                 q->queued-=n;
807                                 atomic_add_int((int*)tcp_total_wq, -n);
808                                 break;
809                         }
810                 }else{
811                         if (n<0){
812                                 /* EINTR is handled inside _tcpconn_write_nb */
813                                 if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
814                                         if (unlikely(c->state==S_CONN_CONNECT)){
815                                                 switch(errno){
816                                                         case ENETUNREACH:
817                                                         case EHOSTUNREACH: /* not posix for send() */
818 #ifdef USE_DST_BLACKLIST
819                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
820                                                                                                         c->rcv.proto,
821                                                                                                         &c->rcv.src_su,
822                                                                                                         &c->send_flags, 0);
823 #endif /* USE_DST_BLACKLIST */
824                                                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
825                                                                                                         TCP_LPORT(c), TCP_PSU(c),
826                                                                                                         TCP_PROTO(c));
827                                                                 break;
828                                                         case ECONNREFUSED:
829                                                         case ECONNRESET:
830 #ifdef USE_DST_BLACKLIST
831                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
832                                                                                                         c->rcv.proto,
833                                                                                                         &c->rcv.src_su,
834                                                                                                         &c->send_flags, 0);
835 #endif /* USE_DST_BLACKLIST */
836                                                                 TCP_EV_CONNECT_RST(0, TCP_LADDR(c),
837                                                                                                         TCP_LPORT(c), TCP_PSU(c),
838                                                                                                         TCP_PROTO(c));
839                                                                 break;
840                                                         default:
841                                                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
842                                                                                                         TCP_LPORT(c), TCP_PSU(c),
843                                                                                                         TCP_PROTO(c));
844                                                 }
845                                                 TCP_STATS_CONNECT_FAILED();
846                                         }else{
847                                                 switch(errno){
848                                                         case ECONNREFUSED:
849                                                         case ECONNRESET:
850                                                                 TCP_STATS_CON_RESET();
851                                                                 /* no break */
852                                                         case ENETUNREACH:
853                                                         case EHOSTUNREACH: /* not posix for send() */
854 #ifdef USE_DST_BLACKLIST
855                                                                 dst_blacklist_su(BLST_ERR_SEND,
856                                                                                                         c->rcv.proto,
857                                                                                                         &c->rcv.src_su,
858                                                                                                         &c->send_flags, 0);
859 #endif /* USE_DST_BLACKLIST */
860                                                                 break;
861                                                 }
862                                         }
863                                         ret=-1;
864                                         LM_ERR("%s [%d]\n", strerror(errno), errno);
865                                 }
866                         }
867                         break;
868                 }
869         }
870         if (likely(q->first==0)){
871                 q->last=0;
872                 q->last_used=0;
873                 q->offset=0;
874                 *empty=1;
875         }
876         lock_release(&c->write_lock);
877         if (likely(ret>0)){
878                 q->wr_timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, send_timeout);
879                 if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
880                         TCP_STATS_ESTABLISHED(c->state);
881                         c->state=S_CONN_OK;
882                 }
883         }
884         return ret;
885 }
886
887 #endif /* TCP_ASYNC */
888
889
890
891 #if 0
892 /* blocking write even on non-blocking sockets 
893  * if TCP_TIMEOUT will return with error */
894 static int tcp_blocking_write(struct tcp_connection* c, int fd, char* buf,
895                                                                 unsigned int len)
896 {
897         int n;
898         fd_set sel_set;
899         struct timeval timeout;
900         int ticks;
901         int initial_len;
902         
903         initial_len=len;
904 again:
905         
906         n=send(fd, buf, len,
907 #ifdef HAVE_MSG_NOSIGNAL
908                         MSG_NOSIGNAL
909 #else
910                         0
911 #endif
912                 );
913         if (n<0){
914                 if (errno==EINTR)       goto again;
915                 else if (errno!=EAGAIN && errno!=EWOULDBLOCK){
916                         LM_ERR("failed to send: (%d) %s\n", errno, strerror(errno));
917                         TCP_EV_SEND_TIMEOUT(errno, &c->rcv);
918                         TCP_STATS_SEND_TIMEOUT();
919                         goto error;
920                 }
921         }else if (n<len){
922                 /* partial write */
923                 buf+=n;
924                 len-=n;
925         }else{
926                 /* success: full write */
927                 goto end;
928         }
929         while(1){
930                 FD_ZERO(&sel_set);
931                 FD_SET(fd, &sel_set);
932                 timeout.tv_sec=tcp_send_timeout;
933                 timeout.tv_usec=0;
934                 ticks=get_ticks();
935                 n=select(fd+1, 0, &sel_set, 0, &timeout);
936                 if (n<0){
937                         if (errno==EINTR) continue; /* signal, ignore */
938                         LM_ERR("select failed: (%d) %s\n", errno, strerror(errno));
939                         goto error;
940                 }else if (n==0){
941                         /* timeout */
942                         if (get_ticks()-ticks>=tcp_send_timeout){
943                                 LM_ERR("send timeout (%d)\n", tcp_send_timeout);
944                                 goto error;
945                         }
946                         continue;
947                 }
948                 if (FD_ISSET(fd, &sel_set)){
949                         /* we can write again */
950                         goto again;
951                 }
952         }
953 error:
954                 return -1;
955 end:
956                 return initial_len;
957 }
958 #endif
959
960
961
962 struct tcp_connection* tcpconn_new(int sock, union sockaddr_union* su,
963                                                                         union sockaddr_union* local_addr,
964                                                                         struct socket_info* ba, int type, 
965                                                                         int state)
966 {
967         struct tcp_connection *c;
968         int rd_b_size;
969         
970         rd_b_size=cfg_get(tcp, tcp_cfg, rd_buf_size);
971         c=shm_malloc(sizeof(struct tcp_connection) + rd_b_size);
972         if (c==0){
973                 LM_ERR("mem. allocation failure\n");
974                 goto error;
975         }
976         memset(c, 0, sizeof(struct tcp_connection)); /* zero init (skip rd buf)*/
977         c->s=sock;
978         c->fd=-1; /* not initialized */
979         if (lock_init(&c->write_lock)==0){
980                 LM_ERR("init lock failed\n");
981                 goto error;
982         }
983         
984         c->rcv.src_su=*su;
985         
986         atomic_set(&c->refcnt, 0);
987         local_timer_init(&c->timer, tcpconn_main_timeout, c, 0);
988         su2ip_addr(&c->rcv.src_ip, su);
989         c->rcv.src_port=su_getport(su);
990         c->rcv.bind_address=ba;
991         if (likely(local_addr)){
992                 su2ip_addr(&c->rcv.dst_ip, local_addr);
993                 c->rcv.dst_port=su_getport(local_addr);
994         }else if (ba){
995                 c->rcv.dst_ip=ba->address;
996                 c->rcv.dst_port=ba->port_no;
997         }
998         print_ip("tcpconn_new: new tcp connection: ", &c->rcv.src_ip, "\n");
999         LM_DBG("on port %d, type %d\n", c->rcv.src_port, type);
1000         init_tcp_req(&c->req, (char*)c+sizeof(struct tcp_connection), rd_b_size);
1001         c->id=(*connection_id)++;
1002         c->rcv.proto_reserved1=0; /* this will be filled before receive_message*/
1003         c->rcv.proto_reserved2=0;
1004         c->state=state;
1005         c->extra_data=0;
1006 #ifdef USE_TLS
1007         if (type==PROTO_TLS){
1008                 if (tls_tcpconn_init(c, sock)==-1) goto error;
1009         }else
1010 #endif /* USE_TLS*/
1011         {
1012                 c->type=PROTO_TCP;
1013                 c->rcv.proto=PROTO_TCP;
1014                 c->timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, con_lifetime);
1015                 c->lifetime = cfg_get(tcp, tcp_cfg, con_lifetime);
1016         }
1017         
1018         return c;
1019         
1020 error:
1021         if (c) shm_free(c);
1022         return 0;
1023 }
1024
1025
1026
1027 /* do the actual connect, set sock. options a.s.o
1028  * returns socket on success, -1 on error
1029  * sets also *res_local_addr, res_si and state (S_CONN_CONNECT for an
1030  * unfinished connect and S_CONN_OK for a finished one)*/
1031 inline static int tcp_do_connect(       union sockaddr_union* server,
1032                                                                         union sockaddr_union* from,
1033                                                                         int type,
1034                                                                         snd_flags_t* send_flags,
1035                                                                         union sockaddr_union* res_local_addr,
1036                                                                         struct socket_info** res_si,
1037                                                                         enum tcp_conn_states *state
1038                                                                         )
1039 {
1040         int s;
1041         union sockaddr_union my_name;
1042         socklen_t my_name_len;
1043         struct ip_addr ip;
1044 #ifdef TCP_ASYNC
1045         int n;
1046 #endif /* TCP_ASYNC */
1047
1048         s=socket(AF2PF(server->s.sa_family), SOCK_STREAM, 0);
1049         if (unlikely(s==-1)){
1050                 LM_ERR("%s: socket: (%d) %s\n",
1051                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1052                 goto error;
1053         }
1054         if (init_sock_opt(s, server->s.sa_family)<0){
1055                 LM_ERR("%s: init_sock_opt failed\n",
1056                                         su2a(server, sizeof(*server)));
1057                 goto error;
1058         }
1059         
1060         if (unlikely(from && bind(s, &from->s, sockaddru_len(*from)) != 0)){
1061                 LM_WARN("binding to source address %s failed: %s [%d]\n",
1062                                         su2a(from, sizeof(*from)),
1063                                         strerror(errno), errno);
1064         }
1065         *state=S_CONN_OK;
1066 #ifdef TCP_ASYNC
1067         if (likely(cfg_get(tcp, tcp_cfg, async))){
1068 again:
1069                 n=connect(s, &server->s, sockaddru_len(*server));
1070                 if (likely(n==-1)){ /*non-blocking => most probable EINPROGRESS*/
1071                         if (likely(errno==EINPROGRESS))
1072                                 *state=S_CONN_CONNECT;
1073                         else if (errno==EINTR) goto again;
1074                         else if (errno!=EALREADY){
1075                                 switch(errno){
1076                                         case ENETUNREACH:
1077                                         case EHOSTUNREACH:
1078 #ifdef USE_DST_BLACKLIST
1079                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1080                                                                                         send_flags, 0);
1081 #endif /* USE_DST_BLACKLIST */
1082                                                 TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0, server, type);
1083                                                 break;
1084                                         case ETIMEDOUT:
1085 #ifdef USE_DST_BLACKLIST
1086                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1087                                                                                         send_flags, 0);
1088 #endif /* USE_DST_BLACKLIST */
1089                                                 TCP_EV_CONNECT_TIMEOUT(errno, 0, 0, server, type);
1090                                                 break;
1091                                         case ECONNREFUSED:
1092                                         case ECONNRESET:
1093 #ifdef USE_DST_BLACKLIST
1094                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1095                                                                                         send_flags, 0);
1096 #endif /* USE_DST_BLACKLIST */
1097                                                 TCP_EV_CONNECT_RST(errno, 0, 0, server, type);
1098                                                 break;
1099                                         case EAGAIN:/* not posix, but supported on linux and bsd */
1100                                                 TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0, server,type);
1101                                                 break;
1102                                         default:
1103                                                 TCP_EV_CONNECT_ERR(errno, 0, 0, server, type);
1104                                 }
1105                                 TCP_STATS_CONNECT_FAILED();
1106                                 LM_ERR("connect %s: (%d) %s\n",
1107                                                         su2a(server, sizeof(*server)),
1108                                                         errno, strerror(errno));
1109                                 goto error;
1110                         }
1111                 }
1112         }else{
1113 #endif /* TCP_ASYNC */
1114                 if (tcp_blocking_connect(s, type,  send_flags, &server->s,
1115                                                                         sockaddru_len(*server))<0){
1116                         LM_ERR("tcp_blocking_connect %s failed\n",
1117                                                 su2a(server, sizeof(*server)));
1118                         goto error;
1119                 }
1120 #ifdef TCP_ASYNC
1121         }
1122 #endif /* TCP_ASYNC */
1123         if (from){
1124                 su2ip_addr(&ip, from);
1125                 if (!ip_addr_any(&ip))
1126                         /* we already know the source ip, skip the sys. call */
1127                         goto find_socket;
1128         }
1129         my_name_len=sizeof(my_name);
1130         if (unlikely(getsockname(s, &my_name.s, &my_name_len)!=0)){
1131                 LM_ERR("getsockname failed: %s(%d)\n", strerror(errno), errno);
1132                 *res_si=0;
1133                 goto error;
1134         }
1135         from=&my_name; /* update from with the real "from" address */
1136         su2ip_addr(&ip, &my_name);
1137 find_socket:
1138 #ifdef USE_TLS
1139         if (unlikely(type==PROTO_TLS))
1140                 *res_si=find_si(&ip, 0, PROTO_TLS);
1141         else
1142 #endif
1143                 *res_si=find_si(&ip, 0, PROTO_TCP);
1144         
1145         if (unlikely(*res_si==0)){
1146                 LM_WARN("%s: could not find corresponding"
1147                                 " listening socket for %s, using default...\n",
1148                                         su2a(server, sizeof(*server)), ip_addr2a(&ip));
1149                 if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1150                 else *res_si=sendipv6_tcp;
1151         }
1152         *res_local_addr=*from;
1153         return s;
1154 error:
1155         if (s!=-1) tcp_safe_close(s);
1156         return -1;
1157 }
1158
1159
1160
1161 struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
1162                                                                                 union sockaddr_union* from,
1163                                                                                 int type, snd_flags_t* send_flags)
1164 {
1165         int s;
1166         struct socket_info* si;
1167         union sockaddr_union my_name;
1168         struct tcp_connection* con;
1169         enum tcp_conn_states state;
1170
1171         s=-1;
1172         
1173         if (*tcp_connections_no >= cfg_get(tcp, tcp_cfg, max_connections)){
1174                 LM_ERR("maximum number of connections exceeded (%d/%d)\n",
1175                                         *tcp_connections_no,
1176                                         cfg_get(tcp, tcp_cfg, max_connections));
1177                 goto error;
1178         }
1179         if (unlikely(type==PROTO_TLS)) {
1180                 if (*tls_connections_no >= cfg_get(tcp, tcp_cfg, max_tls_connections)){
1181                         LM_ERR("maximum number of tls connections"
1182                                                 " exceeded (%d/%d)\n",
1183                                                 *tls_connections_no,
1184                                                 cfg_get(tcp, tcp_cfg, max_tls_connections));
1185                         goto error;
1186                 }
1187         }
1188
1189         s=tcp_do_connect(server, from, type,  send_flags, &my_name, &si, &state);
1190         if (s==-1){
1191                 LM_ERR("tcp_do_connect %s: failed (%d) %s\n",
1192                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1193                 goto error;
1194         }
1195         con=tcpconn_new(s, server, &my_name, si, type, state);
1196         if (con==0){
1197                 LM_ERR("%s: tcpconn_new failed, closing the "
1198                                  " socket\n", su2a(server, sizeof(*server)));
1199                 goto error;
1200         }
1201         tcpconn_set_send_flags(con, *send_flags);
1202         return con;
1203 error:
1204         if (s!=-1) tcp_safe_close(s); /* close the opened socket */
1205         return 0;
1206 }
1207
1208
1209
1210 #ifdef TCP_CONNECT_WAIT
1211 int tcpconn_finish_connect( struct tcp_connection* c,
1212                                                                                                 union sockaddr_union* from)
1213 {
1214         int s;
1215         int r;
1216         union sockaddr_union local_addr;
1217         struct socket_info* si;
1218         enum tcp_conn_states state;
1219         struct tcp_conn_alias* a;
1220         int new_conn_alias_flags;
1221         
1222         s=tcp_do_connect(&c->rcv.src_su, from, c->type, &c->send_flags,
1223                                                 &local_addr, &si, &state);
1224         if (unlikely(s==-1)){
1225                 LM_ERR("%s: tcp_do_connect for %p failed\n",
1226                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
1227                 return -1;
1228         }
1229         c->rcv.bind_address=si;
1230         su2ip_addr(&c->rcv.dst_ip, &local_addr);
1231         c->rcv.dst_port=su_getport(&local_addr);
1232         /* update aliases if needed */
1233         if (likely(from==0)){
1234                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1235                 /* add aliases */
1236                 TCPCONN_LOCK;
1237                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1238                                                                                                         new_conn_alias_flags);
1239                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1240                                                                         c->rcv.dst_port, new_conn_alias_flags);
1241                 TCPCONN_UNLOCK;
1242         }else if (su_cmp(from, &local_addr)!=1){
1243                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1244                 TCPCONN_LOCK;
1245                         /* remove all the aliases except the first one and re-add them
1246                          * (there shouldn't be more then the 3 default aliases at this 
1247                          * stage) */
1248                         if (c->aliases > 1) {
1249                                 for (r=1; r<c->aliases; r++){
1250                                         a=&c->con_aliases[r];
1251                                         tcpconn_listrm(tcpconn_aliases_hash[a->hash],
1252                                                                         a, next, prev);
1253                                 }
1254                                 c->aliases=1;
1255                         }
1256                         /* add the local_ip:0 and local_ip:local_port aliases */
1257                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1258                                                                                                 0, new_conn_alias_flags);
1259                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1260                                                                         c->rcv.dst_port, new_conn_alias_flags);
1261                 TCPCONN_UNLOCK;
1262         }
1263         
1264         return s;
1265 }
1266 #endif /* TCP_CONNECT_WAIT */
1267
1268
1269
1270 /* adds a tcp connection to the tcpconn hashes
1271  * Note: it's called _only_ from the tcp_main process */
1272 inline static struct tcp_connection*  tcpconn_add(struct tcp_connection *c)
1273 {
1274         struct ip_addr zero_ip;
1275         int new_conn_alias_flags;
1276
1277         if (likely(c)){
1278                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1279                 c->id_hash=tcp_id_hash(c->id);
1280                 c->aliases=0;
1281                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1282                 TCPCONN_LOCK;
1283                 c->flags|=F_CONN_HASHED;
1284                 /* add it at the begining of the list*/
1285                 tcpconn_listadd(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1286                 /* set the aliases */
1287                 /* first alias is for (peer_ip, peer_port, 0 ,0) -- for finding
1288                  *  any connection to peer_ip, peer_port
1289                  * the second alias is for (peer_ip, peer_port, local_addr, 0) -- for
1290                  *  finding any conenction to peer_ip, peer_port from local_addr 
1291                  * the third alias is for (peer_ip, peer_port, local_addr, local_port) 
1292                  *   -- for finding if a fully specified connection exists */
1293                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &zero_ip, 0,
1294                                                                                                         new_conn_alias_flags);
1295                 if (likely(c->rcv.dst_ip.af && ! ip_addr_any(&c->rcv.dst_ip))){
1296                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1297                                                                                                         new_conn_alias_flags);
1298                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1299                                                                         c->rcv.dst_port, new_conn_alias_flags);
1300                 }
1301                 /* ignore add_alias errors, there are some valid cases when one
1302                  *  of the add_alias would fail (e.g. first add_alias for 2 connections
1303                  *   with the same destination but different src. ip*/
1304                 TCPCONN_UNLOCK;
1305                 LM_DBG("hashes: %d:%d:%d, %d\n",
1306                                                                                                 c->con_aliases[0].hash,
1307                                                                                                 c->con_aliases[1].hash,
1308                                                                                                 c->con_aliases[2].hash,
1309                                                                                                 c->id_hash);
1310                 return c;
1311         }else{
1312                 LM_CRIT("null connection pointer\n");
1313                 return 0;
1314         }
1315 }
1316
1317
1318 static inline void _tcpconn_detach(struct tcp_connection *c)
1319 {
1320         int r;
1321         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1322         /* remove all the aliases */
1323         for (r=0; r<c->aliases; r++)
1324                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1325                                                 &c->con_aliases[r], next, prev);
1326         c->aliases = 0;
1327 }
1328
1329
1330
1331 static inline void _tcpconn_free(struct tcp_connection* c)
1332 {
1333 #ifdef TCP_ASYNC
1334         if (unlikely(_wbufq_non_empty(c)))
1335                 _wbufq_destroy(&c->wbuf_q);
1336 #endif
1337         lock_destroy(&c->write_lock);
1338 #ifdef USE_TLS
1339         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) tls_tcpconn_clean(c);
1340 #endif
1341         shm_free(c);
1342 }
1343
1344
1345
/* Removes the connection from the hashes and frees it.
 * Unsafe version of tcpconn_rm: no lock is taken here. */
void _tcpconn_rm(struct tcp_connection* c)
{
	/* first unlink it from the id and aliases hashes ... */
	_tcpconn_detach(c);
	/* ... then release the structure and what it owns */
	_tcpconn_free(c);
}
1352
1353
1354
1355 void tcpconn_rm(struct tcp_connection* c)
1356 {
1357         int r;
1358         TCPCONN_LOCK;
1359         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1360         /* remove all the aliases */
1361         for (r=0; r<c->aliases; r++)
1362                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1363                                                 &c->con_aliases[r], next, prev);
1364         c->aliases = 0;
1365         TCPCONN_UNLOCK;
1366         lock_destroy(&c->write_lock);
1367 #ifdef USE_TLS
1368         if ((c->type==PROTO_TLS || c->type==PROTO_WSS)&&(c->extra_data)) tls_tcpconn_clean(c);
1369 #endif
1370         shm_free(c);
1371 }
1372
1373
1374 /* finds a connection, if id=0 uses the ip addr, port, local_ip and local port
1375  *  (host byte order) and tries to find the connection that matches all of
1376  *   them. Wild cards can be used for local_ip and local_port (a 0 filled
1377  *   ip address and/or a 0 local port).
1378  * WARNING: unprotected (locks) use tcpconn_get unless you really
1379  * know what you are doing */
1380 struct tcp_connection* _tcpconn_find(int id, struct ip_addr* ip, int port,
1381                                                                                 struct ip_addr* l_ip, int l_port)
1382 {
1383
1384         struct tcp_connection *c;
1385         struct tcp_conn_alias* a;
1386         unsigned hash;
1387         int is_local_ip_any;
1388         
1389 #ifdef EXTRA_DEBUG
1390         LM_DBG("%d  port %d\n",id, port);
1391         if (ip) print_ip("tcpconn_find: ip ", ip, "\n");
1392 #endif
1393         if (likely(id)){
1394                 hash=tcp_id_hash(id);
1395                 for (c=tcpconn_id_hash[hash]; c; c=c->id_next){
1396 #ifdef EXTRA_DEBUG
1397                         LM_DBG("c=%p, c->id=%d, port=%d\n", c, c->id, c->rcv.src_port);
1398                         print_ip("ip=", &c->rcv.src_ip, "\n");
1399 #endif
1400                         if ((id==c->id)&&(c->state!=S_CONN_BAD)) return c;
1401                 }
1402         }else if (likely(ip)){
1403                 hash=tcp_addr_hash(ip, port, l_ip, l_port);
1404                 is_local_ip_any=ip_addr_any(l_ip);
1405                 for (a=tcpconn_aliases_hash[hash]; a; a=a->next){
1406 #ifdef EXTRA_DEBUG
1407                         LM_DBG("a=%p, c=%p, c->id=%d, alias port= %d port=%d\n", a, a->parent,
1408                                         a->parent->id, a->port, a->parent->rcv.src_port);
1409                         print_ip("ip=",&a->parent->rcv.src_ip,"\n");
1410 #endif
1411                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1412                                         ((l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1413                                         (ip_addr_cmp(ip, &a->parent->rcv.src_ip)) &&
1414                                         (is_local_ip_any ||
1415                                                 ip_addr_cmp(l_ip, &a->parent->rcv.dst_ip))
1416                                 )
1417                                 return a->parent;
1418                 }
1419         }
1420         return 0;
1421 }
1422
1423
1424
1425 /* _tcpconn_find with locks and timeout
1426  * local_addr contains the desired local ip:port. If null any local address 
1427  * will be used.  IN*ADDR_ANY or 0 port are wild cards.
1428  * If found, the connection's reference counter will be incremented, you might
1429  * want to decrement it after use.
1430  */
1431 struct tcp_connection* tcpconn_get(int id, struct ip_addr* ip, int port,
1432                                                                         union sockaddr_union* local_addr,
1433                                                                         ticks_t timeout)
1434 {
1435         struct tcp_connection* c;
1436         struct ip_addr local_ip;
1437         int local_port;
1438         
1439         local_port=0;
1440         if (likely(ip)){
1441                 if (unlikely(local_addr)){
1442                         su2ip_addr(&local_ip, local_addr);
1443                         local_port=su_getport(local_addr);
1444                 }else{
1445                         ip_addr_mk_any(ip->af, &local_ip);
1446                         local_port=0;
1447                 }
1448         }
1449         TCPCONN_LOCK;
1450         c=_tcpconn_find(id, ip, port, &local_ip, local_port);
1451         if (likely(c)){ 
1452                         atomic_inc(&c->refcnt);
1453                         /* update the timeout only if the connection is not handled
1454                          * by a tcp reader _and_the timeout is non-zero  (the tcp
1455                          * reader process uses c->timeout for its own internal
1456                          * timeout and c->timeout will be overwritten * anyway on
1457                          * return to tcp_main) */
1458                         if (likely(c->reader_pid==0 && timeout != 0))
1459                                 c->timeout=get_ticks_raw()+timeout;
1460         }
1461         TCPCONN_UNLOCK;
1462         return c;
1463 }
1464
1465
1466
1467 /* add c->dst:port, local_addr as an alias for the "id" connection, 
1468  * flags: TCP_ALIAS_FORCE_ADD  - add an alias even if a previous one exists
1469  *        TCP_ALIAS_REPLACE    - if a prev. alias exists, replace it with the
1470  *                                new one
1471  * returns 0 on success, <0 on failure ( -1  - null c, -2 too many aliases,
1472  *  -3 alias already present and pointing to another connection)
1473  * WARNING: must be called with TCPCONN_LOCK held */
1474 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
1475                                                                                 struct ip_addr* l_ip, int l_port,
1476                                                                                 int flags)
1477 {
1478         unsigned hash;
1479         struct tcp_conn_alias* a;
1480         struct tcp_conn_alias* nxt;
1481         struct tcp_connection* p;
1482         int is_local_ip_any;
1483         int i;
1484         int r;
1485         
1486         a=0;
1487         is_local_ip_any=ip_addr_any(l_ip);
1488         if (likely(c)){
1489                 hash=tcp_addr_hash(&c->rcv.src_ip, port, l_ip, l_port);
1490                 /* search the aliases for an already existing one */
1491                 for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
1492                         nxt=a->next;
1493                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1494                                         ( (l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1495                                         (ip_addr_cmp(&c->rcv.src_ip, &a->parent->rcv.src_ip)) &&
1496                                         ( is_local_ip_any || 
1497                                           ip_addr_cmp(&a->parent->rcv.dst_ip, l_ip))
1498                                         ){
1499                                 /* found */
1500                                 if (unlikely(a->parent!=c)){
1501                                         if (flags & TCP_ALIAS_FORCE_ADD)
1502                                                 /* still have to walk the whole list to check if
1503                                                  * the alias was not already added */
1504                                                 continue;
1505                                         else if (flags & TCP_ALIAS_REPLACE){
1506                                                 /* remove the alias =>
1507                                                  * remove the current alias and all the following
1508                                                  *  ones from the corresponding connection, shift the 
1509                                                  *  connection aliases array and re-add the other 
1510                                                  *  aliases (!= current one) */
1511                                                 p=a->parent;
1512                                                 for (i=0; (i<p->aliases) && (&(p->con_aliases[i])!=a);
1513                                                                 i++);
1514                                                 if (unlikely(i==p->aliases)){
1515                                                         LM_CRIT("alias %p not found in con %p (id %d)\n",
1516                                                                         a, p, p->id);
1517                                                         goto error_not_found;
1518                                                 }
1519                                                 for (r=i; r<p->aliases; r++){
1520                                                         tcpconn_listrm(
1521                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash],
1522                                                                 &p->con_aliases[r], next, prev);
1523                                                 }
1524                                                 if (likely((i+1)<p->aliases)){
1525                                                         memmove(&p->con_aliases[i], &p->con_aliases[i+1],
1526                                                                                         (p->aliases-i-1)*
1527                                                                                                 sizeof(p->con_aliases[0]));
1528                                                 }
1529                                                 p->aliases--;
1530                                                 /* re-add the remaining aliases */
1531                                                 for (r=i; r<p->aliases; r++){
1532                                                         tcpconn_listadd(
1533                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash], 
1534                                                                 &p->con_aliases[r], next, prev);
1535                                                 }
1536                                         }else
1537                                                 goto error_sec;
1538                                 }else goto ok;
1539                         }
1540                 }
1541                 if (unlikely(c->aliases>=TCP_CON_MAX_ALIASES)) goto error_aliases;
1542                 c->con_aliases[c->aliases].parent=c;
1543                 c->con_aliases[c->aliases].port=port;
1544                 c->con_aliases[c->aliases].hash=hash;
1545                 tcpconn_listadd(tcpconn_aliases_hash[hash], 
1546                                                                 &c->con_aliases[c->aliases], next, prev);
1547                 c->aliases++;
1548         }else goto error_not_found;
1549 ok:
1550 #ifdef EXTRA_DEBUG
1551         if (a) LM_DBG("alias already present\n");
1552         else   LM_DBG("alias port %d for hash %d, id %d\n",
1553                         port, hash, c->id);
1554 #endif
1555         return 0;
1556 error_aliases:
1557         /* too many aliases */
1558         return -2;
1559 error_not_found:
1560         /* null connection */
1561         return -1;
1562 error_sec:
1563         /* alias already present and pointing to a different connection
1564          * (hijack attempt?) */
1565         return -3;
1566 }
1567
1568
1569
1570 /* add port as an alias for the "id" connection, 
1571  * returns 0 on success,-1 on failure */
1572 int tcpconn_add_alias(int id, int port, int proto)
1573 {
1574         struct tcp_connection* c;
1575         int ret;
1576         struct ip_addr zero_ip;
1577         int r;
1578         int alias_flags;
1579         
1580         /* fix the port */
1581         port=port?port:((proto==PROTO_TLS)?SIPS_PORT:SIP_PORT);
1582         TCPCONN_LOCK;
1583         /* check if alias already exists */
1584         c=_tcpconn_find(id, 0, 0, 0, 0);
1585         if (likely(c)){
1586                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1587                 alias_flags=cfg_get(tcp, tcp_cfg, alias_flags);
1588                 /* alias src_ip:port, 0, 0 */
1589                 ret=_tcpconn_add_alias_unsafe(c, port,  &zero_ip, 0, 
1590                                                                                 alias_flags);
1591                 if (ret<0 && ret!=-3) goto error;
1592                 /* alias src_ip:port, local_ip, 0 */
1593                 ret=_tcpconn_add_alias_unsafe(c, port,  &c->rcv.dst_ip, 0, 
1594                                                                                 alias_flags);
1595                 if (ret<0 && ret!=-3) goto error;
1596                 /* alias src_ip:port, local_ip, local_port */
1597                 ret=_tcpconn_add_alias_unsafe(c, port, &c->rcv.dst_ip, c->rcv.dst_port,
1598                                                                                 alias_flags);
1599                 if (unlikely(ret<0)) goto error;
1600         }else goto error_not_found;
1601         TCPCONN_UNLOCK;
1602         return 0;
1603 error_not_found:
1604         TCPCONN_UNLOCK;
1605         LM_ERR("no connection found for id %d\n",id);
1606         return -1;
1607 error:
1608         TCPCONN_UNLOCK;
1609         switch(ret){
1610                 case -2:
1611                         LM_ERR("too many aliases (%d) for connection %p (id %d) %s:%d <- %d\n",
1612                                         c->aliases, c, c->id, ip_addr2a(&c->rcv.src_ip),
1613                                         c->rcv.src_port, port);
1614                         for (r=0; r<c->aliases; r++){
1615                                 LM_ERR("alias %d: for %p (%d) %s:%d <-%d hash %x\n",  r, c, c->id, 
1616                                                 ip_addr2a(&c->rcv.src_ip), c->rcv.src_port, 
1617                                                 c->con_aliases[r].port, c->con_aliases[r].hash);
1618                         }
1619                         break;
1620                 case -3:
1621                         LM_ERR("possible port hijack attempt\n");
1622                         LM_ERR("alias for %d port %d already"
1623                                                 " present and points to another connection \n",
1624                                                 c->id, port);
1625                         break;
1626                 default:
1627                         LM_ERR("unknown error %d\n", ret);
1628         }
1629         return -1;
1630 }
1631
1632
1633
1634 #ifdef TCP_FD_CACHE
1635
1636 static void tcp_fd_cache_init(void)
1637 {
1638         int r;
1639         for (r=0; r<TCP_FD_CACHE_SIZE; r++)
1640                 fd_cache[r].fd=-1;
1641 }
1642
1643
1644 inline static struct fd_cache_entry* tcp_fd_cache_get(struct tcp_connection *c)
1645 {
1646         int h;
1647         
1648         h=c->id%TCP_FD_CACHE_SIZE;
1649         if ((fd_cache[h].fd>0) && (fd_cache[h].id==c->id) && (fd_cache[h].con==c))
1650                 return &fd_cache[h];
1651         return 0;
1652 }
1653
1654
1655 inline static void tcp_fd_cache_rm(struct fd_cache_entry* e)
1656 {
1657         e->fd=-1;
1658 }
1659
1660
1661 inline static void tcp_fd_cache_add(struct tcp_connection *c, int fd)
1662 {
1663         int h;
1664         
1665         h=c->id%TCP_FD_CACHE_SIZE;
1666         if (likely(fd_cache[h].fd>0))
1667                 tcp_safe_close(fd_cache[h].fd);
1668         fd_cache[h].fd=fd;
1669         fd_cache[h].id=c->id;
1670         fd_cache[h].con=c;
1671 }
1672
1673 #endif /* TCP_FD_CACHE */
1674
1675
1676
/* forward declarations of static helpers, placed ahead of tcp_send();
 * tcp_send() below calls tcpconn_chld_put() and tcpconn_1st_send() */

/* used by tcp_send() to release its reference to a connection */
inline static int tcpconn_chld_put(struct tcp_connection* tcpconn);

static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
							unsigned len, snd_flags_t send_flags);
static int tcpconn_do_send(int fd, struct tcp_connection* c,
							const char* buf, unsigned len,
							snd_flags_t send_flags, long* resp, int locked);

/* tcp_send() passes locked=1 when it calls this while holding
 * c->write_lock and 0 otherwise; the command for tcp_main (CONN_*) is
 * returned through resp */
static int tcpconn_1st_send(int fd, struct tcp_connection* c,
							const char* buf, unsigned len,
							snd_flags_t send_flags, long* resp, int locked);
1688
1689 /* finds a tcpconn & sends on it
1690  * uses the dst members to, proto (TCP|TLS) and id and tries to send
1691  *  from the "from" address (if non null and id==0)
1692  * returns: number of bytes written (>=0) on success
1693  *          <0 on error */
1694 int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1695                                         const char* buf, unsigned len)
1696 {
1697         struct tcp_connection *c;
1698         struct ip_addr ip;
1699         int port;
1700         int fd;
1701         long response[2];
1702         int n;
1703         ticks_t con_lifetime;
1704 #ifdef USE_TLS
1705         const char* rest_buf;
1706         const char* t_buf;
1707         unsigned rest_len, t_len;
1708         long resp;
1709         snd_flags_t t_send_flags;
1710 #endif /* USE_TLS */
1711         
1712         port=su_getport(&dst->to);
1713         con_lifetime=cfg_get(tcp, tcp_cfg, con_lifetime);
1714         if (likely(port)){
1715                 su2ip_addr(&ip, &dst->to);
1716                 c=tcpconn_get(dst->id, &ip, port, from, con_lifetime); 
1717         }else if (likely(dst->id)){
1718                 c=tcpconn_get(dst->id, 0, 0, 0, con_lifetime);
1719         }else{
1720                 LM_CRIT("null id & to\n");
1721                 return -1;
1722         }
1723         
1724         if (likely(dst->id)){
1725                 if (unlikely(c==0)) {
1726                         if (likely(port)){
1727                                 /* try again w/o id */
1728                                 c=tcpconn_get(0, &ip, port, from, con_lifetime);
1729                         }else{
1730                                 LM_ERR("id %d not found, dropping\n", dst->id);
1731                                 return -1;
1732                         }
1733                 }
1734         }
1735         /* connection not found or unusable => open a new one and send on it */
1736         if (unlikely((c==0) || tcpconn_close_after_send(c))){
1737                 if (unlikely(c)){
1738                         /* can't use c if it's marked as close-after-send  =>
1739                            release it and try opening new one */
1740                         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
1741                         c=0;
1742                 }
1743                 /* check if connect() is disabled */
1744                 if (unlikely((dst->send_flags.f & SND_F_FORCE_CON_REUSE) ||
1745                                                 cfg_get(tcp, tcp_cfg, no_connect)))
1746                         return -1;
1747                 LM_DBG("no open tcp connection found, opening new one\n");
1748                 /* create tcp connection */
1749                 if (likely(from==0)){
1750                         /* check to see if we have to use a specific source addr. */
1751                         switch (dst->to.s.sa_family) {
1752                                 case AF_INET:
1753                                                 from = tcp_source_ipv4;
1754                                         break;
1755                                 case AF_INET6:
1756                                                 from = tcp_source_ipv6;
1757                                         break;
1758                                 default:
1759                                         /* error, bad af, ignore ... */
1760                                         break;
1761                         }
1762                 }
1763 #if defined(TCP_CONNECT_WAIT) && defined(TCP_ASYNC)
1764                 if (likely(cfg_get(tcp, tcp_cfg, tcp_connect_wait) && 
1765                                         cfg_get(tcp, tcp_cfg, async) )){
1766                         if (unlikely(*tcp_connections_no >=
1767                                                         cfg_get(tcp, tcp_cfg, max_connections))){
1768                                 LM_ERR("%s: maximum number of connections exceeded (%d/%d)\n",
1769                                                         su2a(&dst->to, sizeof(dst->to)),
1770                                                         *tcp_connections_no,
1771                                                         cfg_get(tcp, tcp_cfg, max_connections));
1772                                 return -1;
1773                         }
1774                         if (unlikely(dst->proto==PROTO_TLS)) {
1775                                 if (unlikely(*tls_connections_no >=
1776                                                         cfg_get(tcp, tcp_cfg, max_tls_connections))){
1777                                         LM_ERR("%s: maximum number of tls connections exceeded (%d/%d)\n",
1778                                                         su2a(&dst->to, sizeof(dst->to)),
1779                                                         *tls_connections_no,
1780                                                         cfg_get(tcp, tcp_cfg, max_tls_connections));
1781                                         return -1;
1782                                 }
1783                         }
1784                         c=tcpconn_new(-1, &dst->to, from, 0, dst->proto,
1785                                                         S_CONN_CONNECT);
1786                         if (unlikely(c==0)){
1787                                 LM_ERR("%s: could not create new connection\n",
1788                                                 su2a(&dst->to, sizeof(dst->to)));
1789                                 return -1;
1790                         }
1791                         c->flags|=F_CONN_PENDING|F_CONN_FD_CLOSED;
1792                         tcpconn_set_send_flags(c, dst->send_flags);
1793                         atomic_set(&c->refcnt, 2); /* ref from here and from main hash
1794                                                                                  table */
1795                         /* add it to id hash and aliases */
1796                         if (unlikely(tcpconn_add(c)==0)){
1797                                 LM_ERR("%s: could not add connection %p\n",
1798                                                 su2a(&dst->to, sizeof(dst->to)), c);
1799                                 _tcpconn_free(c);
1800                                 n=-1;
1801                                 goto end_no_conn;
1802                         }
1803                         /* do connect and if src ip or port changed, update the 
1804                          * aliases */
1805                         if (unlikely((fd=tcpconn_finish_connect(c, from))<0)){
1806                                 /* tcpconn_finish_connect will automatically blacklist
1807                                    on error => no need to do it here */
1808                                 LM_ERR("%s: tcpconn_finish_connect(%p) failed\n",
1809                                                 su2a(&dst->to, sizeof(dst->to)), c);
1810                                 goto conn_wait_error;
1811                         }
1812                         /* ? TODO: it might be faster just to queue the write directly
1813                          *  and send to main CONN_NEW_PENDING_WRITE */
1814                         /* delay sending the fd to main after the send */
1815                         
1816                         /* NOTE: no lock here, because the connection is marked as
1817                          * pending and nobody else will try to write on it. However
1818                          * this might produce out-of-order writes. If this is not
1819                          * desired either lock before the write or use 
1820                          * _wbufq_insert(...)
1821                          * NOTE2: _wbufq_insert() is used now (no out-of-order).
1822                          */
1823 #ifdef USE_TLS
1824                         if (unlikely(c->type==PROTO_TLS)) {
1825                         /* for TLS the TLS processing and the send must happen
1826                            atomically w/ respect to other sends on the same connection
1827                            (otherwise reordering might occur which would break TLS) =>
1828                            lock. However in this case this send will always be the first.
1829                            We can have the send() outside the lock only if this is the
1830                            first and only send (tls_encode is not called again), or
1831                            this is the last send for a tls_encode() loop and all the
1832                            previous ones did return CONN_NEW_COMPLETE or CONN_EOF.
1833                         */
1834                                 response[1] = CONN_NOP;
1835                                 t_buf = buf;
1836                                 t_len = len;
1837                                 lock_get(&c->write_lock);
1838 redo_tls_encode:
1839                                         t_send_flags = dst->send_flags;
1840                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
1841                                                                         &t_send_flags);
1842                                         /* There are 4 cases:
1843                                            1. entire buffer consumed from the first try
1844                                              (rest_len == rest_buf == 0)
1845                                            2. rest_buf & first call
1846                                            3. rest_buf & not first call
1847                                                   3a. CONN_NEW_COMPLETE or CONN_EOF
1848                                                   3b. CONN_NEW_PENDING_WRITE
1849                                            4. entire buffer consumed, but not first call
1850                                                4a. CONN_NEW_COMPLETE or CONN_EOF
1851                                                    4b. CONN_NEW_PENDING_WRITE
1852                                                 We misuse response[1] == CONN_NOP to test for the
1853                                                 first call.
1854                                         */
1855                                         if (unlikely(n < 0)) {
1856                                                 lock_release(&c->write_lock);
1857                                                 goto conn_wait_error;
1858                                         }
1859                                         if (likely(rest_len == 0)) {
1860                                                 /* 1 or 4*: CONN_NEW_COMPLETE, CONN_EOF,  CONN_NOP
1861                                                     or CONN_NEW_PENDING_WRITE (*rest_len == 0) */
1862                                                 if (likely(response[1] != CONN_NEW_PENDING_WRITE)) {
1863                                                         /* 1 or 4a => it's safe to do the send outside the
1864                                                            lock (it will either send directly or
1865                                                            wbufq_insert())
1866                                                         */
1867                                                         lock_release(&c->write_lock);
1868                                                         if (likely(t_len != 0)) {
1869                                                                 n=tcpconn_1st_send(fd, c, t_buf, t_len,
1870                                                                                                         t_send_flags,
1871                                                                                                         &response[1], 0);
1872                                                         } else { /* t_len == 0 */
1873                                                                 if (response[1] == CONN_NOP) {
1874                                                                         /* nothing to send (e.g  parallel send
1875                                                                            tls_encode queues some data and then
1876                                                                            WANT_READ => this tls_encode will queue
1877                                                                            the cleartext too and will have nothing
1878                                                                            to send right now) and initial send =>
1879                                                                            behave as if the send was successful
1880                                                                            (but never return EOF here) */
1881                                                                         response[1] = CONN_NEW_COMPLETE;
1882                                                                 }
1883                                                         }
1884                                                         /* exit */
1885                                                 } else {
1886                                                         /* CONN_NEW_PENDING_WRITE:  4b: it was a
1887                                                            repeated tls_encode() (or otherwise we would
1888                                                            have here CONN_NOP) => add to the queue */
1889                                                         if (unlikely(t_len &&
1890                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
1891                                                                 response[1] = CONN_ERROR;
1892                                                                 n = -1;
1893                                                         }
1894                                                         lock_release(&c->write_lock);
1895                                                         /* exit (no send) */
1896                                                 }
1897                                         } else {  /* rest_len != 0 */
1898                                                 /* 2 or 3*: if tls_encode hasn't finished, we have to
1899                                                    call tcpconn_1st_send() under lock (otherwise if it
1900                                                    returns CONN_NEW_PENDING_WRITE, there is no way
1901                                                    to find the right place to add the new queued
1902                                                    data from the 2nd tls_encode()) */
1903                                                 if (likely((response[1] == CONN_NOP /*2*/ ||
1904                                                                         response[1] == CONN_NEW_COMPLETE /*3a*/ ||
1905                                                                         response[1] == CONN_EOF /*3a*/) && t_len))
1906                                                         n = tcpconn_1st_send(fd, c, t_buf, t_len,
1907                                                                                                         t_send_flags,
1908                                                                                                         &response[1], 1);
1909                                                 else if (unlikely(t_len &&
1910                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
1911                                                         /*3b: CONN_NEW_PENDING_WRITE*/
1912                                                         response[1] = CONN_ERROR;
1913                                                         n = -1;
1914                                                 }
1915                                                 if (likely(n >= 0)) {
1916                                                         /* if t_len == 0 => nothing was sent => previous
1917                                                            response will be kept */
1918                                                         t_buf = rest_buf;
1919                                                         t_len = rest_len;
1920                                                         goto redo_tls_encode;
1921                                                 } else {
1922                                                         lock_release(&c->write_lock);
1923                                                         /* error exit */
1924                                                 }
1925                                         }
1926                         } else
1927 #endif /* USE_TLS */
1928                                 n=tcpconn_1st_send(fd, c, buf, len, dst->send_flags,
1929                                                                         &response[1], 0);
1930                         if (unlikely(n<0)) /* this will catch CONN_ERROR too */
1931                                 goto conn_wait_error;
1932                         if (unlikely(response[1]==CONN_EOF)){
1933                                 /* if close-after-send requested, don't bother
1934                                    sending the fd back to tcp_main, try closing it
1935                                    immediately (no other tcp_send should use it,
1936                                    because it is marked as close-after-send before
1937                                    being added to the hash) */
1938                                 goto conn_wait_close;
1939                         }
1940                         /* send to tcp_main */
1941                         response[0]=(long)c;
1942                         if (unlikely(send_fd(unix_tcp_sock, response,
1943                                                                         sizeof(response), fd) <= 0)){
1944                                 LM_ERR("%s: %ld for %p failed:" " %s (%d)\n",
1945                                                         su2a(&dst->to, sizeof(dst->to)),
1946                                                         response[1], c, strerror(errno), errno);
1947                                 goto conn_wait_error;
1948                         }
1949                         goto conn_wait_success;
1950                 }
1951 #endif /* TCP_CONNECT_WAIT  && TCP_ASYNC */
1952                 if (unlikely((c=tcpconn_connect(&dst->to, from, dst->proto,
1953                                                                                 &dst->send_flags))==0)){
1954                         LM_ERR("%s: connect failed\n", su2a(&dst->to, sizeof(dst->to)));
1955                         return -1;
1956                 }
1957                 tcpconn_set_send_flags(c, dst->send_flags);
1958                 if (likely(c->state==S_CONN_OK))
1959                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
1960                 atomic_set(&c->refcnt, 2); /* ref. from here and it will also
1961                                                                           be added in the tcp_main hash */
1962                 fd=c->s;
1963                 c->flags|=F_CONN_FD_CLOSED; /* not yet opened in main */
1964                 /* ? TODO: it might be faster just to queue the write and
1965                  * send to main a CONN_NEW_PENDING_WRITE */
1966                 
1967                 /* send the new tcpconn to "tcp main" */
1968                 response[0]=(long)c;
1969                 response[1]=CONN_NEW;
1970                 n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
1971                 if (unlikely(n<=0)){
1972                         LM_ERR("%s: failed send_fd: %s (%d)\n",
1973                                         su2a(&dst->to, sizeof(dst->to)),
1974                                         strerror(errno), errno);
1975                         /* we can safely delete it, it's not referenced by anybody */
1976                         _tcpconn_free(c);
1977                         n=-1;
1978                         goto end_no_conn;
1979                 }
1980                 /* new connection => send on it directly */
1981 #ifdef USE_TLS
1982                 if (unlikely(c->type==PROTO_TLS)) {
1983                         /* for TLS the TLS processing and the send must happen
1984                            atomically w/ respect to other sends on the same connection
1985                            (otherwise reordering might occur which would break TLS) =>
1986                            lock.
1987                         */
1988                         response[1] = CONN_NOP;
1989                         t_buf = buf;
1990                         t_len = len;
1991                         lock_get(&c->write_lock);
1992                                 do {
1993                                         t_send_flags = dst->send_flags;
1994                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
1995                                                                         &t_send_flags);
1996                                         if (likely(n > 0)) {
1997                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
1998                                                                                                 &resp, 1);
1999                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2000                                                                         resp == CONN_ERROR))
2001                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2002                                                            unless error */
2003                                                         response[1] = resp;
2004                                         } else  if (unlikely(n < 0)) {
2005                                                 response[1] = CONN_ERROR;
2006                                                 break;
2007                                         }
2008                                         /* else do nothing for n (t_len) == 0, keep
2009                                            the last reponse */
2010                                         t_buf = rest_buf;
2011                                         t_len = rest_len;
2012                                 } while(unlikely(rest_len && n > 0));
2013                         lock_release(&c->write_lock);
2014                 } else
2015 #endif /* USE_TLS */
2016                         n = tcpconn_do_send(fd, c, buf, len, dst->send_flags,
2017                                                                         &response[1], 0);
2018                 if (unlikely(response[1] != CONN_NOP)) {
2019                         response[0]=(long)c;
2020                         if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2021                                 BUG("tcp_main command %ld sending failed (write):"
2022                                                 "%s (%d)\n", response[1], strerror(errno), errno);
2023                                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2024                                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec
2025                                    refcnt => if sending the command fails we have to
2026                                    dec. refcnt by hand */
2027                                 tcpconn_chld_put(c); /* deref. it manually */
2028                                 n=-1;
2029                         }
2030                         /* here refcnt for c is already decremented => c contents can
2031                            no longer be used and refcnt _must_ _not_ be decremented
2032                            again on exit */
2033                         if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2034                                 /* on error or eof, close fd */
2035                                 tcp_safe_close(fd);
2036                         } else if (response[1] == CONN_QUEUED_WRITE) {
2037 #ifdef TCP_FD_CACHE
2038                                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2039                                         tcp_fd_cache_add(c, fd);
2040                                 } else
2041 #endif /* TCP_FD_CACHE */
2042                                         tcp_safe_close(fd);
2043                         } else {
2044                                 BUG("unexpected tcpconn_do_send() return & response:"
2045                                                 " %d, %ld\n", n, response[1]);
2046                         }
2047                         goto end_no_deref;
2048                 }
2049 #ifdef TCP_FD_CACHE
2050                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2051                         tcp_fd_cache_add(c, fd);
2052                 }else
2053 #endif /* TCP_FD_CACHE */
2054                         tcp_safe_close(fd);
2055         /* here we can have only commands that _do_ _not_ dec refcnt.
2056            (CONN_EOF, CON_ERROR, CON_QUEUED_WRITE are all treated above) */
2057                 goto release_c;
2058         } /* if (c==0 or unusable) new connection */
2059         /* existing connection, send on it */
2060         n = tcpconn_send_put(c, buf, len, dst->send_flags);
2061         /* no deref needed (automatically done inside tcpconn_send_put() */
2062         return n;
2063 #ifdef TCP_CONNECT_WAIT
2064 conn_wait_success:
2065 #ifdef TCP_FD_CACHE
2066         if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2067                 tcp_fd_cache_add(c, fd);
2068         } else
2069 #endif /* TCP_FD_CACHE */
2070                 if (unlikely (tcp_safe_close(fd) < 0))
2071                         LM_ERR("closing temporary send fd for %p: %s: "
2072                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2073                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2074                                         fd, c->flags, strerror(errno), errno);
2075         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2076         return n;
2077 conn_wait_error:
2078         n=-1;
2079 conn_wait_close:
2080         /* connect or send failed or immediate close-after-send was requested on
2081          * newly created connection which was not yet sent to tcp_main (but was
2082          * already hashed) => don't send to main, unhash and destroy directly
2083          * (if refcnt>2 it will be destroyed when the last sender releases the
2084          * connection (tcpconn_chld_put(c))) or when tcp_main receives a
2085          * CONN_ERROR it*/
2086         c->state=S_CONN_BAD;
2087         /* we are here only if we opened a new fd (and not reused a cached or
2088            a reader one) => if the connect was successful close the fd */
2089         if (fd>=0) {
2090                 if (unlikely(tcp_safe_close(fd) < 0 ))
2091                         LM_ERR("closing temporary send fd for %p: %s: "
2092                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2093                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2094                                         fd, c->flags, strerror(errno), errno);
2095         }
2096         /* here the connection is for sure in the hash (tcp_main will not
2097            remove it because it's marked as PENDing) and the refcnt is at least
2098            2
2099          */
2100         TCPCONN_LOCK;
2101                 _tcpconn_detach(c);
2102                 c->flags&=~F_CONN_HASHED;
2103                 tcpconn_put(c);
2104         TCPCONN_UNLOCK;
2105         /* dec refcnt -> mark it for destruction */
2106         tcpconn_chld_put(c);
2107         return n;
2108 #endif /* TCP_CONNECT_WAIT */
2109 release_c:
2110         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2111 end_no_deref:
2112 end_no_conn:
2113         return n;
2114 }
2115
2116
2117
2118 /** sends on an existing tcpconn and auto-dec. con. ref counter.
2119  * As opposed to tcp_send(), this function requires an existing
2120  * tcp connection.
2121  * WARNING: the tcp_connection will be de-referenced.
      *
      * How the data gets out (in this order):
      *  - TCP_ASYNC, async mode on and the connection already has queued
      *    data (or, with TCP_CONNECT_WAIT, is still F_CONN_PENDING): the
      *    data is only appended to the write queue, no fd is needed;
      *  - the caller is the reader process of c: c->fd is used directly;
      *  - TCP_FD_CACHE and fd_cache enabled: a cached fd is used if found;
      *  - otherwise a CONN_GET_FD command is written on unix_tcp_sock and
      *    the fd is read back with receive_fd().
      * The actual write is done by tcpconn_do_send() (for PROTO_TLS and
      * PROTO_WSS after tls_encode(), under c->write_lock). If it returns a
      * command different from CONN_NOP, the command is written on
      * unix_tcp_sock and c is not released here (see the comments in the
      * body).
2122  * @param c - existing tcp connection pointer.
2123  * @param buf - data to be sent.
2124  * @param len - data length,
      * @param send_flags - send flags, handed over to tls_encode() and
      *                     tcpconn_do_send().
2125  * @return >=0 on success, -1 on error.
2126  */
2127 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
2128                                                                 unsigned len, snd_flags_t send_flags)
2129 {
2130         struct tcp_connection *tmp; /* connection pointer read back together
                                             with the fd (must be equal to c) */
2131         int fd; /* not set on the queue-only path and when getting the fd
                         fails; it is not used on those paths */
2132         long response[2]; /* command: [0]=connection, [1]=command code */
2133         int n;
2134         int do_close_fd;
2135 #ifdef USE_TLS
2136         const char* rest_buf;
2137         const char* t_buf;
2138         unsigned rest_len, t_len;
2139         long resp;
2140         snd_flags_t t_send_flags;
2141 #endif /* USE_TLS */
2142 #ifdef TCP_FD_CACHE
2143         struct fd_cache_entry* fd_cache_e;
2144         int use_fd_cache;
2145         
2146         use_fd_cache=cfg_get(tcp, tcp_cfg, fd_cache);
2147         fd_cache_e=0;
2148 #endif /* TCP_FD_CACHE */
2149         do_close_fd=1; /* close the fd on exit */
2150         response[1] = CONN_NOP;
2151 #ifdef TCP_ASYNC
2152         /* if data is already queued, we don't need the fd */
             /* the first test below is made without holding c->write_lock; it
                is repeated once the lock is taken */
2153 #ifdef TCP_CONNECT_WAIT
2154                 if (unlikely(cfg_get(tcp, tcp_cfg, async) &&
2155                                                 (_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)) ))
2156 #else /* ! TCP_CONNECT_WAIT */
2157                 if (unlikely(cfg_get(tcp, tcp_cfg, async) && (_wbufq_non_empty(c)) ))
2158 #endif /* TCP_CONNECT_WAIT */
2159                 {
2160                         lock_get(&c->write_lock);
2161 #ifdef TCP_CONNECT_WAIT
2162                                 if (likely(_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)))
2163 #else /* ! TCP_CONNECT_WAIT */
2164                                 if (likely(_wbufq_non_empty(c)))
2165 #endif /* TCP_CONNECT_WAIT */
2166                                 {
2167                                         do_close_fd=0; /* no fd was acquired on this path */
2168 #ifdef USE_TLS
2169                                         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2170                                                 t_buf = buf;
2171                                                 t_len = len;
                                                     /* each iteration queues the chunk returned
                                                        by tls_encode() in t_buf/t_len and goes on
                                                        with what it left in rest_buf/rest_len */
2172                                                 do {
2173                                                         t_send_flags = send_flags;
2174                                                         n = tls_encode(c, &t_buf, &t_len,
2175                                                                                         &rest_buf, &rest_len,
2176                                                                                         &t_send_flags);
2177                                                         if (unlikely((n < 0) || (t_len &&
2178                                                                          (_wbufq_add(c, t_buf, t_len) < 0)))) {
2179                                                                 lock_release(&c->write_lock);
2180                                                                 n=-1;
2181                                                                 response[1] = CONN_ERROR;
2182                                                                 c->state=S_CONN_BAD;
2183                                                                 c->timeout=get_ticks_raw(); /* force timeout */
2184                                                                 goto error; /* send the CONN_ERROR command */
2185                                                         }
2186                                                         t_buf = rest_buf;
2187                                                         t_len = rest_len;
2188                                                 } while(unlikely(rest_len && n > 0));
2189                                         } else
2190 #endif /* USE_TLS */
2191                                                 if (unlikely(len && (_wbufq_add(c, buf, len)<0))){
2192                                                         lock_release(&c->write_lock);
2193                                                         n=-1;
2194                                                         response[1] = CONN_ERROR;
2195                                                         c->state=S_CONN_BAD;
2196                                                         c->timeout=get_ticks_raw(); /* force timeout */
2197                                                         goto error; /* send the CONN_ERROR command */
2198                                                 }
2199                                         n=len; /* everything was queued */
2200                                         lock_release(&c->write_lock);
2201                                         goto release_c;
2202                                 }
2203                         lock_release(&c->write_lock); /* the locked re-check failed
                                                               => continue with a normal send */
2204                 }
2205 #endif /* TCP_ASYNC */
2206                 /* check if this is not the same reader process holding
2207                  *  c  and if so send directly on c->fd */
2208                 if (c->reader_pid==my_pid()){
2209                         LM_DBG("send from reader (%d (%d)), reusing fd\n",
2210                                         my_pid(), process_no);
2211                         fd=c->fd;
2212                         do_close_fd=0; /* don't close the fd on exit, it's in use */
2213 #ifdef TCP_FD_CACHE
2214                         use_fd_cache=0; /* don't cache: problems would arise due to the
2215                                                            close() on cache eviction (if the fd is still 
2216                                                            used). If it has to be cached then dup() _must_ 
2217                                                            be used */
2218                 }else if (likely(use_fd_cache && 
2219                                                         ((fd_cache_e=tcp_fd_cache_get(c))!=0))){
2220                         fd=fd_cache_e->fd;
2221                         do_close_fd=0;
2222                         LM_DBG("found fd in cache (%d, %p, %d)\n", fd, c, fd_cache_e->id);
2223 #endif /* TCP_FD_CACHE */
2224                 }else{
2225                         LM_DBG("tcp connection found (%p), acquiring fd\n", c);
2226                         /* get the fd */
2227                         response[0]=(long)c;
2228                         response[1]=CONN_GET_FD;
2229                         n=send_all(unix_tcp_sock, response, sizeof(response));
2230                         if (unlikely(n<=0)){
2231                                 LM_ERR("failed to get fd(write):%s (%d)\n", strerror(errno), errno);
2232                                 n=-1;
                                     /* no fd was obtained; release_c only drops the
                                        reference and does not use fd */
2233                                 goto release_c;
2234                         }
2235                         LM_DBG("c=%p, n=%d\n", c, n);
2236                         n=receive_fd(unix_tcp_sock, &tmp, sizeof(tmp), &fd, MSG_WAITALL);
2237                         if (unlikely(n<=0)){
2238                                 LM_ERR("failed to get fd(receive_fd): %s (%d)\n",
2239                                                 strerror(errno), errno);
2240                                 n=-1;
2241                                 do_close_fd=0;
2242                                 goto release_c;
2243                         }
2244                         /* handle fd closed or bad connection/error
2245                                 (it's possible that this happened in the time between
2246                                 we found the intial connection and the time when we get
2247                                 the fd)
2248                          */
2249                         if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2250                                 if (unlikely(c!=tmp && tmp!=0))
2251                                         BUG("tcp_send: get_fd: got different connection:"
2252                                                 "  %p (id= %d, refcnt=%d state=%d) != "
2253                                                 "  %p (n=%d)\n",
2254                                                   c,   c->id,   atomic_get(&c->refcnt),   c->state,
2255                                                   tmp, n
2256                                                 );
2257                                 n=-1; /* fail */
2258                                 /* don't cache fd & close it */
2259                                 do_close_fd = (fd==-1)?0:1;
2260 #ifdef TCP_FD_CACHE
2261                                 use_fd_cache = 0;
2262 #endif /* TCP_FD_CACHE */
2263                                 goto end; /* no send: close the fd (if any), release c */
2264                         }
2265                         LM_DBG("after receive_fd: c= %p n=%d fd=%d\n",c, n, fd);
2266                 }
2267         
2268 #ifdef USE_TLS
2269                 if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2270                         /* for TLS the TLS processing and the send must happen
2271                            atomically w/ respect to other sends on the same connection
2272                            (otherwise reordering might occur which would break TLS) =>
2273                            lock.
2274                         */
2275                         response[1] = CONN_NOP;
2276                         t_buf = buf;
2277                         t_len = len;
2278                         lock_get(&c->write_lock);
                                     /* the write lock is held here, so tcpconn_do_send() is
                                        called with locked=1 */
2279                                 do {
2280                                         t_send_flags = send_flags;
2281                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2282                                                                         &t_send_flags);
2283                                         if (likely(n > 0)) {
2284                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2285                                                                                                 &resp, 1);
2286                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2287                                                                         resp == CONN_ERROR))
2288                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2289                                                            unless error */
2290                                                         response[1] = resp;
2291                                         } else if (unlikely(n < 0)) {
2292                                                 response[1] = CONN_ERROR;
2293                                                 break;
2294                                         }
2295                                         /* else do nothing for n (t_len) == 0, keep
2296                                            the last reponse */
2297                                         t_buf = rest_buf;
2298                                         t_len = rest_len;
2299                                 } while(unlikely(rest_len && n > 0));
2300                         lock_release(&c->write_lock);
2301                 } else
2302 #endif
                             /* locked=0: tcpconn_do_send() takes and releases
                                c->write_lock itself */
2303                         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 0);
2304         if (unlikely(response[1] != CONN_NOP)) {
2305 error:
                     /* also entered by goto from the queueing code above, with
                        response[1]==CONN_ERROR, n==-1 and do_close_fd==0 (fd is
                        not used in that case) */
2306                 response[0]=(long)c;
2307                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2308                         BUG("tcp_main command %ld sending failed (write):%s (%d)\n",
2309                                         response[1], strerror(errno), errno);
2310                         /* all commands != CONN_NOP returned by tcpconn_do_send()
2311                            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2312                            => if sending the command fails we have to dec. refcnt by hand
2313                          */
2314                         tcpconn_chld_put(c); /* deref. it manually */
2315                         n=-1;
2316                 }
2317                 /* here refcnt for c is already decremented => c contents can no
2318                    longer be used and refcnt _must_ _not_ be decremented again
2319                    on exit */
2320                 if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2321                         /* on error or eof, remove from cache or close fd */
2322 #ifdef TCP_FD_CACHE
2323                         if (unlikely(fd_cache_e)){
2324                                 tcp_fd_cache_rm(fd_cache_e);
2325                                 fd_cache_e = 0;
2326                                 tcp_safe_close(fd);
2327                         }else
2328 #endif /* TCP_FD_CACHE */
2329                                 if (do_close_fd) tcp_safe_close(fd);
2330                 } else if (response[1] == CONN_QUEUED_WRITE) {
                             /* the write was queued without error: keep the fd in the
                                cache if caching is on and it is not cached yet,
                                otherwise close it if it is a temporary one */
2331 #ifdef TCP_FD_CACHE
2332                         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2333                                 tcp_fd_cache_add(c, fd);
2334                         }else
2335 #endif /* TCP_FD_CACHE */
2336                                 if (do_close_fd) tcp_safe_close(fd);
2337                 } else {
2338                         BUG("unexpected tcpconn_do_send() return & response: %d, %ld\n",
2339                                         n, response[1]);
2340                 }
2341                 return n; /* no tcpconn_put */
2342         }
2343 end:
             /* reached after a send that returned CONN_NOP, or by goto when the
                fd/connection read back was unusable (use_fd_cache is 0 then) */
2344 #ifdef TCP_FD_CACHE
2345         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2346                 tcp_fd_cache_add(c, fd);
2347         }else
2348 #endif /* TCP_FD_CACHE */
2349         if (do_close_fd) {
2350                 if (unlikely(tcp_safe_close(fd) < 0))
2351                         LM_ERR("closing temporary send fd for %p: %s: "
2352                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2353                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2354                                         fd, c->flags, strerror(errno), errno);
2355         }
2356         /* here we can have only commands that _do_ _not_ dec refcnt.
2357            (CONN_EOF, CON_ERROR, CON_QUEUED_WRITE are all treated above) */
2358 release_c:
2359         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2360         return n;
2361 }
2362
2363
2364
2365 /* unsafe send on a known tcp connection.
2366  * Directly send on a known tcp connection with a given fd.
2367  * It is assumed that the connection locks are already held.
2368  * Side effects: if needed it will send state update commands to
2369  *  tcp_main (e.g. CON_EOF, CON_ERROR, CON_QUEUED_WRITE).
2370  * @param fd - fd used for sending.
2371  * @param c - existing tcp connection pointer (state and flags might be
2372  *            changed).
2373  * @param buf - data to be sent.
2374  * @param len - data length.
2375  * @param send_flags
2376  * @return <0 on error, number of bytes sent on success.
2377  */
2378 int tcpconn_send_unsafe(int fd, struct tcp_connection *c,
2379                                                 const char* buf, unsigned len, snd_flags_t send_flags)
2380 {
2381         int n;
2382         long response[2];
2383         
2384         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 1);
2385         if (unlikely(response[1] != CONN_NOP)) {
2386                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2387                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2388                    => increment it (we don't want the connection to be destroyed
2389                    from under us)
2390                  */
2391                 atomic_inc(&c->refcnt);
2392                 response[0]=(long)c;
2393                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2394                         BUG("connection %p command %ld sending failed (write):%s (%d)\n",
2395                                         c, response[1], strerror(errno), errno);
2396                         /* send failed => deref. it back by hand */
2397                         tcpconn_chld_put(c); 
2398                         n=-1;
2399                 }
2400                 /* here refcnt for c is already decremented => c contents can no
2401                    longer be used and refcnt _must_ _not_ be decremented again
2402                    on exit */
2403                 return n;
2404         }
2405         return n;
2406 }
2407
2408
2409
2410 /** lower level send (connection and fd should be known).
2411  * It takes care of possible write-queueing, blacklisting a.s.o.
2412  * It expects a valid tcp connection. It doesn't touch the ref. cnts.
2413  * It will also set the connection flags from send_flags (it's better
2414  * to do it here, because it's guaranteed to be under lock).
2415  * @param fd - fd used for sending.
2416  * @param c - existing tcp connection pointer (state and flags might be
2417  *            changed).
2418  * @param buf - data to be sent.
2419  * @param len - data length.
2420  * @param send_flags
2421  * @param resp - filled with a cmd. for tcp_main:
2422  *                      CONN_NOP - nothing needs to be done (do not send
2423  *                                 anything to tcp_main).
2424  *                      CONN_ERROR - error, connection should be closed.
2425  *                      CONN_EOF - no error, but connection should be closed.
2426  *                      CONN_QUEUED_WRITE - new write queue (connection
2427  *                                 should be watched for write and the wr.
2428  *                                 queue flushed).
2429  * @param locked - if set assume the connection is already locked (call from
2430  *                  tls) and do not lock/unlock the connection.
2431  * @return >=0 on success, < 0 on error && *resp == CONN_ERROR.
2432  *
2433  */
2434 static int tcpconn_do_send(int fd, struct tcp_connection* c,
2435                                                         const char* buf, unsigned len,
2436                                                         snd_flags_t send_flags, long* resp,
2437                                                         int locked)
2438 {
2439         int  n;
2440 #ifdef TCP_ASYNC
2441         int enable_write_watch;
2442 #endif /* TCP_ASYNC */
2443
2444         LM_DBG("sending...\n");
2445         *resp = CONN_NOP;
2446         if (likely(!locked)) lock_get(&c->write_lock);
2447         /* update connection send flags with the current ones */
2448         tcpconn_set_send_flags(c, send_flags);
2449 #ifdef TCP_ASYNC
2450         if (likely(cfg_get(tcp, tcp_cfg, async))){
2451                 if (_wbufq_non_empty(c)
2452 #ifdef TCP_CONNECT_WAIT
2453                         || (c->flags&F_CONN_PENDING) 
2454 #endif /* TCP_CONNECT_WAIT */
2455                         ){
2456                         if (unlikely(_wbufq_add(c, buf, len)<0)){
2457                                 if (likely(!locked)) lock_release(&c->write_lock);
2458                                 n=-1;
2459                                 goto error;
2460                         }
2461                         if (likely(!locked)) lock_release(&c->write_lock);
2462                         n=len;
2463                         goto end;
2464                 }
2465                 n=_tcpconn_write_nb(fd, c, buf, len);
2466         }else{
2467 #endif /* TCP_ASYNC */
2468                 /* n=tcp_blocking_write(c, fd, buf, len); */
2469                 n=tsend_stream(fd, buf, len,
2470                                                 TICKS_TO_S(cfg_get(tcp, tcp_cfg, send_timeout)) *
2471                                                 1000);
2472 #ifdef TCP_ASYNC
2473         }
2474 #else /* ! TCP_ASYNC */
2475         if (likely(!locked)) lock_release(&c->write_lock);
2476 #endif /* TCP_ASYNC */
2477         
2478         LM_DBG("after real write: c= %p n=%d fd=%d\n",c, n, fd);
2479         LM_DBG("buf=\n%.*s\n", (int)len, buf);
2480         if (unlikely(n<(int)len)){
2481 #ifdef TCP_ASYNC
2482                 if (cfg_get(tcp, tcp_cfg, async) &&
2483                                 ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK)){
2484                         enable_write_watch=_wbufq_empty(c);
2485                         if (n<0) n=0;
2486                         else if (unlikely(c->state==S_CONN_CONNECT ||
2487                                                 c->state==S_CONN_ACCEPT)){
2488                                 TCP_STATS_ESTABLISHED(c->state);
2489                                 c->state=S_CONN_OK; /* something was written */
2490                         }
2491                         if (unlikely(_wbufq_add(c, buf+n, len-n)<0)){
2492                                 if (likely(!locked)) lock_release(&c->write_lock);
2493                                 n=-1;
2494                                 goto error;
2495                         }
2496                         if (likely(!locked)) lock_release(&c->write_lock);
2497                         n=len;
2498                         if (likely(enable_write_watch))
2499                                 *resp=CONN_QUEUED_WRITE;
2500                         goto end;
2501                 }else{
2502                         if (likely(!locked)) lock_release(&c->write_lock);
2503                 }
2504 #endif /* TCP_ASYNC */
2505                 if (unlikely(c->state==S_CONN_CONNECT)){
2506                         switch(errno){
2507                                 case ENETUNREACH:
2508                                 case EHOSTUNREACH: /* not posix for send() */
2509 #ifdef USE_DST_BLACKLIST
2510                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2511                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2512 #endif /* USE_DST_BLACKLIST */
2513                                         TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2514                                                                         TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2515                                         break;
2516                                 case ECONNREFUSED:
2517                                 case ECONNRESET:
2518 #ifdef USE_DST_BLACKLIST
2519                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2520                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2521 #endif /* USE_DST_BLACKLIST */
2522                                         TCP_EV_CONNECT_RST(errno, TCP_LADDR(c), TCP_LPORT(c),
2523                                                                                 TCP_PSU(c), TCP_PROTO(c));
2524                                         break;
2525                                 default:
2526                                         TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c), TCP_LPORT(c),
2527                                                                                 TCP_PSU(c), TCP_PROTO(c));
2528                                 }
2529                         TCP_STATS_CONNECT_FAILED();
2530                 }else{
2531                         switch(errno){
2532                                 case ECONNREFUSED:
2533                                 case ECONNRESET:
2534                                         TCP_STATS_CON_RESET();
2535                                         /* no break */
2536                                 case ENETUNREACH:
2537                                 /*case EHOSTUNREACH: -- not posix */
2538 #ifdef USE_DST_BLACKLIST
2539                                         dst_blacklist_su(BLST_ERR_SEND, c->rcv.proto,
2540                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2541 #endif /* USE_DST_BLACKLIST */
2542                                         break;
2543                         }
2544                 }
2545                 LM_ERR("failed to send on %p (%s:%d->%s): %s (%d)\n",
2546                                         c, ip_addr2a(&c->rcv.dst_ip), c->rcv.dst_port,
2547                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2548                                         strerror(errno), errno);
2549                 n = -1;
2550 #ifdef TCP_ASYNC
2551 error:
2552 #endif /* TCP_ASYNC */
2553                 /* error on the connection , mark it as bad and set 0 timeout */
2554                 c->state=S_CONN_BAD;
2555                 c->timeout=get_ticks_raw();
2556                 /* tell "main" it should drop this (optional it will t/o anyway?)*/
2557                 *resp=CONN_ERROR;
2558                 return n; /* error return, no tcpconn_put */
2559         }
2560         
2561 #ifdef TCP_ASYNC
2562         if (likely(!locked)) lock_release(&c->write_lock);
2563 #endif /* TCP_ASYNC */
2564         /* in non-async mode here we're either in S_CONN_OK or S_CONN_ACCEPT*/
2565         if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
2566                         TCP_STATS_ESTABLISHED(c->state);
2567                         c->state=S_CONN_OK;
2568         }
2569         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2570                 /* close after write => send EOF request to tcp_main */
2571                 c->state=S_CONN_BAD;
2572                 c->timeout=get_ticks_raw();
2573                 /* tell "main" it should drop this*/
2574                 *resp=CONN_EOF;
2575                 return n;
2576         }
2577 end:
2578         return n;
2579 }
2580
2581
2582
2583 /** low level 1st send on a new connection.
2584  * It takes care of possible write-queueing, blacklisting a.s.o.
2585  * It expects a valid just-opened tcp connection. It doesn't touch the 
2586  * ref. counters. It's used only in the async first send case.
2587  * @param fd - fd used for sending.
2588  * @param c - existing tcp connection pointer (state and flags might be
2589  *            changed). The connection must be new (no previous send on it).
2590  * @param buf - data to be sent.
2591  * @param len - data length.
2592  * @param send_flags
2593  * @param resp - filled with a fd sending cmd. for tcp_main on success. It
2594  *                      _must_ be one of the commands listed below:
2595  *                      CONN_NEW_PENDING_WRITE - new connection, first write
2596  *                                 was partially successful (or EAGAIN) and
2597  *                                 was queued (connection should be watched
2598  *                                 for write and the write queue flushed).
2599  *                                 The fd should be sent to tcp_main.
2600  *                      CONN_NEW_COMPLETE - new connection, first write
2601  *                                 completed successfully and no data is
2602  *                                 queued. The fd should be sent to tcp_main.
2603  *                      CONN_EOF - no error, but the connection should be
2604  *                                  closed (e.g. SND_F_CON_CLOSE send flag).
2605  *                      CONN_ERROR - error, _must_ return < 0.
2606  * @param locked - if set assume the connection is already locked (call from
2607  *                  tls) and do not lock/unlock the connection.
2608  * @return >=0 on success, < 0 on error (on error *resp is undefined).
2609  *
2610  */
2611 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
2612                                                         const char* buf, unsigned len,
2613                                                         snd_flags_t send_flags, long* resp,
2614                                                         int locked)
2615 {
2616         int n;
2617         
2618         n=_tcpconn_write_nb(fd, c, buf, len);
2619         if (unlikely(n<(int)len)){
2620                 /* on EAGAIN or ENOTCONN return success.
2621                    ENOTCONN appears on newer FreeBSD versions (non-blocking socket,
2622                    connect() & send immediately) */
2623                 if ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK || errno==ENOTCONN){
2624                         LM_DBG("pending write on new connection %p "
2625                                 "(%d/%d bytes written)\n", c, n, len);
2626                         if (unlikely(n<0)) n=0;
2627                         else{
2628                                 if (likely(c->state == S_CONN_CONNECT))
2629                                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2630                                 c->state=S_CONN_OK; /* partial write => connect()
2631                                                                                                 ended */
2632                         }
2633                         /* add to the write queue */
2634                         if (likely(!locked)) lock_get(&c->write_lock);
2635                                 if (unlikely(_wbufq_insert(c, buf+n, len-n)<0)){
2636                                         if (likely(!locked)) lock_release(&c->write_lock);
2637                                         n=-1;
2638                                         LM_ERR("%s: EAGAIN and write queue full or failed for %p\n",
2639                                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
2640                                         goto error;
2641                                 }
2642                         if (likely(!locked)) lock_release(&c->write_lock);
2643                         /* send to tcp_main */
2644                         *resp=CONN_NEW_PENDING_WRITE;
2645                         n=len;
2646                         goto end;
2647                 }
2648                 /* n < 0 and not EAGAIN => write error */
2649                 /* if first write failed it's most likely a
2650                    connect error */
2651                 switch(errno){
2652                         case ENETUNREACH:
2653                         case EHOSTUNREACH:  /* not posix for send() */
2654 #ifdef USE_DST_BLACKLIST
2655                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2656                                                                         &c->rcv.src_su, &c->send_flags, 0);
2657 #endif /* USE_DST_BLACKLIST */
2658                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2659                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2660                                 break;
2661                         case ECONNREFUSED:
2662                         case ECONNRESET:
2663 #ifdef USE_DST_BLACKLIST
2664                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2665                                                                         &c->rcv.src_su, &c->send_flags, 0);
2666 #endif /* USE_DST_BLACKLIST */
2667                                 TCP_EV_CONNECT_RST(errno, TCP_LADDR(c),
2668                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2669                                 break;
2670                         default:
2671                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
2672                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2673                 }
2674                 /* error: destroy it directly */
2675                 TCP_STATS_CONNECT_FAILED();
2676                 LM_ERR("%s: connect & send  for %p failed:" " %s (%d)\n",
2677                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2678                                         c, strerror(errno), errno);
2679                 goto error;
2680         }
2681         LM_INFO("quick connect for %p\n", c);
2682         if (likely(c->state == S_CONN_CONNECT))
2683                 TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2684         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2685                 /* close after write =>  EOF => close immediately */
2686                 c->state=S_CONN_BAD;
2687                 /* tell our caller that it should drop this*/
2688                 *resp=CONN_EOF;
2689         }else{
2690                 c->state=S_CONN_OK;
2691                 /* send to tcp_main */
2692                 *resp=CONN_NEW_COMPLETE;
2693         }
2694 end:
2695         return n; /* >= 0 */
2696 error:
2697         *resp=CONN_ERROR;
2698         return -1;
2699 }
2700
2701
2702
2703 int tcp_init(struct socket_info* sock_info)
2704 {
2705         union sockaddr_union* addr;
2706         int optval;
2707 #ifdef HAVE_TCP_ACCEPT_FILTER
2708         struct accept_filter_arg afa;
2709 #endif /* HAVE_TCP_ACCEPT_FILTER */
2710 #ifdef DISABLE_NAGLE
2711         int flag;
2712         struct protoent* pe;
2713
2714         if (tcp_proto_no==-1){ /* if not already set */
2715                 pe=getprotobyname("tcp");
2716                 if (pe==0){
2717                         LM_ERR("could not get TCP protocol number\n");
2718                         tcp_proto_no=-1;
2719                 }else{
2720                         tcp_proto_no=pe->p_proto;
2721                 }
2722         }
2723 #endif
2724
2725         addr=&sock_info->su;
2726         /* sock_info->proto=PROTO_TCP; */
2727         if (init_su(addr, &sock_info->address, sock_info->port_no)<0){
2728                 LM_ERR("could no init sockaddr_union\n");
2729                 goto error;
2730         }
2731         LM_DBG("added %s\n", su2a(addr, sizeof(*addr)));
2732         sock_info->socket=socket(AF2PF(addr->s.sa_family), SOCK_STREAM, 0);
2733         if (sock_info->socket==-1){
2734                 LM_ERR("tcp_init: socket: %s\n", strerror(errno));
2735                 goto error;
2736         }
2737 #ifdef DISABLE_NAGLE
2738         flag=1;
2739         if ( (tcp_proto_no!=-1) &&
2740                  (setsockopt(sock_info->socket, tcp_proto_no , TCP_NODELAY,
2741                                          &flag, sizeof(flag))<0) ){
2742                 LM_ERR("could not disable Nagle: %s\n", strerror(errno));
2743         }
2744 #endif
2745
2746
2747 #if  !defined(TCP_DONT_REUSEADDR) 
2748         /* Stevens, "Network Programming", Section 7.5, "Generic Socket
2749      * Options": "...server started,..a child continues..on existing
2750          * connection..listening server is restarted...call to bind fails
2751          * ... ALL TCP servers should specify the SO_REUSEADDRE option 
2752          * to allow the server to be restarted in this situation
2753          *
2754          * Indeed, without this option, the server can't restart.
2755          *   -jiri
2756          */
2757         optval=1;
2758         if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEADDR,
2759                                 (void*)&optval, sizeof(optval))==-1) {
2760                 LM_ERR("setsockopt %s\n", strerror(errno));
2761                 goto error;
2762         }