54288e908fac473b3ef68475428d373b5af35555
[sip-router] / src / core / tcp_main.c
1 /*
2  * Copyright (C) 2001-2003 FhG Fokus
3  *
4  * This file is part of Kamailio, a free SIP server.
5  *
6  * Kamailio is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version
10  *
11  * Kamailio is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19  */
20
21 /** Kamailio core: tcp main/dispatcher and tcp send functions.
22  * @file tcp_main.c
23  * @ingroup core
24  * Module: @ref core
25  */
26
27
28 #ifdef USE_TCP
29
30
31 #define HANDLE_IO_INLINE
32 #include "io_wait.h" /* include first to make sure the needed features are
33                                                 turned on (e.g. _GNU_SOURCE for POLLRDHUP) */
34
35 #include <sys/time.h>
36 #include <sys/types.h>
37 #include <sys/select.h>
38 #include <sys/socket.h>
39 #ifdef HAVE_FILIO_H
40 #include <sys/filio.h> /* needed on solaris 2.x for FIONREAD */
41 #elif defined __OS_solaris
42 #define BSD_COMP  /* needed on older solaris for FIONREAD */
43 #endif /* HAVE_FILIO_H / __OS_solaris */
44 #include <sys/ioctl.h>  /* ioctl() used on write error */
45 #include <arpa/inet.h>  /* for inet_pton() */
46 #include <netinet/in.h>
47 #include <netinet/in_systm.h>
48 #include <netinet/ip.h>
49 #include <netinet/tcp.h>
50 #include <sys/uio.h>  /* writev*/
51 #include <netdb.h>
52 #include <stdlib.h> /*exit() */
53 #include <stdint.h> /* UINT32_MAX */
54
55 #include <unistd.h>
56
57 #include <errno.h>
58 #include <string.h>
59
60 #ifdef HAVE_SELECT
61 #include <sys/select.h>
62 #endif
63 #include <poll.h>
64
65
66 #include "ip_addr.h"
67 #include "pass_fd.h"
68 #include "tcp_conn.h"
69 #include "globals.h"
70 #include "pt.h"
71 #include "locking.h"
72 #include "mem/mem.h"
73 #include "mem/shm_mem.h"
74 #include "timer.h"
75 #include "sr_module.h"
76 #include "tcp_server.h"
77 #include "tcp_init.h"
78 #include "tcp_int_send.h"
79 #include "tcp_stats.h"
80 #include "tcp_ev.h"
81 #include "tsend.h"
82 #include "timer_ticks.h"
83 #include "local_timer.h"
84 #ifdef CORE_TLS
85 #include "tls/tls_server.h"
86 #define tls_loaded() 1
87 #else
88 #include "tls_hooks_init.h"
89 #include "tls_hooks.h"
90 #endif /* CORE_TLS*/
91 #ifdef USE_DST_BLACKLIST
92 #include "dst_blacklist.h"
93 #endif /* USE_DST_BLACKLIST */
94
95 #include "tcp_info.h"
96 #include "tcp_options.h"
97 #include "ut.h"
98 #include "cfg/cfg_struct.h"
99
100 #include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
101
102
103 #ifdef NO_MSG_DONTWAIT
104 #ifndef MSG_DONTWAIT
105 /* should work inside tcp_main */
106 #define MSG_DONTWAIT 0
107 #endif
108 #endif /*NO_MSG_DONTWAIT */
109
110
/* Do not pass a new connection to a child immediately; wait for some data
 * to arrive on it first. */
#define TCP_PASS_NEW_CONNECTION_ON_DATA
/* NOTE(review): presumably the backlog handed to listen() for tcp sockets --
 * the user of this constant is outside this view, confirm. */
#define TCP_LISTEN_BACKLOG 1024
/* Queue "send fd" requests on EAGAIN instead of sending them immediately. */
#define SEND_FD_QUEUE
#define TCP_CHILD_NON_BLOCKING 
/* SEND_FD_QUEUE implies TCP_CHILD_NON_BLOCKING: it gets defined below if it
 * is not already defined. */
#ifdef SEND_FD_QUEUE
#ifndef TCP_CHILD_NON_BLOCKING
#define TCP_CHILD_NON_BLOCKING
#endif
/* the maximum queue size follows tcp_main_max_fd_no */
#define MAX_SEND_FD_QUEUE_SIZE  tcp_main_max_fd_no
#define SEND_FD_QUEUE_SIZE              128  /* initial size */
#define SEND_FD_QUEUE_TIMEOUT   MS_TO_TICKS(2000)  /* 2 s */
#endif

/* minimum interval local_timer_run() is allowed to run, in ticks */
#define TCPCONN_TIMEOUT_MIN_RUN 1  /* once per tick */
#define TCPCONN_WAIT_TIMEOUT 1 /* 1 tick */

#ifdef TCP_ASYNC
/* Total number of bytes currently queued in all the per-connection write
 * queues; the write-queue helpers compare it against the tcp_wq_max config
 * value and update it with atomic_add_int(). */
static unsigned int* tcp_total_wq=0;
#endif
134
135
/* Kinds of file descriptors distinguished by this file.
 * NOTE(review): presumably the type tag kept for every fd watched by the tcp
 * main I/O loop -- the users are outside this view, confirm. */
enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
                                F_TCPCONN, F_TCPCHILD, F_PROC };
138
139
#ifdef TCP_FD_CACHE

/* number of entries in the fd cache */
#define TCP_FD_CACHE_SIZE 8

/* One fd cache slot: a connection pointer together with an id and an fd.
 * NOTE(review): presumably id is the connection id and fd the local
 * descriptor cached for that connection -- the code using the cache is
 * outside this view, confirm. */
struct fd_cache_entry{
        struct tcp_connection* con;
        int id;
        int fd;
};


/* the cache itself, a fixed array of TCP_FD_CACHE_SIZE slots */
static struct fd_cache_entry fd_cache[TCP_FD_CACHE_SIZE];
#endif /* TCP_FD_CACHE */
153
/* NOTE(review): presumably non-zero only inside the tcp main process -- it is
 * set outside this view, confirm. */
static int is_tcp_main=0;


enum poll_types tcp_poll_method=0; /* by default choose the best method */
int tcp_main_max_fd_no=0;
int tcp_max_connections=DEFAULT_TCP_MAX_CONNECTIONS;
int tls_max_connections=DEFAULT_TLS_MAX_CONNECTIONS;

/* Default source addresses, filled in by tcp_set_src_addr(). The pointers
 * stay 0 until an address of the corresponding family has been set. */
static union sockaddr_union tcp_source_ipv4_addr; /* saved bind/src v4 addr. */
static union sockaddr_union* tcp_source_ipv4=0;
static union sockaddr_union tcp_source_ipv6_addr; /* saved bind/src v6 addr. */
static union sockaddr_union* tcp_source_ipv6=0;

static int* tcp_connections_no=0; /* current tcp (+tls) open connections */
static int* tls_connections_no=0; /* current tls open connections */

/* connection hash table (after ip&port), also includes the aliases */
struct tcp_conn_alias** tcpconn_aliases_hash=0;
/* connection hash table (after connection id) */
struct tcp_connection** tcpconn_id_hash=0;
gen_lock_t* tcpconn_lock=0;

struct tcp_child* tcp_children=0;
static int* connection_id=0; /* unique for each connection, used for
                                quickly finding the corresponding connection
                                for a reply */
int unix_tcp_sock;

static int tcp_proto_no=-1; /* tcp protocol number as returned by
                               getprotobyname; while it is -1,
                               init_sock_opt() skips TCP_NODELAY */

static io_wait_h io_h;

static struct local_timer tcp_main_ltimer;
static ticks_t tcp_main_prev_ticks;

/* tells if there are tcp workers that should handle only a specific socket
 * - used to optimize the search for the least loaded worker for a tcp socket
 * - 0 - no workers per tcp socket have been set
 * - 1 + generic_workers - when there are workers per tcp socket
 */
static int tcp_sockets_gworkers = 0;

/* forward declarations (the definitions come later in this file) */
static ticks_t tcpconn_main_timeout(ticks_t , struct timer_ln* , void* );

inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
                                                                                struct ip_addr* l_ip, int l_port,
                                                                                int flags);
202
203
204
205 /* sets source address used when opening new sockets and no source is specified
206  *  (by default the address is choosen by the kernel)
207  * Should be used only on init.
208  * returns -1 on error */
209 int tcp_set_src_addr(struct ip_addr* ip)
210 {
211         switch (ip->af){
212                 case AF_INET:
213                         ip_addr2su(&tcp_source_ipv4_addr, ip, 0);
214                         tcp_source_ipv4=&tcp_source_ipv4_addr;
215                         break;
216                 case AF_INET6:
217                         ip_addr2su(&tcp_source_ipv6_addr, ip, 0);
218                         tcp_source_ipv6=&tcp_source_ipv6_addr;
219                         break;
220                 default:
221                         return -1;
222         }
223         return 0;
224 }
225
226
227
/* Applies the tcp keepalive configuration to socket s.
 * SO_KEEPALIVE is turned on when the keepalive config value is set; the
 * keepintvl, keepidle and keepcnt values are applied only when non-zero.
 * Each option is compiled in only if the matching HAVE_* macro is defined.
 * @return -1 if enabling SO_KEEPALIVE failed, 0 otherwise (failures to set
 *         the other options are only logged). */
static inline int init_sock_keepalive(int s)
{
	int val;

#ifdef HAVE_SO_KEEPALIVE
	if (cfg_get(tcp, tcp_cfg, keepalive)){
		val=1;
		if (setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val))<0){
			LM_WARN("failed to enable SO_KEEPALIVE: %s\n", strerror(errno));
			return -1;
		}
	}
#endif
#ifdef HAVE_TCP_KEEPINTVL
	val=cfg_get(tcp, tcp_cfg, keepintvl);
	if (val!=0 &&
			setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val))<0){
		LM_WARN("failed to set keepalive probes interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPIDLE
	val=cfg_get(tcp, tcp_cfg, keepidle);
	if (val!=0 &&
			setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val))<0){
		LM_WARN("failed to set keepalive idle interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPCNT
	val=cfg_get(tcp, tcp_cfg, keepcnt);
	if (val!=0 &&
			setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val))<0){
		LM_WARN("failed to set maximum keepalive count: %s\n", strerror(errno));
	}
#endif
	return 0;
}
268
269
270
271 /* set all socket/fd options for new sockets (e.g. before connect): 
272  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
273  *
274  * return -1 on error */
275 static int init_sock_opt(int s, int af)
276 {
277         int flags;
278         int optval;
279         
280 #ifdef DISABLE_NAGLE
281         flags=1;
282         if ( (tcp_proto_no!=-1) && (setsockopt(s, tcp_proto_no , TCP_NODELAY,
283                                         &flags, sizeof(flags))<0) ){
284                 LM_WARN("could not disable Nagle: %s\n", strerror(errno));
285         }
286 #endif
287         /* tos*/
288         optval = tos;
289         if(af==AF_INET){
290                 if (setsockopt(s, IPPROTO_IP, IP_TOS, (void*)&optval,
291                                         sizeof(optval)) ==-1){
292                         LM_WARN("setsockopt tos: %s\n", strerror(errno));
293                         /* continue since this is not critical */
294                 }
295         } else if(af==AF_INET6){
296                 if (setsockopt(s, IPPROTO_IPV6, IPV6_TCLASS,
297                                         (void*)&optval, sizeof(optval)) ==-1) {
298                         LM_WARN("setsockopt v6 tos: %s\n", strerror(errno));
299                         /* continue since this is not critical */
300                 }
301         }
302
303 #if  !defined(TCP_DONT_REUSEADDR) 
304         optval=1;
305         if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
306                                                 (void*)&optval, sizeof(optval))==-1){
307                 LM_ERR("setsockopt SO_REUSEADDR %s\n", strerror(errno));
308                 /* continue, not critical */
309         }
310 #endif /* !TCP_DONT_REUSEADDR */
311
312 #ifdef SO_REUSEPORT
313         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
314                 if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT,
315                                 (void*)&optval, sizeof(optval))==-1) {
316                         LM_ERR("setsockopt %s\n", strerror(errno));
317                 }
318         }
319 #endif
320
321 #ifdef HAVE_TCP_SYNCNT
322         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
323                 if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
324                                                 sizeof(optval))<0){
325                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
326                 }
327         }
328 #endif
329 #ifdef HAVE_TCP_LINGER2
330         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
331                 if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
332                                                 sizeof(optval))<0){
333                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
334                 }
335         }
336 #endif
337 #ifdef HAVE_TCP_QUICKACK
338         if (cfg_get(tcp, tcp_cfg, delayed_ack)){
339                 optval=0; /* reset quick ack => delayed ack */
340                 if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
341                                                 sizeof(optval))<0){
342                         LM_WARN("failed to reset TCP_QUICKACK: %s\n", strerror(errno));
343                 }
344         }
345 #endif /* HAVE_TCP_QUICKACK */
346         init_sock_keepalive(s);
347         
348         /* non-blocking */
349         flags=fcntl(s, F_GETFL);
350         if (flags==-1){
351                 LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
352                 goto error;
353         }
354         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
355                 LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
356                 goto error;
357         }
358         return 0;
359 error:
360         return -1;
361 }
362
363
364
/* Sets all the socket/fd options for "accepted" sockets.
 *  Only non-blocking mode is set, since the rest is inherited from the
 *  "parent" (listening) socket.
 *  Note: setting O_NONBLOCK is required on linux, but it is not needed on
 *        BSD and possibly solaris (where the flag is inherited from the
 *        parent socket). However, since no standard document requires a
 *        specific behaviour in this case, it is safer to always set it
 *        (at least for now)  --andrei
 *  TODO: check on which OSes O_NONBLOCK is inherited and make this
 *        function a nop there.
 *
 * @param s - accepted socket.
 * @return 0 on success, -1 on error. */
static int init_sock_opt_accept(int s)
{
	int fd_flags;

	/* read the current file status flags, then add O_NONBLOCK to them */
	fd_flags=fcntl(s, F_GETFL);
	if (fd_flags==-1){
		LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	if (fcntl(s, F_SETFL, fd_flags|O_NONBLOCK)==-1){
		LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	return 0;
}
395
396
397
/** close a socket, handling errno.
 * The close() is repeated for as long as it fails with EINTR.
 * Expected socket level errors are filtered out: success is returned if
 * close() failed because of EPIPE, ENOTCONN, ECONNRESET, ECONNREFUSED,
 * ENETUNREACH or EHOSTUNREACH. Note that this happens on *BSDs (on linux
 * close() does not fail for socket level errors).
 * A negative s is silently accepted and nothing is closed.
 * @param s - open valid socket.
 * @return - 0 on success, < 0 on error (whatever close() returns). On error
 *           errno is set.
 */
static int tcp_safe_close(int s)
{
	int rc;

	if (s<0)
		return 0;

	/* retry while interrupted by a signal */
	do{
		rc=close(s);
	}while(rc<0 && errno==EINTR);

	if (rc>=0)
		return rc;
	/* on *BSD we really get these errors at close() time => ignore them */
	if (errno==EPIPE || errno==ENOTCONN || errno==ECONNRESET ||
			errno==ECONNREFUSED || errno==ENETUNREACH ||
			errno==EHOSTUNREACH)
		return 0;
	return rc;
}
435
436
437
438 /* blocking connect on a non-blocking fd; it will timeout after
439  * tcp_connect_timeout 
440  * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
441  * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
442  */
443 static int tcp_blocking_connect(int fd, int type, snd_flags_t* send_flags,
444                                                                 const struct sockaddr *servaddr,
445                                                                 socklen_t addrlen)
446 {
447         int n;
448 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
449         fd_set sel_set;
450         fd_set orig_set;
451         struct timeval timeout;
452 #else
453         struct pollfd pf;
454 #endif
455         int elapsed;
456         int to;
457         int ticks;
458         int err;
459         unsigned int err_len;
460         int poll_err;
461         
462         poll_err=0;
463         to=cfg_get(tcp, tcp_cfg, connect_timeout_s);
464         ticks=get_ticks();
465 again:
466         n=connect(fd, servaddr, addrlen);
467         if (n==-1){
468                 if (errno==EINTR){
469                         elapsed=(get_ticks()-ticks)*TIMER_TICK;
470                         if (elapsed<to)         goto again;
471                         else goto error_timeout;
472                 }
473                 if (errno!=EINPROGRESS && errno!=EALREADY){
474                         goto error_errno;
475                 }
476         }else goto end;
477         
478         /* poll/select loop */
479 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
480                 FD_ZERO(&orig_set);
481                 FD_SET(fd, &orig_set);
482 #else
483                 pf.fd=fd;
484                 pf.events=POLLOUT;
485 #endif
486         while(1){
487                 elapsed=(get_ticks()-ticks)*TIMER_TICK;
488                 if (elapsed>=to)
489                         goto error_timeout;
490 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
491                 sel_set=orig_set;
492                 timeout.tv_sec=to-elapsed;
493                 timeout.tv_usec=0;
494                 n=select(fd+1, 0, &sel_set, 0, &timeout);
495 #else
496                 n=poll(&pf, 1, (to-elapsed)*1000);
497 #endif
498                 if (n<0){
499                         if (errno==EINTR) continue;
500                         LM_ERR("%s: poll/select failed: (%d) %s\n",
501                                         su2a((union sockaddr_union*)servaddr, addrlen),
502                                         errno, strerror(errno));
503                         goto error;
504                 }else if (n==0) /* timeout */ continue;
505 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
506                 if (FD_ISSET(fd, &sel_set))
507 #else
508                 if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
509                         LM_ERR("%s: poll error: flags %x\n",
510                                         su2a((union sockaddr_union*)servaddr, addrlen),
511                                         pf.revents);
512                         poll_err=1;
513                 }
514 #endif
515                 {
516                         err_len=sizeof(err);
517                         getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
518                         if ((err==0) && (poll_err==0)) goto end;
519                         if (err!=EINPROGRESS && err!=EALREADY){
520                                 LM_ERR("%s: SO_ERROR (%d) %s\n",
521                                                 su2a((union sockaddr_union*)servaddr, addrlen),
522                                                 err, strerror(err));
523                                 errno=err;
524                                 goto error_errno;
525                         }
526                 }
527         }
528 error_errno:
529         switch(errno){
530                 case ENETUNREACH:
531                 case EHOSTUNREACH:
532 #ifdef USE_DST_BLACKLIST
533                         dst_blacklist_su(BLST_ERR_CONNECT, type,
534                                                          (union sockaddr_union*)servaddr, send_flags, 0);
535 #endif /* USE_DST_BLACKLIST */
536                         TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0,
537                                                         (union sockaddr_union*)servaddr, type);
538                         break;
539                 case ETIMEDOUT:
540 #ifdef USE_DST_BLACKLIST
541                         dst_blacklist_su(BLST_ERR_CONNECT, type,
542                                                          (union sockaddr_union*)servaddr, send_flags, 0);
543 #endif /* USE_DST_BLACKLIST */
544                         TCP_EV_CONNECT_TIMEOUT(errno, 0, 0,
545                                                         (union sockaddr_union*)servaddr, type);
546                         break;
547                 case ECONNREFUSED:
548                 case ECONNRESET:
549 #ifdef USE_DST_BLACKLIST
550                         dst_blacklist_su(BLST_ERR_CONNECT, type,
551                                                          (union sockaddr_union*)servaddr, send_flags, 0);
552 #endif /* USE_DST_BLACKLIST */
553                         TCP_EV_CONNECT_RST(errno, 0, 0,
554                                                         (union sockaddr_union*)servaddr, type);
555                         break;
556                 case EAGAIN: /* not posix, but supported on linux and bsd */
557                         TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0,
558                                                         (union sockaddr_union*)servaddr, type);
559                         break;
560                 default:
561                         TCP_EV_CONNECT_ERR(errno, 0, 0,
562                                                                 (union sockaddr_union*)servaddr, type);
563         }
564         LM_ERR("%s: (%d) %s\n",
565                         su2a((union sockaddr_union*)servaddr, addrlen),
566                         errno, strerror(errno));
567         goto error;
568 error_timeout:
569         /* timeout */
570 #ifdef USE_DST_BLACKLIST
571         dst_blacklist_su(BLST_ERR_CONNECT, type,
572                                                 (union sockaddr_union*)servaddr, send_flags, 0);
573 #endif /* USE_DST_BLACKLIST */
574         TCP_EV_CONNECT_TIMEOUT(0, 0, 0, (union sockaddr_union*)servaddr, type);
575         LM_ERR("%s: timeout %d s elapsed from %d s\n",
576                                 su2a((union sockaddr_union*)servaddr, addrlen),
577                                 elapsed, cfg_get(tcp, tcp_cfg, connect_timeout_s));
578 error:
579         TCP_STATS_CONNECT_FAILED();
580         return -1;
581 end:
582         return 0;
583 }
584
585
586
587 #ifdef TCP_ASYNC
588
589
/* True if the write buffer queue of con has no blocks (wbuf_q.first is 0).
 * Unsafe version: no locking is done here. */
#define _wbufq_empty(con) ((con)->wbuf_q.first==0)
/* True if the write buffer queue of con has at least one block.
 * Unsafe version: no locking is done here. */
#define _wbufq_non_empty(con) ((con)->wbuf_q.first!=0)
594
595
596 /* unsafe version, call while holding the connection write lock */
597 inline static int _wbufq_add(struct  tcp_connection* c, const char* data, 
598                                                         unsigned int size)
599 {
600         struct tcp_wbuffer_queue* q;
601         struct tcp_wbuffer* wb;
602         unsigned int last_free;
603         unsigned int wb_size;
604         unsigned int crt_size;
605         ticks_t t;
606         
607         q=&c->wbuf_q;
608         t=get_ticks_raw();
609         if (unlikely(   ((q->queued+size)>cfg_get(tcp, tcp_cfg, tcpconn_wq_max)) ||
610                                         ((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max)) ||
611                                         (q->first &&
612                                         TICKS_LT(q->wr_timeout, t)) )){
613                 LM_ERR("(%d bytes): write queue full or timeout "
614                                         " (%d, total %d, last write %d s ago)\n",
615                                         size, q->queued, *tcp_total_wq,
616                                         TICKS_TO_S(t-(q->wr_timeout-
617                                                                 cfg_get(tcp, tcp_cfg, send_timeout))));
618                 if (q->first && TICKS_LT(q->wr_timeout, t)){
619                         if (unlikely(c->state==S_CONN_CONNECT)){
620 #ifdef USE_DST_BLACKLIST
621                                 (void)dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
622                                                                                 &c->rcv.src_su, &c->send_flags, 0);
623 #endif /* USE_DST_BLACKLIST */
624                                 TCP_EV_CONNECT_TIMEOUT(0, TCP_LADDR(c), TCP_LPORT(c),
625                                                                                         TCP_PSU(c), TCP_PROTO(c));
626                                 TCP_STATS_CONNECT_FAILED();
627                         }else{
628 #ifdef USE_DST_BLACKLIST
629                                 (void)dst_blacklist_su( BLST_ERR_SEND, c->rcv.proto,
630                                                                         &c->rcv.src_su, &c->send_flags, 0);
631 #endif /* USE_DST_BLACKLIST */
632                                 TCP_EV_SEND_TIMEOUT(0, &c->rcv);
633                                 TCP_STATS_SEND_TIMEOUT();
634                         }
635                 }else{
636                         /* if it's not a timeout => queue full */
637                         TCP_EV_SENDQ_FULL(0, &c->rcv);
638                         TCP_STATS_SENDQ_FULL();
639                 }
640                 goto error;
641         }
642         
643         if (unlikely(q->last==0)){
644                 wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
645                 wb=shm_malloc(sizeof(*wb)+wb_size-1);
646                 if (unlikely(wb==0)) {
647                         SHM_MEM_ERROR;
648                         goto error;
649                 }
650                 wb->b_size=wb_size;
651                 wb->next=0;
652                 q->last=wb;
653                 q->first=wb;
654                 q->last_used=0;
655                 q->offset=0;
656                 q->wr_timeout=get_ticks_raw()+
657                         ((c->state==S_CONN_CONNECT)?
658                                         S_TO_TICKS(cfg_get(tcp, tcp_cfg, connect_timeout_s)):
659                                         cfg_get(tcp, tcp_cfg, send_timeout));
660         }else{
661                 wb=q->last;
662         }
663         
664         while(size){
665                 last_free=wb->b_size-q->last_used;
666                 if (last_free==0){
667                         wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
668                         wb=shm_malloc(sizeof(*wb)+wb_size-1);
669                         if (unlikely(wb==0)) {
670                                 SHM_MEM_ERROR;
671                                 goto error;
672                         }
673                         wb->b_size=wb_size;
674                         wb->next=0;
675                         q->last->next=wb;
676                         q->last=wb;
677                         q->last_used=0;
678                         last_free=wb->b_size;
679                 }
680                 crt_size=MIN_unsigned(last_free, size);
681                 memcpy(wb->buf+q->last_used, data, crt_size);
682                 q->last_used+=crt_size;
683                 size-=crt_size;
684                 data+=crt_size;
685                 q->queued+=crt_size;
686                 atomic_add_int((int*)tcp_total_wq, crt_size);
687         }
688         return 0;
689 error:
690         return -1;
691 }
692
693
694
695 /* unsafe version, call while holding the connection write lock
696  * inserts data at the beginning, it ignores the max queue size checks and
697  * the timeout (use sparingly)
698  * Note: it should never be called on a write buffer after wbufq_run() */
699 inline static int _wbufq_insert(struct  tcp_connection* c, const char* data, 
700                                                         unsigned int size)
701 {
702         struct tcp_wbuffer_queue* q;
703         struct tcp_wbuffer* wb;
704         
705         q=&c->wbuf_q;
706         if (likely(q->first==0)) /* if empty, use wbufq_add */
707                 return _wbufq_add(c, data, size);
708         
709         if (unlikely((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max))){
710                 LM_ERR("(%d bytes): write queue full"
711                                         " (%d, total %d, last write %d s ago)\n",
712                                         size, q->queued, *tcp_total_wq,
713                                         TICKS_TO_S(get_ticks_raw()-q->wr_timeout-
714                                                                         cfg_get(tcp, tcp_cfg, send_timeout)));
715                 goto error;
716         }
717         if (unlikely(q->offset)){
718                 LM_CRIT("non-null offset %d (bad call, should"
719                                 "never be called after the wbufq_run())\n", q->offset);
720                 goto error;
721         }
722         if ((q->first==q->last) && ((q->last->b_size-q->last_used)>=size)){
723                 /* one block with enough space in it for size bytes */
724                 memmove(q->first->buf+size, q->first->buf, q->last_used);
725                 memcpy(q->first->buf, data, size);
726                 q->last_used+=size;
727         }else{
728                 /* create a size bytes block directly */
729                 wb=shm_malloc(sizeof(*wb)+size-1);
730                 if (unlikely(wb==0)) {
731                         SHM_MEM_ERROR;
732                         goto error;
733                 }
734                 wb->b_size=size;
735                 /* insert it */
736                 wb->next=q->first;
737                 q->first=wb;
738                 memcpy(wb->buf, data, size);
739         }
740         
741         q->queued+=size;
742         atomic_add_int((int*)tcp_total_wq, size);
743         return 0;
744 error:
745         return -1;
746 }
747
748
749
750 /* unsafe version, call while holding the connection write lock */
751 inline static void _wbufq_destroy( struct  tcp_wbuffer_queue* q)
752 {
753         struct tcp_wbuffer* wb;
754         struct tcp_wbuffer* next_wb;
755         int unqueued;
756         
757         unqueued=0;
758         if (likely(q->first)){
759                 wb=q->first;
760                 do{
761                         next_wb=wb->next;
762                         unqueued+=(wb==q->last)?q->last_used:wb->b_size;
763                         if (wb==q->first)
764                                 unqueued-=q->offset;
765                         shm_free(wb);
766                         wb=next_wb;
767                 }while(wb);
768         }
769         memset(q, 0, sizeof(*q));
770         atomic_add_int((int*)tcp_total_wq, -unqueued);
771 }
772
773
774
775 /* tries to empty the queue  (safe version, c->write_lock must not be hold)
776  * returns -1 on error, bytes written on success (>=0) 
777  * if the whole queue is emptied => sets *empty*/
778 inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
779 {
780         struct tcp_wbuffer_queue* q;
781         struct tcp_wbuffer* wb;
782         int n;
783         int ret;
784         int block_size;
785         char* buf;
786         
787         *empty=0;
788         ret=0;
789         lock_get(&c->write_lock);
790         q=&c->wbuf_q;
791         while(q->first){
792                 block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
793                                                 q->offset;
794                 buf=q->first->buf+q->offset;
795                 n=_tcpconn_write_nb(fd, c, buf, block_size);
796                 if (likely(n>0)){
797                         ret+=n;
798                         if (likely(n==block_size)){
799                                 wb=q->first;
800                                 q->first=q->first->next; 
801                                 shm_free(wb);
802                                 q->offset=0;
803                                 q->queued-=block_size;
804                                 atomic_add_int((int*)tcp_total_wq, -block_size);
805                         }else{
806                                 q->offset+=n;
807                                 q->queued-=n;
808                                 atomic_add_int((int*)tcp_total_wq, -n);
809                                 break;
810                         }
811                 }else{
812                         if (n<0){
813                                 /* EINTR is handled inside _tcpconn_write_nb */
814                                 if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
815                                         if (unlikely(c->state==S_CONN_CONNECT)){
816                                                 switch(errno){
817                                                         case ENETUNREACH:
818                                                         case EHOSTUNREACH: /* not posix for send() */
819 #ifdef USE_DST_BLACKLIST
820                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
821                                                                                                         c->rcv.proto,
822                                                                                                         &c->rcv.src_su,
823                                                                                                         &c->send_flags, 0);
824 #endif /* USE_DST_BLACKLIST */
825                                                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
826                                                                                                         TCP_LPORT(c), TCP_PSU(c),
827                                                                                                         TCP_PROTO(c));
828                                                                 break;
829                                                         case ECONNREFUSED:
830                                                         case ECONNRESET:
831 #ifdef USE_DST_BLACKLIST
832                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
833                                                                                                         c->rcv.proto,
834                                                                                                         &c->rcv.src_su,
835                                                                                                         &c->send_flags, 0);
836 #endif /* USE_DST_BLACKLIST */
837                                                                 TCP_EV_CONNECT_RST(0, TCP_LADDR(c),
838                                                                                                         TCP_LPORT(c), TCP_PSU(c),
839                                                                                                         TCP_PROTO(c));
840                                                                 break;
841                                                         default:
842                                                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
843                                                                                                         TCP_LPORT(c), TCP_PSU(c),
844                                                                                                         TCP_PROTO(c));
845                                                 }
846                                                 TCP_STATS_CONNECT_FAILED();
847                                         }else{
848                                                 switch(errno){
849                                                         case ECONNREFUSED:
850                                                         case ECONNRESET:
851                                                                 TCP_STATS_CON_RESET();
852                                                                 /* no break */
853                                                         case ENETUNREACH:
854                                                         case EHOSTUNREACH: /* not posix for send() */
855 #ifdef USE_DST_BLACKLIST
856                                                                 dst_blacklist_su(BLST_ERR_SEND,
857                                                                                                         c->rcv.proto,
858                                                                                                         &c->rcv.src_su,
859                                                                                                         &c->send_flags, 0);
860 #endif /* USE_DST_BLACKLIST */
861                                                                 break;
862                                                 }
863                                         }
864                                         ret=-1;
865                                         LM_ERR("%s [%d]\n", strerror(errno), errno);
866                                 }
867                         }
868                         break;
869                 }
870         }
871         if (likely(q->first==0)){
872                 q->last=0;
873                 q->last_used=0;
874                 q->offset=0;
875                 *empty=1;
876         }
877         lock_release(&c->write_lock);
878         if (likely(ret>0)){
879                 q->wr_timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, send_timeout);
880                 if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
881                         TCP_STATS_ESTABLISHED(c->state);
882                         c->state=S_CONN_OK;
883                 }
884         }
885         return ret;
886 }
887
888 #endif /* TCP_ASYNC */
889
890
891
892 #if 0
893 /* blocking write even on non-blocking sockets 
894  * if TCP_TIMEOUT will return with error */
895 static int tcp_blocking_write(struct tcp_connection* c, int fd, char* buf,
896                                                                 unsigned int len)
897 {
898         int n;
899         fd_set sel_set;
900         struct timeval timeout;
901         int ticks;
902         int initial_len;
903         
904         initial_len=len;
905 again:
906         
907         n=send(fd, buf, len,
908 #ifdef HAVE_MSG_NOSIGNAL
909                         MSG_NOSIGNAL
910 #else
911                         0
912 #endif
913                 );
914         if (n<0){
915                 if (errno==EINTR)       goto again;
916                 else if (errno!=EAGAIN && errno!=EWOULDBLOCK){
917                         LM_ERR("failed to send: (%d) %s\n", errno, strerror(errno));
918                         TCP_EV_SEND_TIMEOUT(errno, &c->rcv);
919                         TCP_STATS_SEND_TIMEOUT();
920                         goto error;
921                 }
922         }else if (n<len){
923                 /* partial write */
924                 buf+=n;
925                 len-=n;
926         }else{
927                 /* success: full write */
928                 goto end;
929         }
930         while(1){
931                 FD_ZERO(&sel_set);
932                 FD_SET(fd, &sel_set);
933                 timeout.tv_sec=tcp_send_timeout;
934                 timeout.tv_usec=0;
935                 ticks=get_ticks();
936                 n=select(fd+1, 0, &sel_set, 0, &timeout);
937                 if (n<0){
938                         if (errno==EINTR) continue; /* signal, ignore */
939                         LM_ERR("select failed: (%d) %s\n", errno, strerror(errno));
940                         goto error;
941                 }else if (n==0){
942                         /* timeout */
943                         if (get_ticks()-ticks>=tcp_send_timeout){
944                                 LM_ERR("send timeout (%d)\n", tcp_send_timeout);
945                                 goto error;
946                         }
947                         continue;
948                 }
949                 if (FD_ISSET(fd, &sel_set)){
950                         /* we can write again */
951                         goto again;
952                 }
953         }
954 error:
955                 return -1;
956 end:
957                 return initial_len;
958 }
959 #endif
960
961 /* Attempt to extract real connection information from an upstream load
962  * balancer or reverse proxy. This should be called right after accept()ing the
963  * connection, and before TLS negotiation.
964  *
965  * Returns:
966  *    -1 on parsing error (connection should be closed)
967  *    0 on parser success, and connection information was extracted
968  *    1 on parser success, but no connection information was provided by the
969  *      upstream load balancer or reverse proxy.
970  */
971 int tcpconn_read_haproxy(struct tcp_connection *c) {
972         int bytes, retval = 0;
973         uint32_t size, port;
974         char *p, *end;
975         struct ip_addr *src_ip, *dst_ip;
976
977         const char v2sig[12] = "\x0D\x0A\x0D\x0A\x00\x0D\x0A\x51\x55\x49\x54\x0A";
978
979         // proxy header union
980         union {
981                 // v1 struct
982                 struct {
983                         char line[108];
984                 } v1;
985
986                 // v2 struct
987                 struct {
988                         uint8_t sig[12];
989                         uint8_t ver_cmd;
990                         uint8_t fam;
991                         uint16_t len;
992
993                         union {
994                                 struct { /* for TCP/UDP over IPv4, len = 12 */
995                                         uint32_t src_addr;
996                                         uint32_t dst_addr;
997                                         uint16_t src_port;
998                                         uint16_t dst_port;
999                                 } ip4;
1000
1001                                 struct { /* for TCP/UDP over IPv6, len = 36 */
1002                                          uint8_t  src_addr[16];
1003                                          uint8_t  dst_addr[16];
1004                                          uint16_t src_port;
1005                                          uint16_t dst_port;
1006                                 } ip6;
1007
1008                                 struct { /* for AF_UNIX sockets, len = 216 */
1009                                          uint8_t src_addr[108];
1010                                          uint8_t dst_addr[108];
1011                                 } unx;
1012                         } addr;
1013                 } v2;
1014
1015         } hdr;
1016
1017         do {
1018                 bytes = recv(c->s, &hdr, sizeof(hdr), MSG_PEEK);
1019         } while (bytes == -1 && (errno == EINTR || errno == EAGAIN));
1020
1021         src_ip = &c->rcv.src_ip;
1022         dst_ip = &c->rcv.dst_ip;
1023
1024         if (bytes >= 16 && memcmp(&hdr.v2, v2sig, 12) == 0 &&
1025                 (hdr.v2.ver_cmd & 0xF0) == 0x20) {
1026                 LM_DBG("received PROXY protocol v2 header\n");
1027                 size = 16 + ntohs(hdr.v2.len);
1028
1029                 if (bytes < size) {
1030                         return -1; /* truncated or too large header */
1031                 }
1032
1033                 switch (hdr.v2.ver_cmd & 0xF) {
1034                         case 0x01: /* PROXY command */
1035                                 switch (hdr.v2.fam) {
1036                                         case 0x11: /* TCPv4 */
1037                                                 src_ip->af = AF_INET;
1038                                                 src_ip->len = 4;
1039                                                 src_ip->u.addr32[0] =
1040                                                         hdr.v2.addr.ip4.src_addr;
1041                                                 c->rcv.src_port =
1042                                                         hdr.v2.addr.ip4.src_port;
1043
1044                                                 dst_ip->af = AF_INET;
1045                                                 dst_ip->len = 4;
1046                                                 dst_ip->u.addr32[0] =
1047                                                         hdr.v2.addr.ip4.dst_addr;
1048                                                 c->rcv.dst_port =
1049                                                         hdr.v2.addr.ip4.dst_port;
1050
1051                                                 goto done;
1052
1053                                         case 0x21: /* TCPv6 */
1054                                                 src_ip->af = AF_INET6;
1055                                                 src_ip->len = 16;
1056                                                 memcpy(src_ip->u.addr,
1057                                                         hdr.v2.addr.ip6.src_addr, 16);
1058                                                 c->rcv.src_port =
1059                                                         hdr.v2.addr.ip6.src_port;
1060
1061                                                 dst_ip->af = AF_INET6;
1062                                                 dst_ip->len = 16;
1063                                                 memcpy(dst_ip->u.addr,
1064                                                         hdr.v2.addr.ip6.src_addr, 16);
1065                                                 c->rcv.dst_port =
1066                                                         hdr.v2.addr.ip6.dst_port;
1067
1068                                                 goto done;
1069
1070                                         default: /* unsupported protocol */
1071                                                 return -1;
1072                                 }
1073
1074                         case 0x00: /* LOCAL command */
1075                                 retval = 1; /* keep local connection address for LOCAL */
1076                                 goto done;
1077
1078                         default:
1079                                 return -1; /* not a supported command */
1080                 }
1081         }
1082         else if (bytes >= 8 && memcmp(hdr.v1.line, "PROXY", 5) == 0) {
1083                 LM_DBG("received PROXY protocol v1 header\n");
1084                 end = memchr(hdr.v1.line, '\r', bytes - 1);
1085                 if (!end || end[1] != '\n') {
1086                         return -1; /* partial or invalid header */
1087                 }
1088                 *end = '\0'; /* terminate the string to ease parsing */
1089                 size = end + 2 - hdr.v1.line;
1090                 p = hdr.v1.line + 5;
1091
1092                 if (strncmp(p, " TCP", 4) == 0) {
1093                         switch (p[4]) {
1094                                 case '4':
1095                                         src_ip->af  = dst_ip->af  = AF_INET;
1096                                         src_ip->len = dst_ip->len = 4;
1097                                         break;
1098                                 case '6':
1099                                         src_ip->af  = dst_ip->af  = AF_INET6;
1100                                         src_ip->len = dst_ip->len = 16;
1101                                         break;
1102                                 default:
1103                                         return -1; /* unknown TCP version */
1104                         }
1105
1106                         if (p[5] != ' ') {
1107                                 return -1; /* misformatted header */
1108                         }
1109                         p += 6; /* skip over the already-parsed bytes */
1110
1111                         /* Parse the source IP address */
1112                         end = strchr(p, ' ');
1113                         if (!end) {
1114                                 return -1; /* truncated header */
1115                         }
1116                         *end = '\0'; /* mark the end of the IP address */
1117                         if (inet_pton(src_ip->af, p, src_ip->u.addr) != 1) {
1118                                 return -1; /* missing IP address */
1119                         }
1120                         p = end + 1;
1121
1122                         /* Parse the destination IP address */
1123                         end = strchr(p, ' ');
1124                         if (!end) {
1125                                 return -1;
1126                         }
1127                         *end = '\0'; /* mark the end of the IP address */
1128                         if (inet_pton(dst_ip->af, p, dst_ip->u.addr) != 1) {
1129                                 return -1;
1130                         }
1131                         p = end + 1;
1132
1133                         /* Parse the source port */
1134                         port = strtoul(p, &end, 10);
1135                         if (port == UINT32_MAX || port == 0 || port >= (1 << 16)) {
1136                                 return -1; /* invalid port number */
1137                         }
1138                         c->rcv.src_port = port;
1139
1140                         if (*end != ' ') {
1141                                 return -1; /* invalid header */
1142                         }
1143                         p = end + 1;
1144
1145                         /* Parse the destination port */
1146                         port = strtoul(p, NULL, 10);
1147                         if (port == UINT32_MAX || port == 0 || port >= (1 << 16)) {
1148                                 return -1; /* invalid port number */
1149                         }
1150                         c->rcv.dst_port = port;
1151
1152                         goto done;
1153                 }
1154                 else if (strncmp(p, " UNKNOWN", 8) == 0) {
1155                         /* We know that the sender speaks the correct PROXY protocol with the
1156                          * appropriate version, and we SHOULD accept the connection and use the
1157                          * real connection's parameters as if there were no PROXY protocol header
1158                          * on the wire.
1159                          */
1160                         retval = 1; /* PROXY protocol parsed, but no IP override */
1161                         goto done;
1162                 }
1163                 else {
1164                         return -1; /* invalid header */
1165                 }
1166         } else if (bytes == 0) {
1167                 return 1; /* EOF? Return "no IP change" in any case */
1168         }
1169         else {
1170                 /* Wrong protocol */
1171                 return -1;
1172         }
1173
1174 done:
1175         /* we need to consume the appropriate amount of data from the socket */
1176         do {
1177                 bytes = recv(c->s, &hdr, size, 0);
1178         } while (bytes == -1 && errno == EINTR);
1179
1180         return (bytes >= 0) ? retval : -1;
1181 }
1182
1183 struct tcp_connection* tcpconn_new(int sock, union sockaddr_union* su,
1184                                                                         union sockaddr_union* local_addr,
1185                                                                         struct socket_info* ba, int type,
1186                                                                         int state)
1187 {
1188         struct tcp_connection *c;
1189         int rd_b_size, ret;
1190
1191         rd_b_size=cfg_get(tcp, tcp_cfg, rd_buf_size);
1192         c=shm_malloc(sizeof(struct tcp_connection) + rd_b_size);
1193         if (c==0){
1194                 SHM_MEM_ERROR;
1195                 goto error;
1196         }
1197         memset(c, 0, sizeof(struct tcp_connection)); /* zero init (skip rd buf)*/
1198         c->s=sock;
1199         c->fd=-1; /* not initialized */
1200         if (lock_init(&c->write_lock)==0){
1201                 LM_ERR("init lock failed\n");
1202                 goto error;
1203         }
1204
1205         c->rcv.src_su=*su;
1206
1207         atomic_set(&c->refcnt, 0);
1208         local_timer_init(&c->timer, tcpconn_main_timeout, c, 0);
1209         if (unlikely(ksr_tcp_accept_haproxy && state == S_CONN_ACCEPT)) {
1210                 ret = tcpconn_read_haproxy(c);
1211
1212                 if (ret == -1) {
1213                         LM_ERR("invalid PROXY protocol header\n");
1214                         goto error;
1215                 } else if (ret == 1) {
1216                         LM_DBG("PROXY protocol did not override IP addresses\n");
1217                         goto read_ip_info;
1218                 }
1219         } else {
1220 read_ip_info:
1221                 su2ip_addr(&c->rcv.src_ip, su);
1222                 c->rcv.src_port=su_getport(su);
1223                 if (likely(local_addr)){
1224                         su2ip_addr(&c->rcv.dst_ip, local_addr);
1225                         c->rcv.dst_port=su_getport(local_addr);
1226                 }else if (ba){
1227                         c->rcv.dst_ip=ba->address;
1228                         c->rcv.dst_port=ba->port_no;
1229                 }
1230         }
1231         c->rcv.bind_address=ba;
1232         print_ip("tcpconn_new: new tcp connection: ", &c->rcv.src_ip, "\n");
1233         LM_DBG("on port %d, type %d\n", c->rcv.src_port, type);
1234         init_tcp_req(&c->req, (char*)c+sizeof(struct tcp_connection), rd_b_size);
1235         c->id=(*connection_id)++;
1236         c->rcv.proto_reserved1=0; /* this will be filled before receive_message*/
1237         c->rcv.proto_reserved2=0;
1238         c->state=state;
1239         c->extra_data=0;
1240 #ifdef USE_TLS
1241         if (type==PROTO_TLS){
1242                 if (tls_tcpconn_init(c, sock)==-1) goto error;
1243         }else
1244 #endif /* USE_TLS*/
1245         {
1246                 c->type=PROTO_TCP;
1247                 c->rcv.proto=PROTO_TCP;
1248                 c->timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, con_lifetime);
1249                 c->lifetime = cfg_get(tcp, tcp_cfg, con_lifetime);
1250         }
1251
1252         return c;
1253
1254 error:
1255         if (c) shm_free(c);
1256         return 0;
1257 }
1258
1259
1260
1261 /* do the actual connect, set sock. options a.s.o
1262  * returns socket on success, -1 on error
1263  * sets also *res_local_addr, res_si and state (S_CONN_CONNECT for an
1264  * unfinished connect and S_CONN_OK for a finished one)*/
1265 inline static int tcp_do_connect(       union sockaddr_union* server,
1266                                                                         union sockaddr_union* from,
1267                                                                         int type,
1268                                                                         snd_flags_t* send_flags,
1269                                                                         union sockaddr_union* res_local_addr,
1270                                                                         struct socket_info** res_si,
1271                                                                         enum tcp_conn_states *state
1272                                                                         )
1273 {
1274         int s;
1275         union sockaddr_union my_name;
1276         socklen_t my_name_len;
1277         struct ip_addr ip;
1278 #ifdef TCP_ASYNC
1279         int n;
1280 #endif /* TCP_ASYNC */
1281
1282         s=socket(AF2PF(server->s.sa_family), SOCK_STREAM, 0);
1283         if (unlikely(s==-1)){
1284                 LM_ERR("%s: socket: (%d) %s\n",
1285                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1286                 goto error;
1287         }
1288         if (init_sock_opt(s, server->s.sa_family)<0){
1289                 LM_ERR("%s: init_sock_opt failed\n",
1290                                         su2a(server, sizeof(*server)));
1291                 goto error;
1292         }
1293         
1294         if (unlikely(from && bind(s, &from->s, sockaddru_len(*from)) != 0)){
1295                 LM_WARN("binding to source address %s failed: %s [%d]\n",
1296                                         su2a(from, sizeof(*from)),
1297                                         strerror(errno), errno);
1298         }
1299         *state=S_CONN_OK;
1300 #ifdef TCP_ASYNC
1301         if (likely(cfg_get(tcp, tcp_cfg, async))){
1302 again:
1303                 n=connect(s, &server->s, sockaddru_len(*server));
1304                 if (likely(n==-1)){ /*non-blocking => most probable EINPROGRESS*/
1305                         if (likely(errno==EINPROGRESS))
1306                                 *state=S_CONN_CONNECT;
1307                         else if (errno==EINTR) goto again;
1308                         else if (errno!=EALREADY){
1309                                 switch(errno){
1310                                         case ENETUNREACH:
1311                                         case EHOSTUNREACH:
1312 #ifdef USE_DST_BLACKLIST
1313                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1314                                                                                         send_flags, 0);
1315 #endif /* USE_DST_BLACKLIST */
1316                                                 TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0, server, type);
1317                                                 break;
1318                                         case ETIMEDOUT:
1319 #ifdef USE_DST_BLACKLIST
1320                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1321                                                                                         send_flags, 0);
1322 #endif /* USE_DST_BLACKLIST */
1323                                                 TCP_EV_CONNECT_TIMEOUT(errno, 0, 0, server, type);
1324                                                 break;
1325                                         case ECONNREFUSED:
1326                                         case ECONNRESET:
1327 #ifdef USE_DST_BLACKLIST
1328                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1329                                                                                         send_flags, 0);
1330 #endif /* USE_DST_BLACKLIST */
1331                                                 TCP_EV_CONNECT_RST(errno, 0, 0, server, type);
1332                                                 break;
1333                                         case EAGAIN:/* not posix, but supported on linux and bsd */
1334                                                 TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0, server,type);
1335                                                 break;
1336                                         default:
1337                                                 TCP_EV_CONNECT_ERR(errno, 0, 0, server, type);
1338                                 }
1339                                 TCP_STATS_CONNECT_FAILED();
1340                                 LM_ERR("connect %s: (%d) %s\n",
1341                                                         su2a(server, sizeof(*server)),
1342                                                         errno, strerror(errno));
1343                                 goto error;
1344                         }
1345                 }
1346         }else{
1347 #endif /* TCP_ASYNC */
1348                 if (tcp_blocking_connect(s, type,  send_flags, &server->s,
1349                                                                         sockaddru_len(*server))<0){
1350                         LM_ERR("tcp_blocking_connect %s failed\n",
1351                                                 su2a(server, sizeof(*server)));
1352                         goto error;
1353                 }
1354 #ifdef TCP_ASYNC
1355         }
1356 #endif /* TCP_ASYNC */
1357         if (from){
1358                 su2ip_addr(&ip, from);
1359                 if (!ip_addr_any(&ip))
1360                         /* we already know the source ip, skip the sys. call */
1361                         goto find_socket;
1362         }
1363         my_name_len=sizeof(my_name);
1364         if (unlikely(getsockname(s, &my_name.s, &my_name_len)!=0)){
1365                 LM_ERR("getsockname failed: %s(%d)\n", strerror(errno), errno);
1366                 *res_si=0;
1367                 goto error;
1368         }
1369         from=&my_name; /* update from with the real "from" address */
1370         su2ip_addr(&ip, &my_name);
1371 find_socket:
1372 #ifdef USE_TLS
1373         if (unlikely(type==PROTO_TLS))
1374                 *res_si=find_si(&ip, 0, PROTO_TLS);
1375         else
1376 #endif
1377                 *res_si=find_si(&ip, 0, PROTO_TCP);
1378         
1379         if (unlikely(*res_si==0)){
1380                 LM_WARN("%s: could not find corresponding"
1381                                 " listening socket for %s, using default...\n",
1382                                         su2a(server, sizeof(*server)), ip_addr2a(&ip));
1383                 if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1384                 else *res_si=sendipv6_tcp;
1385         }
1386         *res_local_addr=*from;
1387         return s;
1388 error:
1389         if (s!=-1) tcp_safe_close(s);
1390         return -1;
1391 }
1392
1393
1394
1395 struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
1396                                                                                 union sockaddr_union* from,
1397                                                                                 int type, snd_flags_t* send_flags)
1398 {
1399         int s;
1400         struct socket_info* si;
1401         union sockaddr_union my_name;
1402         struct tcp_connection* con;
1403         enum tcp_conn_states state;
1404
1405         s=-1;
1406
1407         if (*tcp_connections_no >= cfg_get(tcp, tcp_cfg, max_connections)){
1408                 LM_ERR("maximum number of connections exceeded (%d/%d)\n",
1409                                         *tcp_connections_no,
1410                                         cfg_get(tcp, tcp_cfg, max_connections));
1411                 goto error;
1412         }
1413         if (unlikely(type==PROTO_TLS)) {
1414                 if (*tls_connections_no >= cfg_get(tcp, tcp_cfg, max_tls_connections)){
1415                         LM_ERR("maximum number of tls connections"
1416                                                 " exceeded (%d/%d)\n",
1417                                                 *tls_connections_no,
1418                                                 cfg_get(tcp, tcp_cfg, max_tls_connections));
1419                         goto error;
1420                 }
1421         }
1422
1423         s=tcp_do_connect(server, from, type,  send_flags, &my_name, &si, &state);
1424         if (s==-1){
1425                 LM_ERR("tcp_do_connect %s: failed (%d) %s\n",
1426                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1427                 goto error;
1428         }
1429         con=tcpconn_new(s, server, &my_name, si, type, state);
1430         if (con==0){
1431                 LM_ERR("%s: tcpconn_new failed, closing the "
1432                                         " socket\n", su2a(server, sizeof(*server)));
1433                 goto error;
1434         }
1435         tcpconn_set_send_flags(con, *send_flags);
1436         return con;
1437 error:
1438         if (s!=-1) tcp_safe_close(s); /* close the opened socket */
1439         return 0;
1440 }
1441
1442
1443
1444 #ifdef TCP_CONNECT_WAIT
1445 int tcpconn_finish_connect( struct tcp_connection* c,
1446                                                                                                 union sockaddr_union* from)
1447 {
1448         int s;
1449         int r;
1450         union sockaddr_union local_addr;
1451         struct socket_info* si;
1452         enum tcp_conn_states state;
1453         struct tcp_conn_alias* a;
1454         int new_conn_alias_flags;
1455
1456         s=tcp_do_connect(&c->rcv.src_su, from, c->type, &c->send_flags,
1457                                                 &local_addr, &si, &state);
1458         if (unlikely(s==-1)){
1459                 LM_ERR("%s: tcp_do_connect for %p failed\n",
1460                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
1461                 return -1;
1462         }
1463         c->rcv.bind_address=si;
1464         su2ip_addr(&c->rcv.dst_ip, &local_addr);
1465         c->rcv.dst_port=su_getport(&local_addr);
1466         /* update aliases if needed */
1467         if (likely(from==0)){
1468                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1469                 /* add aliases */
1470                 TCPCONN_LOCK;
1471                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1472                                                                                                         new_conn_alias_flags);
1473                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1474                                                                         c->rcv.dst_port, new_conn_alias_flags);
1475                 TCPCONN_UNLOCK;
1476         }else if (su_cmp(from, &local_addr)!=1){
1477                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1478                 TCPCONN_LOCK;
1479                         /* remove all the aliases except the first one and re-add them
1480                          * (there shouldn't be more then the 3 default aliases at this
1481                          * stage) */
1482                         if (c->aliases > 1) {
1483                                 for (r=1; r<c->aliases; r++){
1484                                         a=&c->con_aliases[r];
1485                                         tcpconn_listrm(tcpconn_aliases_hash[a->hash],
1486                                                                         a, next, prev);
1487                                 }
1488                                 c->aliases=1;
1489                         }
1490                         /* add the local_ip:0 and local_ip:local_port aliases */
1491                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1492                                                                                                 0, new_conn_alias_flags);
1493                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1494                                                                         c->rcv.dst_port, new_conn_alias_flags);
1495                 TCPCONN_UNLOCK;
1496         }
1497
1498         return s;
1499 }
1500 #endif /* TCP_CONNECT_WAIT */
1501
1502
1503
1504 /* adds a tcp connection to the tcpconn hashes
1505  * Note: it's called _only_ from the tcp_main process */
1506 inline static struct tcp_connection*  tcpconn_add(struct tcp_connection *c)
1507 {
1508         struct ip_addr zero_ip;
1509         int new_conn_alias_flags;
1510
1511         if (likely(c)){
1512                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1513                 c->id_hash=tcp_id_hash(c->id);
1514                 c->aliases=0;
1515                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1516                 TCPCONN_LOCK;
1517                 c->flags|=F_CONN_HASHED;
1518                 /* add it at the begining of the list*/
1519                 tcpconn_listadd(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1520                 /* set the aliases */
1521                 /* first alias is for (peer_ip, peer_port, 0 ,0) -- for finding
1522                  *  any connection to peer_ip, peer_port
1523                  * the second alias is for (peer_ip, peer_port, local_addr, 0) -- for
1524                  *  finding any conenction to peer_ip, peer_port from local_addr 
1525                  * the third alias is for (peer_ip, peer_port, local_addr, local_port) 
1526                  *   -- for finding if a fully specified connection exists */
1527                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &zero_ip, 0,
1528                                                                                                         new_conn_alias_flags);
1529                 if (likely(c->rcv.dst_ip.af && ! ip_addr_any(&c->rcv.dst_ip))){
1530                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1531                                                                                                         new_conn_alias_flags);
1532                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1533                                                                         c->rcv.dst_port, new_conn_alias_flags);
1534                 }
1535                 /* ignore add_alias errors, there are some valid cases when one
1536                  *  of the add_alias would fail (e.g. first add_alias for 2 connections
1537                  *   with the same destination but different src. ip*/
1538                 TCPCONN_UNLOCK;
1539                 LM_DBG("hashes: %d:%d:%d, %d\n",
1540                                                                                                 c->con_aliases[0].hash,
1541                                                                                                 c->con_aliases[1].hash,
1542                                                                                                 c->con_aliases[2].hash,
1543                                                                                                 c->id_hash);
1544                 return c;
1545         }else{
1546                 LM_CRIT("null connection pointer\n");
1547                 return 0;
1548         }
1549 }
1550
1551
1552 static inline void _tcpconn_detach(struct tcp_connection *c)
1553 {
1554         int r;
1555         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1556         /* remove all the aliases */
1557         for (r=0; r<c->aliases; r++)
1558                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1559                                                 &c->con_aliases[r], next, prev);
1560         c->aliases = 0;
1561 }
1562
1563
1564
1565 static inline void _tcpconn_free(struct tcp_connection* c)
1566 {
1567 #ifdef TCP_ASYNC
1568         if (unlikely(_wbufq_non_empty(c)))
1569                 _wbufq_destroy(&c->wbuf_q);
1570 #endif
1571         lock_destroy(&c->write_lock);
1572 #ifdef USE_TLS
1573         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) tls_tcpconn_clean(c);
1574 #endif
1575         shm_free(c);
1576 }
1577
1578
1579
/* Lock-free ("unsafe") variant of tcpconn_rm(): takes no lock itself.
 * Unlinks c from the hashes and frees it; c must not be used afterwards. */
void _tcpconn_rm(struct tcp_connection* c)
{
	_tcpconn_detach(c);	/* out of the id and alias hashes */
	_tcpconn_free(c);	/* release its resources and the structure */
}
1586
1587
1588
1589 void tcpconn_rm(struct tcp_connection* c)
1590 {
1591         int r;
1592         TCPCONN_LOCK;
1593         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1594         /* remove all the aliases */
1595         for (r=0; r<c->aliases; r++)
1596                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1597                                                 &c->con_aliases[r], next, prev);
1598         c->aliases = 0;
1599         TCPCONN_UNLOCK;
1600         lock_destroy(&c->write_lock);
1601 #ifdef USE_TLS
1602         if ((c->type==PROTO_TLS || c->type==PROTO_WSS)&&(c->extra_data)) tls_tcpconn_clean(c);
1603 #endif
1604         shm_free(c);
1605 }
1606
1607
1608 /* finds a connection, if id=0 uses the ip addr, port, local_ip and local port
1609  *  (host byte order) and tries to find the connection that matches all of
1610  *   them. Wild cards can be used for local_ip and local_port (a 0 filled
1611  *   ip address and/or a 0 local port).
1612  * WARNING: unprotected (locks) use tcpconn_get unless you really
1613  * know what you are doing */
1614 struct tcp_connection* _tcpconn_find(int id, struct ip_addr* ip, int port,
1615                                                                                 struct ip_addr* l_ip, int l_port)
1616 {
1617
1618         struct tcp_connection *c;
1619         struct tcp_conn_alias* a;
1620         unsigned hash;
1621         int is_local_ip_any;
1622         
1623 #ifdef EXTRA_DEBUG
1624         LM_DBG("%d  port %d\n",id, port);
1625         if (ip) print_ip("tcpconn_find: ip ", ip, "\n");
1626 #endif
1627         if (likely(id)){
1628                 hash=tcp_id_hash(id);
1629                 for (c=tcpconn_id_hash[hash]; c; c=c->id_next){
1630 #ifdef EXTRA_DEBUG
1631                         LM_DBG("c=%p, c->id=%d, port=%d\n", c, c->id, c->rcv.src_port);
1632                         print_ip("ip=", &c->rcv.src_ip, "\n");
1633 #endif
1634                         if ((id==c->id)&&(c->state!=S_CONN_BAD)) return c;
1635                 }
1636         }else if (likely(ip)){
1637                 hash=tcp_addr_hash(ip, port, l_ip, l_port);
1638                 is_local_ip_any=ip_addr_any(l_ip);
1639                 for (a=tcpconn_aliases_hash[hash]; a; a=a->next){
1640 #ifdef EXTRA_DEBUG
1641                         LM_DBG("a=%p, c=%p, c->id=%d, alias port= %d port=%d\n", a, a->parent,
1642                                         a->parent->id, a->port, a->parent->rcv.src_port);
1643                         print_ip("ip=",&a->parent->rcv.src_ip,"\n");
1644 #endif
1645                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1646                                         ((l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1647                                         (ip_addr_cmp(ip, &a->parent->rcv.src_ip)) &&
1648                                         (is_local_ip_any ||
1649                                                 ip_addr_cmp(l_ip, &a->parent->rcv.dst_ip))
1650                                 )
1651                                 return a->parent;
1652                 }
1653         }
1654         return 0;
1655 }
1656
1657
1658
1659 /* _tcpconn_find with locks and timeout
1660  * local_addr contains the desired local ip:port. If null any local address 
1661  * will be used.  IN*ADDR_ANY or 0 port are wild cards.
1662  * If found, the connection's reference counter will be incremented, you might
1663  * want to decrement it after use.
1664  */
1665 struct tcp_connection* tcpconn_get(int id, struct ip_addr* ip, int port,
1666                                                                         union sockaddr_union* local_addr,
1667                                                                         ticks_t timeout)
1668 {
1669         struct tcp_connection* c;
1670         struct ip_addr local_ip;
1671         int local_port;
1672         
1673         local_port=0;
1674         if (likely(ip)){
1675                 if (unlikely(local_addr)){
1676                         su2ip_addr(&local_ip, local_addr);
1677                         local_port=su_getport(local_addr);
1678                 }else{
1679                         ip_addr_mk_any(ip->af, &local_ip);
1680                         local_port=0;
1681                 }
1682         }
1683         TCPCONN_LOCK;
1684         c=_tcpconn_find(id, ip, port, &local_ip, local_port);
1685         if (likely(c)){ 
1686                         atomic_inc(&c->refcnt);
1687                         /* update the timeout only if the connection is not handled
1688                          * by a tcp reader _and_the timeout is non-zero  (the tcp
1689                          * reader process uses c->timeout for its own internal
1690                          * timeout and c->timeout will be overwritten * anyway on
1691                          * return to tcp_main) */
1692                         if (likely(c->reader_pid==0 && timeout != 0))
1693                                 c->timeout=get_ticks_raw()+timeout;
1694         }
1695         TCPCONN_UNLOCK;
1696         return c;
1697 }
1698
1699
1700
1701 /* add c->dst:port, local_addr as an alias for the "id" connection, 
1702  * flags: TCP_ALIAS_FORCE_ADD  - add an alias even if a previous one exists
1703  *        TCP_ALIAS_REPLACE    - if a prev. alias exists, replace it with the
1704  *                                new one
1705  * returns 0 on success, <0 on failure ( -1  - null c, -2 too many aliases,
1706  *  -3 alias already present and pointing to another connection)
1707  * WARNING: must be called with TCPCONN_LOCK held */
1708 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
1709                                                                                 struct ip_addr* l_ip, int l_port,
1710                                                                                 int flags)
1711 {
1712         unsigned hash;
1713         struct tcp_conn_alias* a;
1714         struct tcp_conn_alias* nxt;
1715         struct tcp_connection* p;
1716         int is_local_ip_any;
1717         int i;
1718         int r;
1719         
1720         a=0;
1721         is_local_ip_any=ip_addr_any(l_ip);
1722         if (likely(c)){
1723                 hash=tcp_addr_hash(&c->rcv.src_ip, port, l_ip, l_port);
1724                 /* search the aliases for an already existing one */
1725                 for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
1726                         nxt=a->next;
1727                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1728                                         ( (l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1729                                         (ip_addr_cmp(&c->rcv.src_ip, &a->parent->rcv.src_ip)) &&
1730                                         ( is_local_ip_any || 
1731                                           ip_addr_cmp(&a->parent->rcv.dst_ip, l_ip))
1732                                         ){
1733                                 /* found */
1734                                 if (unlikely(a->parent!=c)){
1735                                         if (flags & TCP_ALIAS_FORCE_ADD)
1736                                                 /* still have to walk the whole list to check if
1737                                                  * the alias was not already added */
1738                                                 continue;
1739                                         else if (flags & TCP_ALIAS_REPLACE){
1740                                                 /* remove the alias =>
1741                                                  * remove the current alias and all the following
1742                                                  *  ones from the corresponding connection, shift the 
1743                                                  *  connection aliases array and re-add the other 
1744                                                  *  aliases (!= current one) */
1745                                                 p=a->parent;
1746                                                 for (i=0; (i<p->aliases) && (&(p->con_aliases[i])!=a);
1747                                                                 i++);
1748                                                 if (unlikely(i==p->aliases)){
1749                                                         LM_CRIT("alias %p not found in con %p (id %d)\n",
1750                                                                         a, p, p->id);
1751                                                         goto error_not_found;
1752                                                 }
1753                                                 for (r=i; r<p->aliases; r++){
1754                                                         tcpconn_listrm(
1755                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash],
1756                                                                 &p->con_aliases[r], next, prev);
1757                                                 }
1758                                                 if (likely((i+1)<p->aliases)){
1759                                                         memmove(&p->con_aliases[i], &p->con_aliases[i+1],
1760                                                                                         (p->aliases-i-1)*
1761                                                                                                 sizeof(p->con_aliases[0]));
1762                                                 }
1763                                                 p->aliases--;
1764                                                 /* re-add the remaining aliases */
1765                                                 for (r=i; r<p->aliases; r++){
1766                                                         tcpconn_listadd(
1767                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash], 
1768                                                                 &p->con_aliases[r], next, prev);
1769                                                 }
1770                                         }else
1771                                                 goto error_sec;
1772                                 }else goto ok;
1773                         }
1774                 }
1775                 if (unlikely(c->aliases>=TCP_CON_MAX_ALIASES)) goto error_aliases;
1776                 c->con_aliases[c->aliases].parent=c;
1777                 c->con_aliases[c->aliases].port=port;
1778                 c->con_aliases[c->aliases].hash=hash;
1779                 tcpconn_listadd(tcpconn_aliases_hash[hash], 
1780                                                                 &c->con_aliases[c->aliases], next, prev);
1781                 c->aliases++;
1782         }else goto error_not_found;
1783 ok:
1784 #ifdef EXTRA_DEBUG
1785         if (a) LM_DBG("alias already present\n");
1786         else   LM_DBG("alias port %d for hash %d, id %d\n",
1787                         port, hash, c->id);
1788 #endif
1789         return 0;
1790 error_aliases:
1791         /* too many aliases */
1792         return -2;
1793 error_not_found:
1794         /* null connection */
1795         return -1;
1796 error_sec:
1797         /* alias already present and pointing to a different connection
1798          * (hijack attempt?) */
1799         return -3;
1800 }
1801
1802
1803
1804 /* add port as an alias for the "id" connection, 
1805  * returns 0 on success,-1 on failure */
1806 int tcpconn_add_alias(int id, int port, int proto)
1807 {
1808         struct tcp_connection* c;
1809         int ret;
1810         struct ip_addr zero_ip;
1811         int r;
1812         int alias_flags;
1813         
1814         /* fix the port */
1815         port=port?port:((proto==PROTO_TLS)?SIPS_PORT:SIP_PORT);
1816         TCPCONN_LOCK;
1817         /* check if alias already exists */
1818         c=_tcpconn_find(id, 0, 0, 0, 0);
1819         if (likely(c)){
1820                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1821                 alias_flags=cfg_get(tcp, tcp_cfg, alias_flags);
1822                 /* alias src_ip:port, 0, 0 */
1823                 ret=_tcpconn_add_alias_unsafe(c, port,  &zero_ip, 0, 
1824                                                                                 alias_flags);
1825                 if (ret<0 && ret!=-3) goto error;
1826                 /* alias src_ip:port, local_ip, 0 */
1827                 ret=_tcpconn_add_alias_unsafe(c, port,  &c->rcv.dst_ip, 0, 
1828                                                                                 alias_flags);
1829                 if (ret<0 && ret!=-3) goto error;
1830                 /* alias src_ip:port, local_ip, local_port */
1831                 ret=_tcpconn_add_alias_unsafe(c, port, &c->rcv.dst_ip, c->rcv.dst_port,
1832                                                                                 alias_flags);
1833                 if (unlikely(ret<0)) goto error;
1834         }else goto error_not_found;
1835         TCPCONN_UNLOCK;
1836         return 0;
1837 error_not_found:
1838         TCPCONN_UNLOCK;
1839         LM_ERR("no connection found for id %d\n",id);
1840         return -1;
1841 error:
1842         TCPCONN_UNLOCK;
1843         switch(ret){
1844                 case -2:
1845                         LM_ERR("too many aliases (%d) for connection %p (id %d) %s:%d <- %d\n",
1846                                         c->aliases, c, c->id, ip_addr2a(&c->rcv.src_ip),
1847                                         c->rcv.src_port, port);
1848                         for (r=0; r<c->aliases; r++){
1849                                 LM_ERR("alias %d: for %p (%d) %s:%d <-%d hash %x\n",  r, c, c->id, 
1850                                                 ip_addr2a(&c->rcv.src_ip), c->rcv.src_port, 
1851                                                 c->con_aliases[r].port, c->con_aliases[r].hash);
1852                         }
1853                         break;
1854                 case -3:
1855                         LM_ERR("possible port hijack attempt\n");
1856                         LM_ERR("alias for %d port %d already"
1857                                                 " present and points to another connection \n",
1858                                                 c->id, port);
1859                         break;
1860                 default:
1861                         LM_ERR("unknown error %d\n", ret);
1862         }
1863         return -1;
1864 }
1865
1866
1867
1868 #ifdef TCP_FD_CACHE
1869
1870 static void tcp_fd_cache_init(void)
1871 {
1872         int r;
1873         for (r=0; r<TCP_FD_CACHE_SIZE; r++)
1874                 fd_cache[r].fd=-1;
1875 }
1876
1877
1878 inline static struct fd_cache_entry* tcp_fd_cache_get(struct tcp_connection *c)
1879 {
1880         int h;
1881         
1882         h=c->id%TCP_FD_CACHE_SIZE;
1883         if ((fd_cache[h].fd>0) && (fd_cache[h].id==c->id) && (fd_cache[h].con==c))
1884                 return &fd_cache[h];
1885         return 0;
1886 }
1887
1888
1889 inline static void tcp_fd_cache_rm(struct fd_cache_entry* e)
1890 {
1891         e->fd=-1;
1892 }
1893
1894
1895 inline static void tcp_fd_cache_add(struct tcp_connection *c, int fd)
1896 {
1897         int h;
1898         
1899         h=c->id%TCP_FD_CACHE_SIZE;
1900         if (likely(fd_cache[h].fd>0))
1901                 tcp_safe_close(fd_cache[h].fd);
1902         fd_cache[h].fd=fd;
1903         fd_cache[h].id=c->id;
1904         fd_cache[h].con=c;
1905 }
1906
1907 #endif /* TCP_FD_CACHE */
1908
1909
1910
1911 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn);
1912
1913 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
1914                                                         unsigned len, snd_flags_t send_flags);
1915 static int tcpconn_do_send(int fd, struct tcp_connection* c,
1916                                                         const char* buf, unsigned len,
1917                                                         snd_flags_t send_flags, long* resp, int locked);
1918
1919 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
1920                                                         const char* buf, unsigned len,
1921                                                         snd_flags_t send_flags, long* resp, int locked);
1922
1923 /* finds a tcpconn & sends on it
1924  * uses the dst members to, proto (TCP|TLS) and id and tries to send
1925  *  from the "from" address (if non null and id==0)
1926  * returns: number of bytes written (>=0) on success
1927  *          <0 on error */
1928 int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1929                                         const char* buf, unsigned len)
1930 {
1931         struct tcp_connection *c;
1932         struct ip_addr ip;
1933         int port;
1934         int fd;
1935         long response[2];
1936         int n;
1937         ticks_t con_lifetime;
1938 #ifdef USE_TLS
1939         const char* rest_buf;
1940         const char* t_buf;
1941         unsigned rest_len, t_len;
1942         long resp;
1943         snd_flags_t t_send_flags;
1944 #endif /* USE_TLS */
1945
1946         port=su_getport(&dst->to);
1947         con_lifetime=cfg_get(tcp, tcp_cfg, con_lifetime);
1948         if (likely(port)){
1949                 su2ip_addr(&ip, &dst->to);
1950                 c=tcpconn_get(dst->id, &ip, port, from, con_lifetime);
1951         }else if (likely(dst->id)){
1952                 c=tcpconn_get(dst->id, 0, 0, 0, con_lifetime);
1953         }else{
1954                 LM_CRIT("null id & to\n");
1955                 return -1;
1956         }
1957
1958         if (likely(dst->id)){
1959                 if (unlikely(c==0)) {
1960                         if (likely(port)){
1961                                 /* try again w/o id */
1962                                 c=tcpconn_get(0, &ip, port, from, con_lifetime);
1963                         }else{
1964                                 LM_ERR("id %d not found, dropping\n", dst->id);
1965                                 return -1;
1966                         }
1967                 }
1968         }
1969         /* connection not found or unusable => open a new one and send on it */
1970         if (unlikely((c==0) || tcpconn_close_after_send(c))){
1971                 if (unlikely(c)){
1972                         /* can't use c if it's marked as close-after-send  =>
1973                          * release it and try opening new one */
1974                         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
1975                         c=0;
1976                 }
1977                 /* check if connect() is disabled */
1978                 if (unlikely((dst->send_flags.f & SND_F_FORCE_CON_REUSE) ||
1979                                                 cfg_get(tcp, tcp_cfg, no_connect)))
1980                         return -1;
1981                 LM_DBG("no open tcp connection found, opening new one\n");
1982                 /* create tcp connection */
1983                 if (likely(from==0)){
1984                         /* check to see if we have to use a specific source addr. */
1985                         switch (dst->to.s.sa_family) {
1986                                 case AF_INET:
1987                                                 from = tcp_source_ipv4;
1988                                         break;
1989                                 case AF_INET6:
1990                                                 from = tcp_source_ipv6;
1991                                         break;
1992                                 default:
1993                                         /* error, bad af, ignore ... */
1994                                         break;
1995                         }
1996                 }
1997 #if defined(TCP_CONNECT_WAIT) && defined(TCP_ASYNC)
1998                 if (likely(cfg_get(tcp, tcp_cfg, tcp_connect_wait) &&
1999                                         cfg_get(tcp, tcp_cfg, async) )){
2000                         if (unlikely(*tcp_connections_no >=
2001                                                         cfg_get(tcp, tcp_cfg, max_connections))){
2002                                 LM_ERR("%s: maximum number of connections exceeded (%d/%d)\n",
2003                                                         su2a(&dst->to, sizeof(dst->to)),
2004                                                         *tcp_connections_no,
2005                                                         cfg_get(tcp, tcp_cfg, max_connections));
2006                                 return -1;
2007                         }
2008                         if (unlikely(dst->proto==PROTO_TLS)) {
2009                                 if (unlikely(*tls_connections_no >=
2010                                                         cfg_get(tcp, tcp_cfg, max_tls_connections))){
2011                                         LM_ERR("%s: maximum number of tls connections exceeded (%d/%d)\n",
2012                                                         su2a(&dst->to, sizeof(dst->to)),
2013                                                         *tls_connections_no,
2014                                                         cfg_get(tcp, tcp_cfg, max_tls_connections));
2015                                         return -1;
2016                                 }
2017                         }
2018                         c=tcpconn_new(-1, &dst->to, from, 0, dst->proto,
2019                                                         S_CONN_CONNECT);
2020                         if (unlikely(c==0)){
2021                                 LM_ERR("%s: could not create new connection\n",
2022                                                 su2a(&dst->to, sizeof(dst->to)));
2023                                 return -1;
2024                         }
2025                         c->flags|=F_CONN_PENDING|F_CONN_FD_CLOSED;
2026                         tcpconn_set_send_flags(c, dst->send_flags);
2027                         atomic_set(&c->refcnt, 2); /* ref from here and from main hash
2028                                                                                 * table */
2029                         /* add it to id hash and aliases */
2030                         if (unlikely(tcpconn_add(c)==0)){
2031                                 LM_ERR("%s: could not add connection %p\n",
2032                                                 su2a(&dst->to, sizeof(dst->to)), c);
2033                                 _tcpconn_free(c);
2034                                 n=-1;
2035                                 goto end_no_conn;
2036                         }
2037                         /* do connect and if src ip or port changed, update the
2038                          * aliases */
2039                         if (unlikely((fd=tcpconn_finish_connect(c, from))<0)){
2040                                 /* tcpconn_finish_connect will automatically blacklist
2041                                  * on error => no need to do it here */
2042                                 LM_ERR("%s: tcpconn_finish_connect(%p) failed\n",
2043                                                 su2a(&dst->to, sizeof(dst->to)), c);
2044                                 goto conn_wait_error;
2045                         }
2046                         if(c->flags & F_CONN_NOSEND) {
2047                                 /* connection marked as no-send data
2048                                  * (e.g., drop() from tls event route)*/
2049                                 LM_INFO("%s: connection marked for no-send (%p)\n",
2050                                                 su2a(&dst->to, sizeof(dst->to)), c);
2051                                 goto conn_wait_error;
2052                         }
2053                         /* ? TODO: it might be faster just to queue the write directly
2054                          *  and send to main CONN_NEW_PENDING_WRITE */
2055                         /* delay sending the fd to main after the send */
2056
2057                         /* NOTE: no lock here, because the connection is marked as
2058                          * pending and nobody else will try to write on it. However
2059                          * this might produce out-of-order writes. If this is not
2060                          * desired either lock before the write or use
2061                          * _wbufq_insert(...)
2062                          * NOTE2: _wbufq_insert() is used now (no out-of-order).
2063                          */
2064 #ifdef USE_TLS
2065                         if (unlikely(c->type==PROTO_TLS)) {
2066                                 /* for TLS the TLS processing and the send must happen
2067                                  * atomically w/ respect to other sends on the same connection
2068                                  * (otherwise reordering might occur which would break TLS) =>
2069                                  * lock. However in this case this send will always be the first.
2070                                  * We can have the send() outside the lock only if this is the
2071                                  * first and only send (tls_encode is not called again), or
2072                                  * this is the last send for a tls_encode() loop and all the
2073                                  * previous ones did return CONN_NEW_COMPLETE or CONN_EOF.
2074                                  */
2075                                 response[1] = CONN_NOP;
2076                                 t_buf = buf;
2077                                 t_len = len;
2078                                 lock_get(&c->write_lock);
2079 redo_tls_encode:
2080                                         t_send_flags = dst->send_flags;
2081                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2082                                                                         &t_send_flags);
2083                                         /* There are 4 cases:
2084                                          *  1. entire buffer consumed from the first try
2085                                          *    (rest_len == rest_buf == 0)
2086                                          *  2. rest_buf & first call
2087                                          *  3. rest_buf & not first call
2088                                          *        3a. CONN_NEW_COMPLETE or CONN_EOF
2089                                          *        3b. CONN_NEW_PENDING_WRITE
2090                                          *  4. entire buffer consumed, but not first call
2091                                          *      4a. CONN_NEW_COMPLETE or CONN_EOF
2092                                          *         4b. CONN_NEW_PENDING_WRITE
2093                                          *      We misuse response[1] == CONN_NOP to test for the
2094                                          *      first call.
2095                                          */
2096                                         if (unlikely(n < 0)) {
2097                                                 lock_release(&c->write_lock);
2098                                                 goto conn_wait_error;
2099                                         }
2100                                         if (likely(rest_len == 0)) {
2101                                                 /* 1 or 4*: CONN_NEW_COMPLETE, CONN_EOF,  CONN_NOP
2102                                                  * or CONN_NEW_PENDING_WRITE (*rest_len == 0) */
2103                                                 if (likely(response[1] != CONN_NEW_PENDING_WRITE)) {
2104                                                         /* 1 or 4a => it's safe to do the send outside the
2105                                                          * lock (it will either send directly or
2106                                                          * wbufq_insert())
2107                                                          */
2108                                                         lock_release(&c->write_lock);
2109                                                         if (likely(t_len != 0)) {
2110                                                                 n=tcpconn_1st_send(fd, c, t_buf, t_len,
2111                                                                                                         t_send_flags,
2112                                                                                                         &response[1], 0);
2113                                                         } else { /* t_len == 0 */
2114                                                                 if (response[1] == CONN_NOP) {
2115                                                                         /* nothing to send (e.g  parallel send
2116                                                                          * tls_encode queues some data and then
2117                                                                          * WANT_READ => this tls_encode will queue
2118                                                                          * the cleartext too and will have nothing
2119                                                                          * to send right now) and initial send =>
2120                                                                          * behave as if the send was successful
2121                                                                          * (but never return EOF here) */
2122                                                                         response[1] = CONN_NEW_COMPLETE;
2123                                                                 }
2124                                                         }
2125                                                         /* exit */
2126                                                 } else {
2127                                                         /* CONN_NEW_PENDING_WRITE:  4b: it was a
2128                                                          * repeated tls_encode() (or otherwise we would
2129                                                          * have here CONN_NOP) => add to the queue */
2130                                                         if (unlikely(t_len &&
2131                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
2132                                                                 response[1] = CONN_ERROR;
2133                                                                 n = -1;
2134                                                         }
2135                                                         lock_release(&c->write_lock);
2136                                                         /* exit (no send) */
2137                                                 }
2138                                         } else {  /* rest_len != 0 */
2139                                                 /* 2 or 3*: if tls_encode hasn't finished, we have to
2140                                                  * call tcpconn_1st_send() under lock (otherwise if it
2141                                                  * returns CONN_NEW_PENDING_WRITE, there is no way
2142                                                  * to find the right place to add the new queued
2143                                                  * data from the 2nd tls_encode()) */
2144                                                 if (likely((response[1] == CONN_NOP /*2*/ ||
2145                                                                         response[1] == CONN_NEW_COMPLETE /*3a*/ ||
2146                                                                         response[1] == CONN_EOF /*3a*/) && t_len))
2147                                                         n = tcpconn_1st_send(fd, c, t_buf, t_len,
2148                                                                                                         t_send_flags,
2149                                                                                                         &response[1], 1);
2150                                                 else if (unlikely(t_len &&
2151                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
2152                                                         /*3b: CONN_NEW_PENDING_WRITE*/
2153                                                         response[1] = CONN_ERROR;
2154                                                         n = -1;
2155                                                 }
2156                                                 if (likely(n >= 0)) {
2157                                                         /* if t_len == 0 => nothing was sent => previous
2158                                                          * response will be kept */
2159                                                         t_buf = rest_buf;
2160                                                         t_len = rest_len;
2161                                                         goto redo_tls_encode;
2162                                                 } else {
2163                                                         lock_release(&c->write_lock);
2164                                                         /* error exit */
2165                                                 }
2166                                         }
2167                         } else
2168 #endif /* USE_TLS */
2169                                 n=tcpconn_1st_send(fd, c, buf, len, dst->send_flags,
2170                                                                         &response[1], 0);
2171                         if (unlikely(n<0)) /* this will catch CONN_ERROR too */
2172                                 goto conn_wait_error;
2173                         if (unlikely(response[1]==CONN_EOF)){
2174                                 /* if close-after-send requested, don't bother
2175                                  * sending the fd back to tcp_main, try closing it
2176                                  * immediately (no other tcp_send should use it,
2177                                  * because it is marked as close-after-send before
2178                                  * being added to the hash) */
2179                                 goto conn_wait_close;
2180                         }
2181                         /* send to tcp_main */
2182                         response[0]=(long)c;
2183                         if (unlikely(send_fd(unix_tcp_sock, response,
2184                                                                         sizeof(response), fd) <= 0)){
2185                                 LM_ERR("%s: %ld for %p failed:" " %s (%d)\n",
2186                                                         su2a(&dst->to, sizeof(dst->to)),
2187                                                         response[1], c, strerror(errno), errno);
2188                                 goto conn_wait_error;
2189                         }
2190                         goto conn_wait_success;
2191                 }
2192 #endif /* TCP_CONNECT_WAIT  && TCP_ASYNC */
2193                 if (unlikely((c=tcpconn_connect(&dst->to, from, dst->proto,
2194                                                                                 &dst->send_flags))==0)){
2195                         LM_ERR("%s: connect failed\n", su2a(&dst->to, sizeof(dst->to)));
2196                         return -1;
2197                 }
2198                 if(c->flags & F_CONN_NOSEND) {
2199                         /* connection marked as no-send data
2200                          * (e.g., drop() from tls event route)*/
2201                         LM_INFO("%s: connection marked for no-send (%p)\n",
2202                                         su2a(&dst->to, sizeof(dst->to)), c);
2203                         /* we can safely delete it, it's not referenced by anybody */
2204                         _tcpconn_free(c);
2205                         n=-1;
2206                         goto end_no_conn;
2207                 }
2208                 tcpconn_set_send_flags(c, dst->send_flags);
2209                 if (likely(c->state==S_CONN_OK))
2210                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2211                 atomic_set(&c->refcnt, 2); /* ref. from here and it will also
2212                                                                         * be added in the tcp_main hash */
2213                 fd=c->s;
2214                 c->flags|=F_CONN_FD_CLOSED; /* not yet opened in main */
2215                 /* ? TODO: it might be faster just to queue the write and
2216                  * send to main a CONN_NEW_PENDING_WRITE */
2217
2218                 /* send the new tcpconn to "tcp main" */
2219                 response[0]=(long)c;
2220                 response[1]=CONN_NEW;
2221                 n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
2222                 if (unlikely(n<=0)){
2223                         LM_ERR("%s: failed send_fd: %s (%d)\n",
2224                                         su2a(&dst->to, sizeof(dst->to)),
2225                                         strerror(errno), errno);
2226                         /* we can safely delete it, it's not referenced by anybody */
2227                         _tcpconn_free(c);
2228                         n=-1;
2229                         goto end_no_conn;
2230                 }
2231                 /* new connection => send on it directly */
2232 #ifdef USE_TLS
2233                 if (unlikely(c->type==PROTO_TLS)) {
2234                         /* for TLS the TLS processing and the send must happen
2235                          * atomically w/ respect to other sends on the same connection
2236                          * (otherwise reordering might occur which would break TLS) =>
2237                          * lock.
2238                         */
2239                         response[1] = CONN_NOP;
2240                         t_buf = buf;
2241                         t_len = len;
2242                         lock_get(&c->write_lock);
2243                                 do {
2244                                         t_send_flags = dst->send_flags;
2245                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2246                                                                         &t_send_flags);
2247                                         if (likely(n > 0)) {
2248                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2249                                                                                                 &resp, 1);
2250                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2251                                                                         resp == CONN_ERROR))
2252                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2253                                                          * unless error */
2254                                                         response[1] = resp;
2255                                         } else  if (unlikely(n < 0)) {
2256                                                 response[1] = CONN_ERROR;
2257                                                 break;
2258                                         }
2259                                         /* else do nothing for n (t_len) == 0, keep
2260                                          * the last reponse */
2261                                         t_buf = rest_buf;
2262                                         t_len = rest_len;
2263                                 } while(unlikely(rest_len && n > 0));
2264                         lock_release(&c->write_lock);
2265                 } else
2266 #endif /* USE_TLS */
2267                         n = tcpconn_do_send(fd, c, buf, len, dst->send_flags,
2268                                                                         &response[1], 0);
2269                 if (unlikely(response[1] != CONN_NOP)) {
2270                         response[0]=(long)c;
2271                         if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2272                                 BUG("tcp_main command %ld sending failed (write):"
2273                                                 "%s (%d)\n", response[1], strerror(errno), errno);
2274                                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2275                                  * (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec
2276                                  * refcnt => if sending the command fails we have to
2277                                  * dec. refcnt by hand */
2278                                 tcpconn_chld_put(c); /* deref. it manually */
2279                                 n=-1;
2280                         }
2281                         /* here refcnt for c is already decremented => c contents can
2282                          * no longer be used and refcnt _must_ _not_ be decremented
2283                          * again on exit */
2284                         if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2285                                 /* on error or eof, close fd */
2286                                 tcp_safe_close(fd);
2287                         } else if (response[1] == CONN_QUEUED_WRITE) {
2288 #ifdef TCP_FD_CACHE
2289                                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2290                                         tcp_fd_cache_add(c, fd);
2291                                 } else
2292 #endif /* TCP_FD_CACHE */
2293                                         tcp_safe_close(fd);
2294                         } else {
2295                                 BUG("unexpected tcpconn_do_send() return & response:"
2296                                                 " %d, %ld\n", n, response[1]);
2297                         }
2298                         goto end_no_deref;
2299                 }
2300 #ifdef TCP_FD_CACHE
2301                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2302                         tcp_fd_cache_add(c, fd);
2303                 }else
2304 #endif /* TCP_FD_CACHE */
2305                         tcp_safe_close(fd);
2306         /* here we can have only commands that _do_ _not_ dec refcnt.
2307          * (CONN_EOF, CON_ERROR, CON_QUEUED_WRITE are all treated above) */
2308                 goto release_c;
2309         } /* if (c==0 or unusable) new connection */
2310         /* existing connection, send on it */
2311         n = tcpconn_send_put(c, buf, len, dst->send_flags);
2312         /* no deref needed (automatically done inside tcpconn_send_put() */
2313         return n;
2314 #ifdef TCP_CONNECT_WAIT
2315 conn_wait_success:
2316 #ifdef TCP_FD_CACHE
2317         if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2318                 tcp_fd_cache_add(c, fd);
2319         } else
2320 #endif /* TCP_FD_CACHE */
2321                 if (unlikely (tcp_safe_close(fd) < 0))
2322                         LM_ERR("closing temporary send fd for %p: %s: "
2323                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2324                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2325                                         fd, c->flags, strerror(errno), errno);
2326         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2327         return n;
2328 conn_wait_error:
2329         n=-1;
2330 conn_wait_close:
2331         /* connect or send failed or immediate close-after-send was requested on
2332          * newly created connection which was not yet sent to tcp_main (but was
2333          * already hashed) => don't send to main, unhash and destroy directly
2334          * (if refcnt>2 it will be destroyed when the last sender releases the
2335          * connection (tcpconn_chld_put(c))) or when tcp_main receives a
2336          * CONN_ERROR it*/
2337         c->state=S_CONN_BAD;
2338         /* we are here only if we opened a new fd (and not reused a cached or
2339          * a reader one) => if the connect was successful close the fd */
2340         if (fd>=0) {
2341                 if (unlikely(tcp_safe_close(fd) < 0 ))
2342                         LM_ERR("closing temporary send fd for %p: %s: "
2343                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2344                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2345                                         fd, c->flags, strerror(errno), errno);
2346         }
2347         /* here the connection is for sure in the hash (tcp_main will not
2348          * remove it because it's marked as PENDing) and the refcnt is at least 2
2349          */
2350         TCPCONN_LOCK;
2351                 _tcpconn_detach(c);
2352                 c->flags&=~F_CONN_HASHED;
2353                 tcpconn_put(c);
2354         TCPCONN_UNLOCK;
2355         /* dec refcnt -> mark it for destruction */
2356         tcpconn_chld_put(c);
2357         return n;
2358 #endif /* TCP_CONNECT_WAIT */
2359 release_c:
2360         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2361 end_no_deref:
2362 end_no_conn:
2363         return n;
2364 }
2365
2366
2367
2368 /** sends on an existing tcpconn and auto-decrements its reference counter.
2369  * As opposed to tcp_send(), this function requires an existing
2370  * tcp connection.
2371  * WARNING: the tcp_connection's reference counter will be decremented.
2372  * @param c - existing tcp connection pointer.
2373  * @param buf - data to be sent.
2374  * @param len - data length.
2375  * @return >=0 on success, -1 on error.
2376  */
2377 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
2378                                                                 unsigned len, snd_flags_t send_flags)
2379 {
2380         struct tcp_connection *tmp;
2381         int fd;
2382         long response[2];
2383         int n;
2384         int do_close_fd;
2385 #ifdef USE_TLS
2386         const char* rest_buf;
2387         const char* t_buf;
2388         unsigned rest_len, t_len;
2389         long resp;
2390         snd_flags_t t_send_flags;
2391 #endif /* USE_TLS */
2392 #ifdef TCP_FD_CACHE
2393         struct fd_cache_entry* fd_cache_e;
2394         int use_fd_cache;
2395         
2396         use_fd_cache=cfg_get(tcp, tcp_cfg, fd_cache);
2397         fd_cache_e=0;
2398 #endif /* TCP_FD_CACHE */
2399         do_close_fd=1; /* close the fd on exit */
2400         response[1] = CONN_NOP;
2401 #ifdef TCP_ASYNC
2402         /* if data is already queued, we don't need the fd */
2403 #ifdef TCP_CONNECT_WAIT
2404                 if (unlikely(cfg_get(tcp, tcp_cfg, async) &&
2405                                                 (_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)) ))
2406 #else /* ! TCP_CONNECT_WAIT */
2407                 if (unlikely(cfg_get(tcp, tcp_cfg, async) && (_wbufq_non_empty(c)) ))
2408 #endif /* TCP_CONNECT_WAIT */
2409                 {
2410                         lock_get(&c->write_lock);
2411 #ifdef TCP_CONNECT_WAIT
2412                                 if (likely(_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)))
2413 #else /* ! TCP_CONNECT_WAIT */
2414                                 if (likely(_wbufq_non_empty(c)))
2415 #endif /* TCP_CONNECT_WAIT */
2416                                 {
2417                                         do_close_fd=0;
2418 #ifdef USE_TLS
2419                                         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2420                                                 t_buf = buf;
2421                                                 t_len = len;
2422                                                 do {
2423                                                         t_send_flags = send_flags;
2424                                                         n = tls_encode(c, &t_buf, &t_len,
2425                                                                                         &rest_buf, &rest_len,
2426                                                                                         &t_send_flags);
2427                                                         if (unlikely((n < 0) || (t_len &&
2428                                                                          (_wbufq_add(c, t_buf, t_len) < 0)))) {
2429                                                                 lock_release(&c->write_lock);
2430                                                                 n=-1;
2431                                                                 response[1] = CONN_ERROR;
2432                                                                 c->state=S_CONN_BAD;
2433                                                                 c->timeout=get_ticks_raw(); /* force timeout */
2434                                                                 goto error;
2435                                                         }
2436                                                         t_buf = rest_buf;
2437                                                         t_len = rest_len;
2438                                                 } while(unlikely(rest_len && n > 0));
2439                                         } else
2440 #endif /* USE_TLS */
2441                                                 if (unlikely(len && (_wbufq_add(c, buf, len)<0))){
2442                                                         lock_release(&c->write_lock);
2443                                                         n=-1;
2444                                                         response[1] = CONN_ERROR;
2445                                                         c->state=S_CONN_BAD;
2446                                                         c->timeout=get_ticks_raw(); /* force timeout */
2447                                                         goto error;
2448                                                 }
2449                                         n=len;
2450                                         lock_release(&c->write_lock);
2451                                         goto release_c;
2452                                 }
2453                         lock_release(&c->write_lock);
2454                 }
2455 #endif /* TCP_ASYNC */
2456                 /* check if this is not the same reader process holding
2457                  *  c  and if so send directly on c->fd */
2458                 if (c->reader_pid==my_pid()){
2459                         LM_DBG("send from reader (%d (%d)), reusing fd\n",
2460                                         my_pid(), process_no);
2461                         fd=c->fd;
2462                         do_close_fd=0; /* don't close the fd on exit, it's in use */
2463 #ifdef TCP_FD_CACHE
2464                         use_fd_cache=0; /* don't cache: problems would arise due to the
2465                                                            close() on cache eviction (if the fd is still 
2466                                                            used). If it has to be cached then dup() _must_ 
2467                                                            be used */
2468                 }else if (likely(use_fd_cache && 
2469                                                         ((fd_cache_e=tcp_fd_cache_get(c))!=0))){
2470                         fd=fd_cache_e->fd;
2471                         do_close_fd=0;
2472                         LM_DBG("found fd in cache (%d, %p, %d)\n", fd, c, fd_cache_e->id);
2473 #endif /* TCP_FD_CACHE */
2474                 }else{
2475                         LM_DBG("tcp connection found (%p), acquiring fd\n", c);
2476                         /* get the fd */
2477                         response[0]=(long)c;
2478                         response[1]=CONN_GET_FD;
2479                         n=send_all(unix_tcp_sock, response, sizeof(response));
2480                         if (unlikely(n<=0)){
2481                                 LM_ERR("failed to get fd(write):%s (%d)\n", strerror(errno), errno);
2482                                 n=-1;
2483                                 goto release_c;
2484                         }
2485                         LM_DBG("c=%p, n=%d\n", c, n);
2486                         n=receive_fd(unix_tcp_sock, &tmp, sizeof(tmp), &fd, MSG_WAITALL);
2487                         if (unlikely(n<=0)){
2488                                 LM_ERR("failed to get fd(receive_fd): %s (%d)\n",
2489                                                 strerror(errno), errno);
2490                                 n=-1;
2491                                 do_close_fd=0;
2492                                 goto release_c;
2493                         }
2494                         /* handle fd closed or bad connection/error
2495                                 (it's possible that this happened in the time between
2496                                 we found the intial connection and the time when we get
2497                                 the fd)
2498                          */
2499                         if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2500                                 if (unlikely(c!=tmp && tmp!=0))
2501                                         BUG("tcp_send: get_fd: got different connection:"
2502                                                 "  %p (id= %d, refcnt=%d state=%d) != "
2503                                                 "  %p (n=%d)\n",
2504                                                   c,   c->id,   atomic_get(&c->refcnt),   c->state,
2505                                                   tmp, n
2506                                                 );
2507                                 n=-1; /* fail */
2508                                 /* don't cache fd & close it */
2509                                 do_close_fd = (fd==-1)?0:1;
2510 #ifdef TCP_FD_CACHE
2511                                 use_fd_cache = 0;
2512 #endif /* TCP_FD_CACHE */
2513                                 goto end;
2514                         }
2515                         LM_DBG("after receive_fd: c= %p n=%d fd=%d\n",c, n, fd);
2516                 }
2517         
2518 #ifdef USE_TLS
2519                 if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2520                         /* for TLS the TLS processing and the send must happen
2521                            atomically w/ respect to other sends on the same connection
2522                            (otherwise reordering might occur which would break TLS) =>
2523                            lock.
2524                         */
2525                         response[1] = CONN_NOP;
2526                         t_buf = buf;
2527                         t_len = len;
2528                         lock_get(&c->write_lock);
2529                                 do {
2530                                         t_send_flags = send_flags;
2531                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2532                                                                         &t_send_flags);
2533                                         if (likely(n > 0)) {
2534                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2535                                                                                                 &resp, 1);
2536                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2537                                                                         resp == CONN_ERROR))
2538                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2539                                                            unless error */
2540                                                         response[1] = resp;
2541                                         } else if (unlikely(n < 0)) {
2542                                                 response[1] = CONN_ERROR;
2543                                                 break;
2544                                         }
2545                                         /* else do nothing for n (t_len) == 0, keep
2546                                            the last reponse */
2547                                         t_buf = rest_buf;
2548                                         t_len = rest_len;
2549                                 } while(unlikely(rest_len && n > 0));
2550                         lock_release(&c->write_lock);
2551                 } else
2552 #endif
2553                         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 0);
2554         if (unlikely(response[1] != CONN_NOP)) {
2555 error:
2556                 response[0]=(long)c;
2557                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2558                         BUG("tcp_main command %ld sending failed (write):%s (%d)\n",
2559                                         response[1], strerror(errno), errno);
2560                         /* all commands != CONN_NOP returned by tcpconn_do_send()
2561                            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2562                            => if sending the command fails we have to dec. refcnt by hand
2563                          */
2564                         tcpconn_chld_put(c); /* deref. it manually */
2565                         n=-1;
2566                 }
2567                 /* here refcnt for c is already decremented => c contents can no
2568                    longer be used and refcnt _must_ _not_ be decremented again
2569                    on exit */
2570                 if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2571                         /* on error or eof, remove from cache or close fd */
2572 #ifdef TCP_FD_CACHE
2573                         if (unlikely(fd_cache_e)){
2574                                 tcp_fd_cache_rm(fd_cache_e);
2575                                 fd_cache_e = 0;
2576                                 tcp_safe_close(fd);
2577                         }else
2578 #endif /* TCP_FD_CACHE */
2579                                 if (do_close_fd) tcp_safe_close(fd);
2580                 } else if (response[1] == CONN_QUEUED_WRITE) {
2581 #ifdef TCP_FD_CACHE
2582                         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2583                                 tcp_fd_cache_add(c, fd);
2584                         }else
2585 #endif /* TCP_FD_CACHE */
2586                                 if (do_close_fd) tcp_safe_close(fd);
2587                 } else {
2588                         BUG("unexpected tcpconn_do_send() return & response: %d, %ld\n",
2589                                         n, response[1]);
2590                 }
2591                 return n; /* no tcpconn_put */
2592         }
2593 end:
2594 #ifdef TCP_FD_CACHE
2595         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2596                 tcp_fd_cache_add(c, fd);
2597         }else
2598 #endif /* TCP_FD_CACHE */
2599         if (do_close_fd) {
2600                 if (unlikely(tcp_safe_close(fd) < 0))
2601                         LM_ERR("closing temporary send fd for %p: %s: "
2602                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2603                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2604                                         fd, c->flags, strerror(errno), errno);
2605         }
2606         /* here we can have only commands that _do_ _not_ dec refcnt.
2607            (CONN_EOF, CON_ERROR, CON_QUEUED_WRITE are all treated above) */
2608 release_c:
2609         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2610         return n;
2611 }
2612
2613
2614
2615 /* unsafe send on a known tcp connection.
2616  * Directly send on a known tcp connection with a given fd.
2617  * It is assumed that the connection locks are already held
      * (tcpconn_do_send() is called with locked=1, so it neither takes nor
      * releases c->write_lock on behalf of this function).
2618  * Side effects: if needed it will send state update commands to
2619  *  tcp_main (e.g. CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) over
      *  unix_tcp_sock. Before doing so it increments c->refcnt, to
      *  compensate for the decrement that goes with each of these commands.
2620  * @param fd - fd used for sending.
2621  * @param c - existing tcp connection pointer (state and flags might be
2622  *            changed).
2623  * @param buf - data to be sent.
2624  * @param len - data length.
2625  * @param send_flags - send flags, passed unchanged to tcpconn_do_send().
2626  * @return <0 on error, number of bytes sent on success (the value returned
      *         by tcpconn_do_send(), or -1 if the command for tcp_main could
      *         not be written).
2627  */
2628 int tcpconn_send_unsafe(int fd, struct tcp_connection *c,
2629                                                 const char* buf, unsigned len, snd_flags_t send_flags)
2630 {
2631         int n;
2632         long response[2]; /* [0] = connection pointer, [1] = command */
2633         
2634         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 1);
2635         if (unlikely(response[1] != CONN_NOP)) {
2636                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2637                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2638                    => increment it (we don't want the connection to be destroyed
2639                    from under us)
2640                  */
2641                 atomic_inc(&c->refcnt);
2642                 response[0]=(long)c;
2643                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2644                         BUG("connection %p command %ld sending failed (write):%s (%d)\n",
2645                                         c, response[1], strerror(errno), errno);
2646                         /* send failed => the command (and the decrement that goes
                               with it) never happens: undo the increment above by hand
                               and report the failure to the caller */
2647                         tcpconn_chld_put(c); 
2648                         n=-1;
2649                 }
2650                 /* here the extra reference taken above is already accounted for
2651                    (by the command sent to tcp_main or by the manual put), so
2652                    refcnt _must_ _not_ be decremented again on exit */
2653                 return n;
2654         }
             /* CONN_NOP: nothing to report to tcp_main, refcnt untouched */
2655         return n;
2656 }
2657
2658
2659
2660 /** lower level send (connection and fd should be known).
2661  * It takes care of possible write-queueing, blacklisting a.s.o.
2662  * It expects a valid tcp connection. It doesn't touch the ref. cnts.
2663  * It will also set the connection flags from send_flags (it's better
2664  * to do it here, because it's guaranteed to be under lock).
      *
      * Outline of the code below:
      *  - TCP_ASYNC compiled in and the tcp "async" cfg option set:
      *    if the write queue is non-empty (or, with TCP_CONNECT_WAIT, the
      *    connection has F_CONN_PENDING set) the data is only appended to
      *    the write queue and len is returned; otherwise a non-blocking
      *    write is attempted and, on a short write or
      *    EAGAIN/EWOULDBLOCK, the unsent part is appended to the queue.
      *  - otherwise tsend_stream() is used with the configured send_timeout.
      *  - a short write that is not queued (or a failed queue add) is an
      *    error: c->state becomes S_CONN_BAD, c->timeout is set to the
      *    current ticks and *resp to CONN_ERROR.
      *  - after a complete write with SND_F_CON_CLOSE set in send_flags,
      *    the connection is also marked S_CONN_BAD and *resp is CONN_EOF.
      *
2665  * @param fd - fd used for sending.
2666  * @param c - existing tcp connection pointer (state and flags might be
2667  *            changed).
2668  * @param buf - data to be sent.
2669  * @param len - data length.
2670  * @param send_flags - merged into the connection via
      *                     tcpconn_set_send_flags(); SND_F_CON_CLOSE requests
      *                     close-after-write.
2671  * @param resp - filled with a cmd. for tcp_main:
2672  *                      CONN_NOP - nothing needs to be done (do not send
2673  *                                 anything to tcp_main).
2674  *                      CONN_ERROR - error, connection should be closed.
2675  *                      CONN_EOF - no error, but connection should be closed.
2676  *                      CONN_QUEUED_WRITE - new write queue (connection
2677  *                                 should be watched for write and the wr.
2678  *                                 queue flushed).
2679  * @param locked - if set assume the connection is already locked (call from
2680  *                  tls) and do not lock/unlock the connection.
2681  * @return >=0 on success (len when the data was fully or partially queued),
      *         < 0 on error && *resp == CONN_ERROR.
2682  *
2683  */
2684 static int tcpconn_do_send(int fd, struct tcp_connection* c,
2685                                                         const char* buf, unsigned len,
2686                                                         snd_flags_t send_flags, long* resp,
2687                                                         int locked)
2688 {
2689         int  n;
2690 #ifdef TCP_ASYNC
2691         int enable_write_watch;
2692 #endif /* TCP_ASYNC */
2693
2694         LM_DBG("sending...\n");
2695         *resp = CONN_NOP;
2696         if (likely(!locked)) lock_get(&c->write_lock);
2697         /* update connection send flags with the current ones */
2698         tcpconn_set_send_flags(c, send_flags);
2699 #ifdef TCP_ASYNC
2700         if (likely(cfg_get(tcp, tcp_cfg, async))){
                     /* data already queued (or connect still pending): do not
                        write directly, just append to the write queue */
2701                 if (_wbufq_non_empty(c)
2702 #ifdef TCP_CONNECT_WAIT
2703                         || (c->flags&F_CONN_PENDING) 
2704 #endif /* TCP_CONNECT_WAIT */
2705                         ){
2706                         if (unlikely(_wbufq_add(c, buf, len)<0)){
2707                                 if (likely(!locked)) lock_release(&c->write_lock);
2708                                 n=-1;
2709                                 goto error;
2710                         }
2711                         if (likely(!locked)) lock_release(&c->write_lock);
2712                         n=len; /* queued data is reported as sent */
2713                         goto end; /* *resp stays CONN_NOP */
2714                 }
2715                 n=_tcpconn_write_nb(fd, c, buf, len);
2716         }else{
2717 #endif /* TCP_ASYNC */
2718                 /* n=tcp_blocking_write(c, fd, buf, len); */
                     /* timeout argument: send_timeout converted from ticks to
                        seconds, times 1000 (presumably milliseconds - confirm
                        against tsend_stream()) */
2719                 n=tsend_stream(fd, buf, len,
2720                                                 TICKS_TO_S(cfg_get(tcp, tcp_cfg, send_timeout)) *
2721                                                 1000);
2722 #ifdef TCP_ASYNC
2723         }
2724 #else /* ! TCP_ASYNC */
             /* without TCP_ASYNC the lock is dropped right after the write; with
                TCP_ASYNC it is held longer and released on each path below */
2725         if (likely(!locked)) lock_release(&c->write_lock);
2726 #endif /* TCP_ASYNC */
2727         
2728         LM_DBG("after real write: c= %p n=%d fd=%d\n",c, n, fd);
2729         LM_DBG("buf=\n%.*s\n", (int)len, buf);
             /* NOTE(review): errno from the write is examined below, after the
                two LM_DBG() calls above; this assumes logging leaves errno
                unchanged - confirm */
2730         if (unlikely(n<(int)len)){
2731 #ifdef TCP_ASYNC
2732                 if (cfg_get(tcp, tcp_cfg, async) &&
2733                                 ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK)){
                             /* short write or "would block": queue what was not sent.
                                The write watch is requested only if the queue was
                                empty before this add. */
2734                         enable_write_watch=_wbufq_empty(c);
2735                         if (n<0) n=0; /* nothing written => queue the whole buffer */
2736                         else if (unlikely(c->state==S_CONN_CONNECT ||
2737                                                 c->state==S_CONN_ACCEPT)){
2738                                 TCP_STATS_ESTABLISHED(c->state);
2739                                 c->state=S_CONN_OK; /* something was written */
2740                         }
2741                         if (unlikely(_wbufq_add(c, buf+n, len-n)<0)){
2742                                 if (likely(!locked)) lock_release(&c->write_lock);
2743                                 n=-1;
2744                                 goto error;
2745                         }
2746                         if (likely(!locked)) lock_release(&c->write_lock);
2747                         n=len; /* sent + queued is reported as fully sent */
2748                         if (likely(enable_write_watch))
2749                                 *resp=CONN_QUEUED_WRITE;
2750                         goto end;
2751                 }else{
                             /* real write error (or async disabled): release the
                                lock and fall through to the error handling */
2752                         if (likely(!locked)) lock_release(&c->write_lock);
2753                 }
2754 #endif /* TCP_ASYNC */
                     /* a failure while still in S_CONN_CONNECT is handled as a
                        connect failure (connect blacklist reason, connect events,
                        connect-failed stats); in any other state as a send
                        failure */
2755                 if (unlikely(c->state==S_CONN_CONNECT)){
2756                         switch(errno){
2757                                 case ENETUNREACH:
2758                                 case EHOSTUNREACH: /* not posix for send() */
2759 #ifdef USE_DST_BLACKLIST
2760                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2761                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2762 #endif /* USE_DST_BLACKLIST */
2763                                         TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2764                                                                         TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2765                                         break;
2766                                 case ECONNREFUSED:
2767                                 case ECONNRESET:
2768 #ifdef USE_DST_BLACKLIST
2769                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2770                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2771 #endif /* USE_DST_BLACKLIST */
2772                                         TCP_EV_CONNECT_RST(errno, TCP_LADDR(c), TCP_LPORT(c),
2773                                                                                 TCP_PSU(c), TCP_PROTO(c));
2774                                         break;
2775                                 default:
2776                                         TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c), TCP_LPORT(c),
2777                                                                                 TCP_PSU(c), TCP_PROTO(c));
2778                                 }
2779                         TCP_STATS_CONNECT_FAILED();
2780                 }else{
2781                         switch(errno){
2782                                 case ECONNREFUSED:
2783                                 case ECONNRESET:
2784                                         TCP_STATS_CON_RESET();
2785                                         /* no break: fall through to the blacklist code */
2786                                 case ENETUNREACH:
2787                                 /*case EHOSTUNREACH: -- not posix */
2788 #ifdef USE_DST_BLACKLIST
2789                                         dst_blacklist_su(BLST_ERR_SEND, c->rcv.proto,
2790                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2791 #endif /* USE_DST_BLACKLIST */
2792                                         break;
2793                         }
2794                 }
2795                 LM_ERR("failed to send on %p (%s:%d->%s): %s (%d)\n",
2796                                         c, ip_addr2a(&c->rcv.dst_ip), c->rcv.dst_port,
2797                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2798                                         strerror(errno), errno);
2799                 n = -1;
2800 #ifdef TCP_ASYNC
                     /* target of the _wbufq_add() failure paths above; they arrive
                        with n == -1 and the lock already released. The label lives
                        inside this if-body and exists only under TCP_ASYNC. */
2801 error:
2802 #endif /* TCP_ASYNC */
2803                 /* error on the connection , mark it as bad and set 0 timeout */
2804                 c->state=S_CONN_BAD;
2805                 c->timeout=get_ticks_raw();
2806                 /* tell "main" it should drop this (optional it will t/o anyway?)*/
2807                 *resp=CONN_ERROR;
2808                 return n; /* error return, no tcpconn_put */
2809         }
2810         
             /* complete write: everything was sent, nothing was queued */
2811 #ifdef TCP_ASYNC
2812         if (likely(!locked)) lock_release(&c->write_lock);
2813 #endif /* TCP_ASYNC */
2814         /* in non-async mode here we're either in S_CONN_OK or S_CONN_ACCEPT*/
2815         if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
2816                         TCP_STATS_ESTABLISHED(c->state);
2817                         c->state=S_CONN_OK;
2818         }
2819         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2820                 /* close after write => send EOF request to tcp_main */
2821                 c->state=S_CONN_BAD;
2822                 c->timeout=get_ticks_raw();
2823                 /* tell "main" it should drop this*/
2824                 *resp=CONN_EOF;
2825                 return n;
2826         }
             /* reached by goto from the queueing paths (*resp is CONN_NOP or
                CONN_QUEUED_WRITE) and by fall-through after a complete write
                (*resp is CONN_NOP) */
2827 end:
2828         return n;
2829 }
2830
2831
2832
2833 /** low level 1st send on a new connection.
2834  * It takes care of possible write-queueing, blacklisting a.s.o.
2835  * It expects a valid just-opened tcp connection. It doesn't touch the 
2836  * ref. counters. It's used only in the async first send case.
      * The write itself (_tcpconn_write_nb()) is done without taking
      * c->write_lock; the lock is taken (when !locked) only around the
      * insertion of unsent data into the write queue.
2837  * @param fd - fd used for sending.
2838  * @param c - existing tcp connection pointer (state and flags might be
2839  *            changed). The connection must be new (no previous send on it).
2840  * @param buf - data to be sent.
2841  * @param len - data length.
2842  * @param send_flags - only SND_F_CON_CLOSE is examined here.
2843  * @param resp - filled with a fd sending cmd. for tcp_main on success. It
2844  *                      _must_ be one of the commands listed below:
2845  *                      CONN_NEW_PENDING_WRITE - new connection, first write
2846  *                                 was partially successful (or EAGAIN) and
2847  *                                 was queued (connection should be watched
2848  *                                 for write and the write queue flushed).
2849  *                                 The fd should be sent to tcp_main.
2850  *                      CONN_NEW_COMPLETE - new connection, first write
2851  *                                 completed successfully and no data is
2852  *                                 queued. The fd should be sent to tcp_main.
2853  *                      CONN_EOF - no error, but the connection should be
2854  *                                  closed (e.g. SND_F_CON_CLOSE send flag).
2855  *                      CONN_ERROR - error, _must_ return < 0.
2856  * @param locked - if set assume the connection is already locked (call from
2857  *                  tls) and do not lock/unlock the connection.
2858  * @return >=0 on success (len if part of the data had to be queued),
      *         -1 on error (the error path below sets *resp to CONN_ERROR).
2859  *
2860  */
2861 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
2862                                                         const char* buf, unsigned len,
2863                                                         snd_flags_t send_flags, long* resp,
2864                                                         int locked)
2865 {
2866         int n;
2867         
2868         n=_tcpconn_write_nb(fd, c, buf, len);
2869         if (unlikely(n<(int)len)){
2870                 /* on EAGAIN or ENOTCONN return success.
2871                    ENOTCONN appears on newer FreeBSD versions (non-blocking socket,
2872                    connect() & send immediately) */
2873                 if ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK || errno==ENOTCONN){
                             /* NOTE(review): len is unsigned but is printed with %d
                                in both messages below - confirm this is intended */
2874                         if(n<0) {
2875                                 LM_DBG("pending write on new connection %p "
2876                                         "(%d/%d bytes written) (err: %d - %s)\n", c, n, len,
2877                                         errno, strerror(errno));
2878                         } else {
2879                                 LM_DBG("pending write on new connection %p "
2880                                         "(%d/%d bytes written)\n", c, n, len);
2881                         }
2882                         if (unlikely(n<0)) n=0; /* nothing written => queue it all */
2883                         else{
2884                                 if (likely(c->state == S_CONN_CONNECT))
2885                                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2886                                 c->state=S_CONN_OK; /* partial write => connect()
2887                                                                                                 ended */
2888                         }
2889                         /* add to the write queue (the extra indentation of the
                                if below only marks the locked region, it is not a
                                nested block) */
2890                         if (likely(!locked)) lock_get(&c->write_lock);
2891                                 if (unlikely(_wbufq_insert(c, buf+n, len-n)<0)){
2892                                         if (likely(!locked)) lock_release(&c->write_lock);
2893                                         n=-1;
2894                                         LM_ERR("%s: EAGAIN and write queue full or failed for %p\n",
2895                                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
2896                                         goto error;
2897                                 }
2898                         if (likely(!locked)) lock_release(&c->write_lock);
2899                         /* send to tcp_main */
2900                         *resp=CONN_NEW_PENDING_WRITE;
2901                         n=len; /* written + queued is reported as fully sent */
2902                         goto end;
2903                 }
2904                 /* n < 0 and not EAGAIN => write error */
2905                 /* if first write failed it's most likely a
2906                    connect error */
2907                 switch(errno){
2908                         case ENETUNREACH:
2909                         case EHOSTUNREACH:  /* not posix for send() */
2910 #ifdef USE_DST_BLACKLIST
2911                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2912                                                                         &c->rcv.src_su, &c->send_flags, 0);
2913 #endif /* USE_DST_BLACKLIST */
2914                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2915                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2916                                 break;
2917                         case ECONNREFUSED:
2918                         case ECONNRESET:
2919 #ifdef USE_DST_BLACKLIST
2920                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2921                                                                         &c->rcv.src_su, &c->send_flags, 0);
2922 #endif /* USE_DST_BLACKLIST */
2923                                 TCP_EV_CONNECT_RST(errno, TCP_LADDR(c),
2924                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2925                                 break;
2926                         default:
2927                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
2928                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2929                 }
2930                 /* error: count the failed connect, log it and report
                        CONN_ERROR via the error label; c->state is not changed
                        on this path */
2931                 TCP_STATS_CONNECT_FAILED();
2932                 LM_ERR("%s: connect & send  for %p failed:" " %s (%d)\n",
2933                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2934                                         c, strerror(errno), errno);
2935                 goto error;
2936         }
             /* the whole buffer was written by the first non-blocking write */
2937         LM_INFO("quick connect for %p\n", c);
2938         if (likely(c->state == S_CONN_CONNECT))
2939                 TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2940         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2941                 /* close after write =>  EOF => close immediately */
2942                 c->state=S_CONN_BAD;
2943                 /* tell our caller that it should drop this*/
2944                 *resp=CONN_EOF;
2945         }else{
2946                 c->state=S_CONN_OK;
2947                 /* send to tcp_main */
2948                 *resp=CONN_NEW_COMPLETE;
2949         }
2950 end:
2951         return n; /* >= 0 */
2952 error:
             /* common failure exit: the return value is always -1 here */
2953         *resp=CONN_ERROR;
2954         return -1;
2955 }
2956
2957
2958
2959 int tcp_init(struct socket_info* sock_info)
2960 {
2961         union sockaddr_union* addr;
2962         int optval;
2963 #ifdef HAVE_TCP_ACCEPT_FILTER
2964         struct accept_filter_arg afa;
2965 #endif /* HAVE_TCP_ACCEPT_FILTER */
2966 #ifdef DISABLE_NAGLE
2967         int flag;
2968         struct protoent* pe;
2969
2970         if (tcp_proto_no==-1){ /* if not already set */
2971                 pe=getprotobyname("tcp");
2972                 if (pe==0){
2973                         LM_ERR("could not get TCP protocol number\n");
2974                         tcp_proto_no=-1;
2975                 }else{
2976                         tcp_proto_no=pe->p_proto;
2977                 }
2978         }
2979 #endif
2980
2981         addr=&sock_info->su;
2982         /* sock_info->proto=PROTO_TCP; */
2983         if (init_su(addr, &sock_info->address, sock_info->port_no)<0){
2984                 LM_ERR("could no init sockaddr_union\n");
2985                 goto error;
2986         }
2987         LM_DBG("added %s\n", su2a(addr, sizeof(*addr)));
2988         sock_info->socket=socket(AF2PF(addr->s.sa_family), SOCK_STREAM, 0);
2989         if (sock_info->socket==-1){
2990                 LM_ERR("tcp_init: socket: %s\n", strerror(errno));
2991                 goto error;
2992         }
2993 #ifdef DISABLE_NAGLE
2994         flag=1;
2995         if ( (tcp_proto_no!=-1) &&
2996                  (setsockopt(sock_info->socket, tcp_proto_no , TCP_NODELAY,
2997                                          &flag, sizeof(flag))<0) ){
2998                 LM_ERR("could not disable Nagle: %s\n", strerror(errno));
2999         }
3000 #endif
3001
3002
3003 #if  !defined(TCP_DONT_REUSEADDR) 
3004         /* Stevens, "Network Programming", Section 7.5, "Generic Socket
3005      * Options": "...server started,..a child continues..on existing
3006          * connection..listening server is restarted...call to bind fails
3007          * ... ALL TCP servers should specify the SO_REUSEADDRE option 
3008          * to allow the server to be restarted in this situation
3009          *
3010          * Indeed, without this option, the server can't restart.
3011          *   -jiri
3012          */
3013         optval=1;
3014         if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEADDR,
3015                                 (void*)&optval, sizeof(optval))==-1) {
3016                 LM_ERR("setsockopt %s\n", strerror(errno));
3017                 goto error;
3018         }
3019 #endif
3020
3021 #ifdef SO_REUSEPORT
3022         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
3023                 if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEPORT,
3024                                 (void*)&optval, sizeof(optval))==-1) {
3025                         LM_ERR("setsockopt %s\n", strerror(errno));
3026                 }
3027         }
3028 #endif
3029
3030         /* tos */
3031         optval = tos;
3032         if(sock_info->address.af==AF_INET){
3033                 if (setsockopt(sock_info->socket, IPPROTO_IP, IP_TOS, (void*)&optval,
3034                                         sizeof(optval)) ==-1){
3035                         LM_WARN("setsockopt tos: %s (%d)\n", strerror(errno), tos);
3036                         /* continue since this is not critical */
3037                 }
3038         } else if(sock_info->address.af==AF_INET6){
3039                 if (setsockopt(sock_info->socket, IPPROTO_IPV6, IPV6_TCLASS,
3040                                         (void*)&optval, sizeof(optval)) ==-1) {
3041                         LM_WARN("setsockopt v6 tos: %s (%d)\n", strerror(errno), tos);
3042                         /* continue since this is not critical */
3043                 }
3044                 if(sr_bind_ipv6_link_local!=0) {
3045                         LM_INFO("setting scope of %s\n", sock_info->address_str.s);
3046                         addr->sin6.sin6_scope_id =
3047                                 ipv6_get_netif_scope(sock_info->address_str.s);
3048                 }
3049         }
3050
3051 #if defined(IP_FREEBIND)
3052         /* allow bind to non local address.
3053          * useful when daemon started before network initialized */
3054         if (_sr_ip_free_bind && setsockopt(sock_info->socket, IPPROTO_IP,
3055                                 IP_FREEBIND, (void*)&optval, sizeof(optval)) ==-1) {
3056                 LM_WARN("setsockopt freebind failed: %s\n", strerror(errno));
3057                 /* continue since this is not critical */
3058         }
3059 #endif
3060
3061 #ifdef HAVE_TCP_DEFER_ACCEPT
3062         /* linux only */
3063         if ((optval=cfg_get(tcp, tcp_cfg, defer_accept))){
3064                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_DEFER_ACCEPT,
3065                                         (void*)&optval, sizeof(optval)) ==-1){
3066                         LM_WARN("setsockopt TCP_DEFER_ACCEPT %s\n", strerror(errno));
3067                 /* continue since this is not critical */
3068                 }
3069         }
3070 #endif /* HAVE_TCP_DEFFER_ACCEPT */
3071 #ifdef HAVE_TCP_SYNCNT
3072         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
3073                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_SYNCNT, &optval,
3074                                                 sizeof(optval))<0){
3075                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
3076                 }
3077         }
3078 #endif
3079 #ifdef HAVE_TCP_LINGER2
3080         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
3081                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_LINGER2, &optval,
3082                                                 sizeof(optval))<0){
3083                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
3084                 }
3085         }
3086 #endif
3087         init_sock_keepalive(sock_info->socket);
3088         if (bind(sock_info->socket, &addr->s, sockaddru_len(*addr))==-1){
3089                 LM_ERR("bind(%x, %p, %d) on %s:%d : %s\n",
3090                                 sock_info->socket,  &addr->s, 
3091                                 (unsigned)sockaddru_len(*addr),
3092                                 sock_info->address_str.s,
3093                                 sock_info->port_no,
3094                                 strerror(errno));
3095                 goto error;
3096         }
3097         if (listen(sock_info->socket, TCP_LISTEN_BACKLOG)==-1){
3098                 LM_ERR("listen(%x, %p, %d) on %s: %s\n",
3099                                 sock_info->socket, &addr->s, 
3100                                 (unsigned)sockaddru_len(*addr),
3101                       &n