89e3b56b7cfe45febbe2955c9ad0372d2caf8dfb
[sip-router] / src / core / tcp_main.c
1 /*
2  * Copyright (C) 2001-2003 FhG Fokus
3  *
4  * This file is part of Kamailio, a free SIP server.
5  *
6  * Kamailio is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version
10  *
11  * Kamailio is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19  */
20
21 /** Kamailio core: tcp main/dispatcher and tcp send functions.
22  * @file tcp_main.c
23  * @ingroup core
24  * Module: @ref core
25  */
26
27
28 #ifdef USE_TCP
29
30
31 #define HANDLE_IO_INLINE
32 #include "io_wait.h" /* include first to make sure the needed features are
33                                                 turned on (e.g. _GNU_SOURCE for POLLRDHUP) */
34
35 #include <sys/time.h>
36 #include <sys/types.h>
37 #include <sys/select.h>
38 #include <sys/socket.h>
39 #ifdef HAVE_FILIO_H
40 #include <sys/filio.h> /* needed on solaris 2.x for FIONREAD */
41 #elif defined __OS_solaris
42 #define BSD_COMP  /* needed on older solaris for FIONREAD */
43 #endif /* HAVE_FILIO_H / __OS_solaris */
44 #include <sys/ioctl.h>  /* ioctl() used on write error */
45 #include <arpa/inet.h>  /* for inet_pton() */
46 #include <netinet/in.h>
47 #include <netinet/in_systm.h>
48 #include <netinet/ip.h>
49 #include <netinet/tcp.h>
50 #include <sys/uio.h>  /* writev*/
51 #include <netdb.h>
52 #include <stdlib.h> /*exit() */
53 #include <stdint.h> /* UINT32_MAX */
54
55 #include <unistd.h>
56
57 #include <errno.h>
58 #include <string.h>
59
60 #ifdef HAVE_SELECT
61 #include <sys/select.h>
62 #endif
63 #include <poll.h>
64
65
66 #include "ip_addr.h"
67 #include "pass_fd.h"
68 #include "tcp_conn.h"
69 #include "globals.h"
70 #include "pt.h"
71 #include "locking.h"
72 #include "mem/mem.h"
73 #include "mem/shm_mem.h"
74 #include "timer.h"
75 #include "sr_module.h"
76 #include "tcp_server.h"
77 #include "tcp_init.h"
78 #include "tcp_int_send.h"
79 #include "tcp_stats.h"
80 #include "tcp_ev.h"
81 #include "tsend.h"
82 #include "timer_ticks.h"
83 #include "local_timer.h"
84 #ifdef CORE_TLS
85 #include "tls/tls_server.h"
86 #define tls_loaded() 1
87 #else
88 #include "tls_hooks_init.h"
89 #include "tls_hooks.h"
90 #endif /* CORE_TLS*/
91 #ifdef USE_DST_BLACKLIST
92 #include "dst_blacklist.h"
93 #endif /* USE_DST_BLACKLIST */
94
95 #include "tcp_info.h"
96 #include "tcp_options.h"
97 #include "ut.h"
98 #include "cfg/cfg_struct.h"
99
100 #include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
101
102
103 #ifdef NO_MSG_DONTWAIT
104 #ifndef MSG_DONTWAIT
105 /* should work inside tcp_main */
106 #define MSG_DONTWAIT 0
107 #endif
108 #endif /*NO_MSG_DONTWAIT */
109
110
111 #define TCP_PASS_NEW_CONNECTION_ON_DATA /* don't pass a new connection
112                                                                                    immediately to a child, wait for
113                                                                                    some data on it first */
114 #define TCP_LISTEN_BACKLOG 1024
115 #define SEND_FD_QUEUE /* queue send fd requests on EAGAIN, instead of sending 
116                                                         them immediately */
117 #define TCP_CHILD_NON_BLOCKING 
118 #ifdef SEND_FD_QUEUE
119 #ifndef TCP_CHILD_NON_BLOCKING
120 #define TCP_CHILD_NON_BLOCKING
121 #endif
122 #define MAX_SEND_FD_QUEUE_SIZE  tcp_main_max_fd_no
123 #define SEND_FD_QUEUE_SIZE              128  /* initial size */
124 #define SEND_FD_QUEUE_TIMEOUT   MS_TO_TICKS(2000)  /* 2 s */
125 #endif
126
127 /* minimum interval local_timer_run() is allowed to run, in ticks */
128 #define TCPCONN_TIMEOUT_MIN_RUN 1  /* once per tick */
129 #define TCPCONN_WAIT_TIMEOUT 1 /* 1 tick */
130
131 #ifdef TCP_ASYNC
132 static unsigned int* tcp_total_wq=0;
133 #endif
134
135
/* fd type tags.
 * NOTE(review): presumably the per-fd type recorded when an fd is handed to
 * the io_wait handler -- the uses are outside this part of the file, confirm
 * there. */
enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
                                F_TCPCONN, F_TCPCHILD, F_PROC };
138
139
140 #ifdef TCP_FD_CACHE
141
142 #define TCP_FD_CACHE_SIZE 8
143
144 struct fd_cache_entry{
145         struct tcp_connection* con;
146         int id;
147         int fd;
148 };
149
150
151 static struct fd_cache_entry fd_cache[TCP_FD_CACHE_SIZE];
152 #endif /* TCP_FD_CACHE */
153
154 static int is_tcp_main=0;
155
156
157 enum poll_types tcp_poll_method=0; /* by default choose the best method */
158 int tcp_main_max_fd_no=0;
159 int tcp_max_connections=DEFAULT_TCP_MAX_CONNECTIONS;
160 int tls_max_connections=DEFAULT_TLS_MAX_CONNECTIONS;
161 int tcp_accept_unique=0;
162
163 int tcp_connection_match=TCPCONN_MATCH_DEFAULT;
164
165 static union sockaddr_union tcp_source_ipv4_addr; /* saved bind/srv v4 addr. */
166 static union sockaddr_union* tcp_source_ipv4=0;
167 static union sockaddr_union tcp_source_ipv6_addr; /* saved bind/src v6 addr. */
168 static union sockaddr_union* tcp_source_ipv6=0;
169
170 static int* tcp_connections_no=0; /* current tcp (+tls) open connections */
171 static int* tls_connections_no=0; /* current tls open connections */
172
173 /* connection hash table (after ip&port) , includes also aliases */
174 struct tcp_conn_alias** tcpconn_aliases_hash=0;
175 /* connection hash table (after connection id) */
176 struct tcp_connection** tcpconn_id_hash=0;
177 gen_lock_t* tcpconn_lock=0;
178
179 struct tcp_child* tcp_children=0;
180 static int* connection_id=0; /*  unique for each connection, used for 
181                                                                 quickly finding the corresponding connection
182                                                                 for a reply */
183 int unix_tcp_sock;
184
185 static int tcp_proto_no=-1; /* tcp protocol number as returned by
186                                                            getprotobyname */
187
188 static io_wait_h io_h;
189
190 static struct local_timer tcp_main_ltimer;
191 static ticks_t tcp_main_prev_ticks;
192
193 /* tell if there are tcp workers that should handle only specific socket
194  * - used to optimize the search of least loaded worker for a tcp socket
195  * - 0 - no workers per tcp sockets have been set
196  * - 1 + generic_workers - when there are workers per tcp sockets
197  */
198 static int tcp_sockets_gworkers = 0;
199
200 static ticks_t tcpconn_main_timeout(ticks_t , struct timer_ln* , void* );
201
202 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
203                                                                                 struct ip_addr* l_ip, int l_port,
204                                                                                 int flags);
205
206
207
208 /* sets source address used when opening new sockets and no source is specified
209  *  (by default the address is choosen by the kernel)
210  * Should be used only on init.
211  * returns -1 on error */
212 int tcp_set_src_addr(struct ip_addr* ip)
213 {
214         switch (ip->af){
215                 case AF_INET:
216                         ip_addr2su(&tcp_source_ipv4_addr, ip, 0);
217                         tcp_source_ipv4=&tcp_source_ipv4_addr;
218                         break;
219                 case AF_INET6:
220                         ip_addr2su(&tcp_source_ipv6_addr, ip, 0);
221                         tcp_source_ipv6=&tcp_source_ipv6_addr;
222                         break;
223                 default:
224                         return -1;
225         }
226         return 0;
227 }
228
229
230
/* applies the configured tcp keepalive settings to socket s.
 *  SO_KEEPALIVE is turned on if the keepalive cfg value is non-zero; the
 *  probe interval, idle time and probe count are set only if the
 *  corresponding cfg value is non-zero. Each option is compiled in only if
 *  its HAVE_* macro is defined.
 * returns 0 on success and -1 only if enabling SO_KEEPALIVE failed (failures
 *  to set the other options are just logged) */
static inline int init_sock_keepalive(int s)
{
	int val;

#ifdef HAVE_SO_KEEPALIVE
	if (cfg_get(tcp, tcp_cfg, keepalive)!=0){
		val=1;
		if (setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val))<0){
			LM_WARN("failed to enable SO_KEEPALIVE: %s\n", strerror(errno));
			return -1;
		}
	}
#endif
#ifdef HAVE_TCP_KEEPINTVL
	val=cfg_get(tcp, tcp_cfg, keepintvl);
	if (val!=0 &&
			setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val))<0){
		LM_WARN("failed to set keepalive probes interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPIDLE
	val=cfg_get(tcp, tcp_cfg, keepidle);
	if (val!=0 &&
			setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val))<0){
		LM_WARN("failed to set keepalive idle interval: %s\n", strerror(errno));
	}
#endif
#ifdef HAVE_TCP_KEEPCNT
	val=cfg_get(tcp, tcp_cfg, keepcnt);
	if (val!=0 &&
			setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val))<0){
		LM_WARN("failed to set maximum keepalive count: %s\n", strerror(errno));
	}
#endif
	return 0;
}
271
272
273
274 /* set all socket/fd options for new sockets (e.g. before connect): 
275  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
276  *
277  * return -1 on error */
278 static int init_sock_opt(int s, int af)
279 {
280         int flags;
281         int optval;
282         
283 #ifdef DISABLE_NAGLE
284         flags=1;
285         if ( (tcp_proto_no!=-1) && (setsockopt(s, tcp_proto_no , TCP_NODELAY,
286                                         &flags, sizeof(flags))<0) ){
287                 LM_WARN("could not disable Nagle: %s\n", strerror(errno));
288         }
289 #endif
290         /* tos*/
291         optval = tos;
292         if(af==AF_INET){
293                 if (setsockopt(s, IPPROTO_IP, IP_TOS, (void*)&optval,
294                                         sizeof(optval)) ==-1){
295                         LM_WARN("setsockopt tos: %s\n", strerror(errno));
296                         /* continue since this is not critical */
297                 }
298         } else if(af==AF_INET6){
299                 if (setsockopt(s, IPPROTO_IPV6, IPV6_TCLASS,
300                                         (void*)&optval, sizeof(optval)) ==-1) {
301                         LM_WARN("setsockopt v6 tos: %s\n", strerror(errno));
302                         /* continue since this is not critical */
303                 }
304         }
305
306 #if  !defined(TCP_DONT_REUSEADDR) 
307         optval=1;
308         if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
309                                                 (void*)&optval, sizeof(optval))==-1){
310                 LM_ERR("setsockopt SO_REUSEADDR %s\n", strerror(errno));
311                 /* continue, not critical */
312         }
313 #endif /* !TCP_DONT_REUSEADDR */
314
315 #ifdef SO_REUSEPORT
316         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
317                 if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT,
318                                 (void*)&optval, sizeof(optval))==-1) {
319                         LM_ERR("setsockopt %s\n", strerror(errno));
320                 }
321         }
322 #endif
323
324 #ifdef HAVE_TCP_SYNCNT
325         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
326                 if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
327                                                 sizeof(optval))<0){
328                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
329                 }
330         }
331 #endif
332 #ifdef HAVE_TCP_LINGER2
333         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
334                 if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
335                                                 sizeof(optval))<0){
336                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
337                 }
338         }
339 #endif
340 #ifdef HAVE_TCP_QUICKACK
341         if (cfg_get(tcp, tcp_cfg, delayed_ack)){
342                 optval=0; /* reset quick ack => delayed ack */
343                 if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
344                                                 sizeof(optval))<0){
345                         LM_WARN("failed to reset TCP_QUICKACK: %s\n", strerror(errno));
346                 }
347         }
348 #endif /* HAVE_TCP_QUICKACK */
349         init_sock_keepalive(s);
350         
351         /* non-blocking */
352         flags=fcntl(s, F_GETFL);
353         if (flags==-1){
354                 LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
355                 goto error;
356         }
357         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
358                 LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
359                 goto error;
360         }
361         return 0;
362 error:
363         return -1;
364 }
365
366
367
/* set all socket/fd options for "accepted" sockets
 *  only nonblocking is set since the rest is inherited from the
 *  "parent" (listening) socket
 *  Note: setting O_NONBLOCK is required on linux but it's not needed on
 *        BSD and possibly solaris (where the flag is inherited from the
 *        parent socket). However since there is no standard document
 *        requiring a specific behaviour in this case it's safer to always set
 *        it (at least for now)  --andrei
 *  TODO: check on which OSes  O_NONBLOCK is inherited and make this
 *        function a nop.
 *
 * return 0 on success, -1 on error (one of the fcntl() calls failed) */
static int init_sock_opt_accept(int s)
{
	int fl; /* current fd status flags */

	/* non-blocking: read the current flags and add O_NONBLOCK to them */
	fl=fcntl(s, F_GETFL);
	if (fl==-1){
		LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	if (fcntl(s, F_SETFL, fl|O_NONBLOCK)==-1){
		LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
		return -1;
	}
	return 0;
}
398
399
400
/** close a socket, handling errno.
 * close() is called exactly once. On EINTR the call is NOT repeated: on
 * linux and the *BSDs the descriptor is already released when close()
 * returns EINTR, so a second close() would either fail with EBADF (and be
 * reported as a bogus error) or close a descriptor that was re-allocated
 * in the meantime. EINTR is therefore treated as success.
 * Expected errors are filtered too (return success if close() failed because
 * EPIPE, ECONNRESET a.s.o). Note that this happens on *BSDs (on linux close()
 * does not fail for socket level errors).
 * @param s - open valid socket (a negative value is ignored and 0 is
 *            returned).
 * @return - 0 on success, < 0 on error (whatever close() returns). On error
 *           errno is set.
 */
static int tcp_safe_close(int s)
{
	int ret;

	if(s<0)
		return 0;

	ret = close(s);
	if (ret == 0)
		return 0;
	switch(errno) {
		case EINTR:
			/* the fd is gone even if close() was interrupted
			   => do not retry, see above */
		case EPIPE:
		case ENOTCONN:
		case ECONNRESET:
		case ECONNREFUSED:
		case ENETUNREACH:
		case EHOSTUNREACH:
			/* on *BSD we really get these errors at close() time
			   => ignore them */
			return 0;
		default:
			break;
	}
	return ret;
}
438
439
440
441 /* blocking connect on a non-blocking fd; it will timeout after
442  * tcp_connect_timeout 
443  * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
444  * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
445  */
446 static int tcp_blocking_connect(int fd, int type, snd_flags_t* send_flags,
447                                                                 const struct sockaddr *servaddr,
448                                                                 socklen_t addrlen)
449 {
450         int n;
451 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
452         fd_set sel_set;
453         fd_set orig_set;
454         struct timeval timeout;
455 #else
456         struct pollfd pf;
457 #endif
458         int elapsed;
459         int to;
460         int ticks;
461         int err;
462         unsigned int err_len;
463         int poll_err;
464         
465         poll_err=0;
466         to=cfg_get(tcp, tcp_cfg, connect_timeout_s);
467         ticks=get_ticks();
468 again:
469         n=connect(fd, servaddr, addrlen);
470         if (n==-1){
471                 if (errno==EINTR){
472                         elapsed=(get_ticks()-ticks)*TIMER_TICK;
473                         if (elapsed<to)         goto again;
474                         else goto error_timeout;
475                 }
476                 if (errno!=EINPROGRESS && errno!=EALREADY){
477                         goto error_errno;
478                 }
479         }else goto end;
480         
481         /* poll/select loop */
482 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
483                 FD_ZERO(&orig_set);
484                 FD_SET(fd, &orig_set);
485 #else
486                 pf.fd=fd;
487                 pf.events=POLLOUT;
488 #endif
489         while(1){
490                 elapsed=(get_ticks()-ticks)*TIMER_TICK;
491                 if (elapsed>=to)
492                         goto error_timeout;
493 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
494                 sel_set=orig_set;
495                 timeout.tv_sec=to-elapsed;
496                 timeout.tv_usec=0;
497                 n=select(fd+1, 0, &sel_set, 0, &timeout);
498 #else
499                 n=poll(&pf, 1, (to-elapsed)*1000);
500 #endif
501                 if (n<0){
502                         if (errno==EINTR) continue;
503                         LM_ERR("%s: poll/select failed: (%d) %s\n",
504                                         su2a((union sockaddr_union*)servaddr, addrlen),
505                                         errno, strerror(errno));
506                         goto error;
507                 }else if (n==0) /* timeout */ continue;
508 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
509                 if (FD_ISSET(fd, &sel_set))
510 #else
511                 if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
512                         LM_ERR("%s: poll error: flags %x\n",
513                                         su2a((union sockaddr_union*)servaddr, addrlen),
514                                         pf.revents);
515                         poll_err=1;
516                 }
517 #endif
518                 {
519                         err_len=sizeof(err);
520                         getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
521                         if ((err==0) && (poll_err==0)) goto end;
522                         if (err!=EINPROGRESS && err!=EALREADY){
523                                 LM_ERR("%s: SO_ERROR (%d) %s\n",
524                                                 su2a((union sockaddr_union*)servaddr, addrlen),
525                                                 err, strerror(err));
526                                 errno=err;
527                                 goto error_errno;
528                         }
529                 }
530         }
531 error_errno:
532         switch(errno){
533                 case ENETUNREACH:
534                 case EHOSTUNREACH:
535 #ifdef USE_DST_BLACKLIST
536                         dst_blacklist_su(BLST_ERR_CONNECT, type,
537                                                          (union sockaddr_union*)servaddr, send_flags, 0);
538 #endif /* USE_DST_BLACKLIST */
539                         TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0,
540                                                         (union sockaddr_union*)servaddr, type);
541                         break;
542                 case ETIMEDOUT:
543 #ifdef USE_DST_BLACKLIST
544                         dst_blacklist_su(BLST_ERR_CONNECT, type,
545                                                          (union sockaddr_union*)servaddr, send_flags, 0);
546 #endif /* USE_DST_BLACKLIST */
547                         TCP_EV_CONNECT_TIMEOUT(errno, 0, 0,
548                                                         (union sockaddr_union*)servaddr, type);
549                         break;
550                 case ECONNREFUSED:
551                 case ECONNRESET:
552 #ifdef USE_DST_BLACKLIST
553                         dst_blacklist_su(BLST_ERR_CONNECT, type,
554                                                          (union sockaddr_union*)servaddr, send_flags, 0);
555 #endif /* USE_DST_BLACKLIST */
556                         TCP_EV_CONNECT_RST(errno, 0, 0,
557                                                         (union sockaddr_union*)servaddr, type);
558                         break;
559                 case EAGAIN: /* not posix, but supported on linux and bsd */
560                         TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0,
561                                                         (union sockaddr_union*)servaddr, type);
562                         break;
563                 default:
564                         TCP_EV_CONNECT_ERR(errno, 0, 0,
565                                                                 (union sockaddr_union*)servaddr, type);
566         }
567         LM_ERR("%s: (%d) %s\n",
568                         su2a((union sockaddr_union*)servaddr, addrlen),
569                         errno, strerror(errno));
570         goto error;
571 error_timeout:
572         /* timeout */
573 #ifdef USE_DST_BLACKLIST
574         dst_blacklist_su(BLST_ERR_CONNECT, type,
575                                                 (union sockaddr_union*)servaddr, send_flags, 0);
576 #endif /* USE_DST_BLACKLIST */
577         TCP_EV_CONNECT_TIMEOUT(0, 0, 0, (union sockaddr_union*)servaddr, type);
578         LM_ERR("%s: timeout %d s elapsed from %d s\n",
579                                 su2a((union sockaddr_union*)servaddr, addrlen),
580                                 elapsed, cfg_get(tcp, tcp_cfg, connect_timeout_s));
581 error:
582         TCP_STATS_CONNECT_FAILED();
583         return -1;
584 end:
585         return 0;
586 }
587
588
589
590 #ifdef TCP_ASYNC
591
592
/* unsafe version: true if the write buffer queue of con has no buffers
 * (wbuf_q.first is 0); the macro itself does no locking */
#define _wbufq_empty(con) ((con)->wbuf_q.first==0)
/* unsafe version: true if the write buffer queue of con has at least one
 * buffer (wbuf_q.first is not 0); the macro itself does no locking */
#define _wbufq_non_empty(con) ((con)->wbuf_q.first!=0)
597
598
599 /* unsafe version, call while holding the connection write lock */
600 inline static int _wbufq_add(struct  tcp_connection* c, const char* data, 
601                                                         unsigned int size)
602 {
603         struct tcp_wbuffer_queue* q;
604         struct tcp_wbuffer* wb;
605         unsigned int last_free;
606         unsigned int wb_size;
607         unsigned int crt_size;
608         ticks_t t;
609         
610         q=&c->wbuf_q;
611         t=get_ticks_raw();
612         if (unlikely(   ((q->queued+size)>cfg_get(tcp, tcp_cfg, tcpconn_wq_max)) ||
613                                         ((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max)) ||
614                                         (q->first &&
615                                         TICKS_LT(q->wr_timeout, t)) )){
616                 LM_ERR("(%u bytes): write queue full or timeout "
617                                         " (%u, total %u, last write %d s ago)\n",
618                                         size, q->queued, *tcp_total_wq,
619                                         TICKS_TO_S(t-(q->wr_timeout-
620                                                                 cfg_get(tcp, tcp_cfg, send_timeout))));
621                 if (q->first && TICKS_LT(q->wr_timeout, t)){
622                         if (unlikely(c->state==S_CONN_CONNECT)){
623 #ifdef USE_DST_BLACKLIST
624                                 (void)dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
625                                                                                 &c->rcv.src_su, &c->send_flags, 0);
626 #endif /* USE_DST_BLACKLIST */
627                                 TCP_EV_CONNECT_TIMEOUT(0, TCP_LADDR(c), TCP_LPORT(c),
628                                                                                         TCP_PSU(c), TCP_PROTO(c));
629                                 TCP_STATS_CONNECT_FAILED();
630                         }else{
631 #ifdef USE_DST_BLACKLIST
632                                 (void)dst_blacklist_su( BLST_ERR_SEND, c->rcv.proto,
633                                                                         &c->rcv.src_su, &c->send_flags, 0);
634 #endif /* USE_DST_BLACKLIST */
635                                 TCP_EV_SEND_TIMEOUT(0, &c->rcv);
636                                 TCP_STATS_SEND_TIMEOUT();
637                         }
638                 }else{
639                         /* if it's not a timeout => queue full */
640                         TCP_EV_SENDQ_FULL(0, &c->rcv);
641                         TCP_STATS_SENDQ_FULL();
642                 }
643                 goto error;
644         }
645         
646         if (unlikely(q->last==0)){
647                 wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
648                 wb=shm_malloc(sizeof(*wb)+wb_size-1);
649                 if (unlikely(wb==0)) {
650                         SHM_MEM_ERROR;
651                         goto error;
652                 }
653                 wb->b_size=wb_size;
654                 wb->next=0;
655                 q->last=wb;
656                 q->first=wb;
657                 q->last_used=0;
658                 q->offset=0;
659                 q->wr_timeout=get_ticks_raw()+
660                         ((c->state==S_CONN_CONNECT)?
661                                         S_TO_TICKS(cfg_get(tcp, tcp_cfg, connect_timeout_s)):
662                                         cfg_get(tcp, tcp_cfg, send_timeout));
663         }else{
664                 wb=q->last;
665         }
666         
667         while(size){
668                 last_free=wb->b_size-q->last_used;
669                 if (last_free==0){
670                         wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
671                         wb=shm_malloc(sizeof(*wb)+wb_size-1);
672                         if (unlikely(wb==0)) {
673                                 SHM_MEM_ERROR;
674                                 goto error;
675                         }
676                         wb->b_size=wb_size;
677                         wb->next=0;
678                         q->last->next=wb;
679                         q->last=wb;
680                         q->last_used=0;
681                         last_free=wb->b_size;
682                 }
683                 crt_size=MIN_unsigned(last_free, size);
684                 memcpy(wb->buf+q->last_used, data, crt_size);
685                 q->last_used+=crt_size;
686                 size-=crt_size;
687                 data+=crt_size;
688                 q->queued+=crt_size;
689                 atomic_add_int((int*)tcp_total_wq, crt_size);
690         }
691         return 0;
692 error:
693         return -1;
694 }
695
696
697
698 /* unsafe version, call while holding the connection write lock
699  * inserts data at the beginning, it ignores the max queue size checks and
700  * the timeout (use sparingly)
701  * Note: it should never be called on a write buffer after wbufq_run() */
702 inline static int _wbufq_insert(struct  tcp_connection* c, const char* data, 
703                                                         unsigned int size)
704 {
705         struct tcp_wbuffer_queue* q;
706         struct tcp_wbuffer* wb;
707         
708         q=&c->wbuf_q;
709         if (likely(q->first==0)) /* if empty, use wbufq_add */
710                 return _wbufq_add(c, data, size);
711         
712         if (unlikely((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max))){
713                 LM_ERR("(%d bytes): write queue full"
714                                         " (%d, total %d, last write %d s ago)\n",
715                                         size, q->queued, *tcp_total_wq,
716                                         TICKS_TO_S(get_ticks_raw()-q->wr_timeout-
717                                                                         cfg_get(tcp, tcp_cfg, send_timeout)));
718                 goto error;
719         }
720         if (unlikely(q->offset)){
721                 LM_CRIT("non-null offset %d (bad call, should"
722                                 "never be called after the wbufq_run())\n", q->offset);
723                 goto error;
724         }
725         if ((q->first==q->last) && ((q->last->b_size-q->last_used)>=size)){
726                 /* one block with enough space in it for size bytes */
727                 memmove(q->first->buf+size, q->first->buf, q->last_used);
728                 memcpy(q->first->buf, data, size);
729                 q->last_used+=size;
730         }else{
731                 /* create a size bytes block directly */
732                 wb=shm_malloc(sizeof(*wb)+size-1);
733                 if (unlikely(wb==0)) {
734                         SHM_MEM_ERROR;
735                         goto error;
736                 }
737                 wb->b_size=size;
738                 /* insert it */
739                 wb->next=q->first;
740                 q->first=wb;
741                 memcpy(wb->buf, data, size);
742         }
743         
744         q->queued+=size;
745         atomic_add_int((int*)tcp_total_wq, size);
746         return 0;
747 error:
748         return -1;
749 }
750
751
752
753 /* unsafe version, call while holding the connection write lock */
754 inline static void _wbufq_destroy( struct  tcp_wbuffer_queue* q)
755 {
756         struct tcp_wbuffer* wb;
757         struct tcp_wbuffer* next_wb;
758         int unqueued;
759         
760         unqueued=0;
761         if (likely(q->first)){
762                 wb=q->first;
763                 do{
764                         next_wb=wb->next;
765                         unqueued+=(wb==q->last)?q->last_used:wb->b_size;
766                         if (wb==q->first)
767                                 unqueued-=q->offset;
768                         shm_free(wb);
769                         wb=next_wb;
770                 }while(wb);
771         }
772         memset(q, 0, sizeof(*q));
773         atomic_add_int((int*)tcp_total_wq, -unqueued);
774 }
775
776
777
/* tries to empty the queue  (safe version, c->write_lock must not be held)
 * returns -1 on error, bytes written on success (>=0) 
 * if the whole queue is emptied => sets *empty */
781 inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
782 {
783         struct tcp_wbuffer_queue* q;
784         struct tcp_wbuffer* wb;
785         int n;
786         int ret;
787         int block_size;
788         char* buf;
789         
790         *empty=0;
791         ret=0;
792         lock_get(&c->write_lock);
793         q=&c->wbuf_q;
794         while(q->first){
795                 block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
796                                                 q->offset;
797                 buf=q->first->buf+q->offset;
798                 n=_tcpconn_write_nb(fd, c, buf, block_size);
799                 if (likely(n>0)){
800                         ret+=n;
801                         if (likely(n==block_size)){
802                                 wb=q->first;
803                                 q->first=q->first->next; 
804                                 shm_free(wb);
805                                 q->offset=0;
806                                 q->queued-=block_size;
807                                 atomic_add_int((int*)tcp_total_wq, -block_size);
808                         }else{
809                                 q->offset+=n;
810                                 q->queued-=n;
811                                 atomic_add_int((int*)tcp_total_wq, -n);
812                                 break;
813                         }
814                 }else{
815                         if (n<0){
816                                 /* EINTR is handled inside _tcpconn_write_nb */
817                                 if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
818                                         if (unlikely(c->state==S_CONN_CONNECT)){
819                                                 switch(errno){
820                                                         case ENETUNREACH:
821                                                         case EHOSTUNREACH: /* not posix for send() */
822 #ifdef USE_DST_BLACKLIST
823                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
824                                                                                                         c->rcv.proto,
825                                                                                                         &c->rcv.src_su,
826                                                                                                         &c->send_flags, 0);
827 #endif /* USE_DST_BLACKLIST */
828                                                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
829                                                                                                         TCP_LPORT(c), TCP_PSU(c),
830                                                                                                         TCP_PROTO(c));
831                                                                 break;
832                                                         case ECONNREFUSED:
833                                                         case ECONNRESET:
834 #ifdef USE_DST_BLACKLIST
835                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
836                                                                                                         c->rcv.proto,
837                                                                                                         &c->rcv.src_su,
838                                                                                                         &c->send_flags, 0);
839 #endif /* USE_DST_BLACKLIST */
840                                                                 TCP_EV_CONNECT_RST(0, TCP_LADDR(c),
841                                                                                                         TCP_LPORT(c), TCP_PSU(c),
842                                                                                                         TCP_PROTO(c));
843                                                                 break;
844                                                         default:
845                                                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
846                                                                                                         TCP_LPORT(c), TCP_PSU(c),
847                                                                                                         TCP_PROTO(c));
848                                                 }
849                                                 TCP_STATS_CONNECT_FAILED();
850                                         }else{
851                                                 switch(errno){
852                                                         case ECONNREFUSED:
853                                                         case ECONNRESET:
854                                                                 TCP_STATS_CON_RESET();
855                                                                 /* no break */
856                                                         case ENETUNREACH:
857                                                         case EHOSTUNREACH: /* not posix for send() */
858 #ifdef USE_DST_BLACKLIST
859                                                                 dst_blacklist_su(BLST_ERR_SEND,
860                                                                                                         c->rcv.proto,
861                                                                                                         &c->rcv.src_su,
862                                                                                                         &c->send_flags, 0);
863 #endif /* USE_DST_BLACKLIST */
864                                                                 break;
865                                                 }
866                                         }
867                                         ret=-1;
868                                         LM_ERR("%s [%d]\n", strerror(errno), errno);
869                                 }
870                         }
871                         break;
872                 }
873         }
874         if (likely(q->first==0)){
875                 q->last=0;
876                 q->last_used=0;
877                 q->offset=0;
878                 *empty=1;
879         }
880         lock_release(&c->write_lock);
881         if (likely(ret>0)){
882                 q->wr_timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, send_timeout);
883                 if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
884                         TCP_STATS_ESTABLISHED(c->state);
885                         c->state=S_CONN_OK;
886                 }
887         }
888         return ret;
889 }
890
891 #endif /* TCP_ASYNC */
892
893
894
895 /* Attempt to extract real connection information from an upstream load
896  * balancer or reverse proxy. This should be called right after accept()ing the
897  * connection, and before TLS negotiation.
898  *
899  * Returns:
900  *    -1 on parsing error (connection should be closed)
901  *    0 on parser success, and connection information was extracted
902  *    1 on parser success, but no connection information was provided by the
903  *      upstream load balancer or reverse proxy.
904  */
905 int tcpconn_read_haproxy(struct tcp_connection *c) {
906         int bytes, retval = 0;
907         uint32_t size, port;
908         char *p, *end;
909         struct ip_addr *src_ip, *dst_ip;
910
911         const char v2sig[12] = "\x0D\x0A\x0D\x0A\x00\x0D\x0A\x51\x55\x49\x54\x0A";
912
913         // proxy header union
914         union {
915                 // v1 struct
916                 struct {
917                         char line[108];
918                 } v1;
919
920                 // v2 struct
921                 struct {
922                         uint8_t sig[12];
923                         uint8_t ver_cmd;
924                         uint8_t fam;
925                         uint16_t len;
926
927                         union {
928                                 struct { /* for TCP/UDP over IPv4, len = 12 */
929                                         uint32_t src_addr;
930                                         uint32_t dst_addr;
931                                         uint16_t src_port;
932                                         uint16_t dst_port;
933                                 } ip4;
934
935                                 struct { /* for TCP/UDP over IPv6, len = 36 */
936                                          uint8_t  src_addr[16];
937                                          uint8_t  dst_addr[16];
938                                          uint16_t src_port;
939                                          uint16_t dst_port;
940                                 } ip6;
941
942                                 struct { /* for AF_UNIX sockets, len = 216 */
943                                          uint8_t src_addr[108];
944                                          uint8_t dst_addr[108];
945                                 } unx;
946                         } addr;
947                 } v2;
948
949         } hdr;
950
951         do {
952                 bytes = recv(c->s, &hdr, sizeof(hdr), MSG_PEEK);
953         } while (bytes == -1 && (errno == EINTR || errno == EAGAIN));
954
955         /* copy original tunnel address details */
956         memcpy(&c->cinfo.src_ip, &c->rcv.src_ip, sizeof(ip_addr_t));
957         memcpy(&c->cinfo.dst_ip, &c->rcv.dst_ip, sizeof(ip_addr_t));
958         c->cinfo.src_port = c->rcv.src_port;
959         c->cinfo.dst_port = c->rcv.dst_port;
960         c->cinfo.proto = (int)c->rcv.proto;
961         c->cinfo.csocket = c->rcv.bind_address;
962
963         src_ip = &c->rcv.src_ip;
964         dst_ip = &c->rcv.dst_ip;
965
966         if (bytes >= 16 && memcmp(&hdr.v2, v2sig, 12) == 0 &&
967                 (hdr.v2.ver_cmd & 0xF0) == 0x20) {
968                 LM_DBG("received PROXY protocol v2 header\n");
969                 size = 16 + ntohs(hdr.v2.len);
970
971                 if (bytes < size) {
972                         return -1; /* truncated or too large header */
973                 }
974
975                 switch (hdr.v2.ver_cmd & 0xF) {
976                         case 0x01: /* PROXY command */
977                                 switch (hdr.v2.fam) {
978                                         case 0x11: /* TCPv4 */
979                                                 src_ip->af = AF_INET;
980                                                 src_ip->len = 4;
981                                                 src_ip->u.addr32[0] =
982                                                         hdr.v2.addr.ip4.src_addr;
983                                                 c->rcv.src_port =
984                                                         hdr.v2.addr.ip4.src_port;
985
986                                                 dst_ip->af = AF_INET;
987                                                 dst_ip->len = 4;
988                                                 dst_ip->u.addr32[0] =
989                                                         hdr.v2.addr.ip4.dst_addr;
990                                                 c->rcv.dst_port =
991                                                         hdr.v2.addr.ip4.dst_port;
992
993                                                 goto done;
994
995                                         case 0x21: /* TCPv6 */
996                                                 src_ip->af = AF_INET6;
997                                                 src_ip->len = 16;
998                                                 memcpy(src_ip->u.addr,
999                                                         hdr.v2.addr.ip6.src_addr, 16);
1000                                                 c->rcv.src_port =
1001                                                         hdr.v2.addr.ip6.src_port;
1002
1003                                                 dst_ip->af = AF_INET6;
1004                                                 dst_ip->len = 16;
1005                                                 memcpy(dst_ip->u.addr,
1006                                                         hdr.v2.addr.ip6.src_addr, 16);
1007                                                 c->rcv.dst_port =
1008                                                         hdr.v2.addr.ip6.dst_port;
1009
1010                                                 goto done;
1011
1012                                         default: /* unsupported protocol */
1013                                                 return -1;
1014                                 }
1015
1016                         case 0x00: /* LOCAL command */
1017                                 retval = 1; /* keep local connection address for LOCAL */
1018                                 goto done;
1019
1020                         default:
1021                                 return -1; /* not a supported command */
1022                 }
1023         }
1024         else if (bytes >= 8 && memcmp(hdr.v1.line, "PROXY", 5) == 0) {
1025                 LM_DBG("received PROXY protocol v1 header\n");
1026                 end = memchr(hdr.v1.line, '\r', bytes - 1);
1027                 if (!end || end[1] != '\n') {
1028                         return -1; /* partial or invalid header */
1029                 }
1030                 *end = '\0'; /* terminate the string to ease parsing */
1031                 size = end + 2 - hdr.v1.line;
1032                 p = hdr.v1.line + 5;
1033
1034                 if (strncmp(p, " TCP", 4) == 0) {
1035                         switch (p[4]) {
1036                                 case '4':
1037                                         src_ip->af  = dst_ip->af  = AF_INET;
1038                                         src_ip->len = dst_ip->len = 4;
1039                                         break;
1040                                 case '6':
1041                                         src_ip->af  = dst_ip->af  = AF_INET6;
1042                                         src_ip->len = dst_ip->len = 16;
1043                                         break;
1044                                 default:
1045                                         return -1; /* unknown TCP version */
1046                         }
1047
1048                         if (p[5] != ' ') {
1049                                 return -1; /* misformatted header */
1050                         }
1051                         p += 6; /* skip over the already-parsed bytes */
1052
1053                         /* Parse the source IP address */
1054                         end = strchr(p, ' ');
1055                         if (!end) {
1056                                 return -1; /* truncated header */
1057                         }
1058                         *end = '\0'; /* mark the end of the IP address */
1059                         if (inet_pton(src_ip->af, p, src_ip->u.addr) != 1) {
1060                                 return -1; /* missing IP address */
1061                         }
1062                         p = end + 1;
1063
1064                         /* Parse the destination IP address */
1065                         end = strchr(p, ' ');
1066                         if (!end) {
1067                                 return -1;
1068                         }
1069                         *end = '\0'; /* mark the end of the IP address */
1070                         if (inet_pton(dst_ip->af, p, dst_ip->u.addr) != 1) {
1071                                 return -1;
1072                         }
1073                         p = end + 1;
1074
1075                         /* Parse the source port */
1076                         port = strtoul(p, &end, 10);
1077                         if (port == UINT32_MAX || port == 0 || port >= (1 << 16)) {
1078                                 return -1; /* invalid port number */
1079                         }
1080                         c->rcv.src_port = port;
1081
1082                         if (*end != ' ') {
1083                                 return -1; /* invalid header */
1084                         }
1085                         p = end + 1;
1086
1087                         /* Parse the destination port */
1088                         port = strtoul(p, NULL, 10);
1089                         if (port == UINT32_MAX || port == 0 || port >= (1 << 16)) {
1090                                 return -1; /* invalid port number */
1091                         }
1092                         c->rcv.dst_port = port;
1093
1094                         goto done;
1095                 }
1096                 else if (strncmp(p, " UNKNOWN", 8) == 0) {
1097                         /* We know that the sender speaks the correct PROXY protocol with the
1098                          * appropriate version, and we SHOULD accept the connection and use the
1099                          * real connection's parameters as if there were no PROXY protocol header
1100                          * on the wire.
1101                          */
1102                         retval = 1; /* PROXY protocol parsed, but no IP override */
1103                         goto done;
1104                 }
1105                 else {
1106                         return -1; /* invalid header */
1107                 }
1108         } else if (bytes == 0) {
1109                 return 1; /* EOF? Return "no IP change" in any case */
1110         } else {
1111                 /* not haproxy protocol */
1112                 return 2;
1113         }
1114
1115 done:
1116         /* we need to consume the appropriate amount of data from the socket */
1117         do {
1118                 bytes = recv(c->s, &hdr, size, 0);
1119         } while (bytes == -1 && errno == EINTR);
1120
1121         return (bytes >= 0) ? retval : -1;
1122 }
1123
1124 struct tcp_connection* tcpconn_new(int sock, union sockaddr_union* su,
1125                                                                         union sockaddr_union* local_addr,
1126                                                                         struct socket_info* ba, int type,
1127                                                                         int state)
1128 {
1129         struct tcp_connection *c;
1130         int rd_b_size, ret;
1131
1132         rd_b_size=cfg_get(tcp, tcp_cfg, rd_buf_size);
1133         c=shm_malloc(sizeof(struct tcp_connection) + rd_b_size);
1134         if (c==0){
1135                 SHM_MEM_ERROR;
1136                 goto error;
1137         }
1138         memset(c, 0, sizeof(struct tcp_connection)); /* zero init (skip rd buf)*/
1139         c->s=sock;
1140         c->fd=-1; /* not initialized */
1141         if (lock_init(&c->write_lock)==0){
1142                 LM_ERR("init lock failed\n");
1143                 goto error;
1144         }
1145
1146         c->rcv.src_su=*su;
1147
1148         su2ip_addr(&c->rcv.src_ip, su);
1149         c->rcv.src_port=su_getport(su);
1150         if (likely(local_addr)){
1151                 su2ip_addr(&c->rcv.dst_ip, local_addr);
1152                 c->rcv.dst_port=su_getport(local_addr);
1153         }else if (ba){
1154                 c->rcv.dst_ip=ba->address;
1155                 c->rcv.dst_port=ba->port_no;
1156         }
1157         c->rcv.bind_address=ba;
1158
1159         atomic_set(&c->refcnt, 0);
1160         local_timer_init(&c->timer, tcpconn_main_timeout, c, 0);
1161
1162         if (unlikely(ksr_tcp_accept_haproxy && state == S_CONN_ACCEPT)) {
1163                 ret = tcpconn_read_haproxy(c);
1164                 if (ret == -1) {
1165                         LM_ERR("invalid PROXY protocol header\n");
1166                         goto error;
1167                 } else if (ret == 1) {
1168                         LM_DBG("PROXY protocol did not override IP addresses\n");
1169                 } else if (ret == 2) {
1170                         LM_DBG("PROXY protocol header not found\n");
1171                 }
1172         }
1173         print_ip("tcpconn_new: new tcp connection: ", &c->rcv.src_ip, "\n");
1174         LM_DBG("on port %d, type %d, socket %d\n", c->rcv.src_port, type, sock);
1175         init_tcp_req(&c->req, (char*)c+sizeof(struct tcp_connection), rd_b_size);
1176         c->id=(*connection_id)++;
1177         c->rcv.proto_reserved1=0; /* this will be filled before receive_message*/
1178         c->rcv.proto_reserved2=0;
1179         c->state=state;
1180         c->extra_data=0;
1181 #ifdef USE_TLS
1182         if (type==PROTO_TLS){
1183                 if (tls_tcpconn_init(c, sock)==-1) goto error;
1184         }else
1185 #endif /* USE_TLS*/
1186         {
1187                 c->type=PROTO_TCP;
1188                 c->rcv.proto=PROTO_TCP;
1189                 c->timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, con_lifetime);
1190                 c->lifetime = cfg_get(tcp, tcp_cfg, con_lifetime);
1191         }
1192
1193         return c;
1194
1195 error:
1196         if (c) shm_free(c);
1197         return 0;
1198 }
1199
1200
1201
1202 /* do the actual connect, set sock. options a.s.o
1203  * returns socket on success, -1 on error
1204  * sets also *res_local_addr, res_si and state (S_CONN_CONNECT for an
1205  * unfinished connect and S_CONN_OK for a finished one)*/
1206 inline static int tcp_do_connect(       union sockaddr_union* server,
1207                                                                         union sockaddr_union* from,
1208                                                                         int type,
1209                                                                         snd_flags_t* send_flags,
1210                                                                         union sockaddr_union* res_local_addr,
1211                                                                         struct socket_info** res_si,
1212                                                                         enum tcp_conn_states *state
1213                                                                         )
1214 {
1215         int s;
1216         union sockaddr_union my_name;
1217         socklen_t my_name_len;
1218         struct ip_addr ip;
1219 #ifdef TCP_ASYNC
1220         int n;
1221 #endif /* TCP_ASYNC */
1222
1223         s=socket(AF2PF(server->s.sa_family), SOCK_STREAM, 0);
1224         if (unlikely(s==-1)){
1225                 LM_ERR("%s: socket: (%d) %s\n",
1226                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1227                 goto error;
1228         }
1229         if (init_sock_opt(s, server->s.sa_family)<0){
1230                 LM_ERR("%s: init_sock_opt failed\n",
1231                                         su2a(server, sizeof(*server)));
1232                 goto error;
1233         }
1234         
1235         if (unlikely(from && bind(s, &from->s, sockaddru_len(*from)) != 0)){
1236                 LM_WARN("binding to source address %s failed: %s [%d]\n",
1237                                         su2a(from, sizeof(*from)),
1238                                         strerror(errno), errno);
1239         }
1240         *state=S_CONN_OK;
1241 #ifdef TCP_ASYNC
1242         if (likely(cfg_get(tcp, tcp_cfg, async))){
1243 again:
1244                 n=connect(s, &server->s, sockaddru_len(*server));
1245                 if (likely(n==-1)){ /*non-blocking => most probable EINPROGRESS*/
1246                         if (likely(errno==EINPROGRESS))
1247                                 *state=S_CONN_CONNECT;
1248                         else if (errno==EINTR) goto again;
1249                         else if (errno!=EALREADY){
1250                                 switch(errno){
1251                                         case ENETUNREACH:
1252                                         case EHOSTUNREACH:
1253 #ifdef USE_DST_BLACKLIST
1254                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1255                                                                                         send_flags, 0);
1256 #endif /* USE_DST_BLACKLIST */
1257                                                 TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0, server, type);
1258                                                 break;
1259                                         case ETIMEDOUT:
1260 #ifdef USE_DST_BLACKLIST
1261                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1262                                                                                         send_flags, 0);
1263 #endif /* USE_DST_BLACKLIST */
1264                                                 TCP_EV_CONNECT_TIMEOUT(errno, 0, 0, server, type);
1265                                                 break;
1266                                         case ECONNREFUSED:
1267                                         case ECONNRESET:
1268 #ifdef USE_DST_BLACKLIST
1269                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1270                                                                                         send_flags, 0);
1271 #endif /* USE_DST_BLACKLIST */
1272                                                 TCP_EV_CONNECT_RST(errno, 0, 0, server, type);
1273                                                 break;
1274                                         case EAGAIN:/* not posix, but supported on linux and bsd */
1275                                                 TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0, server,type);
1276                                                 break;
1277                                         default:
1278                                                 TCP_EV_CONNECT_ERR(errno, 0, 0, server, type);
1279                                 }
1280                                 TCP_STATS_CONNECT_FAILED();
1281                                 LM_ERR("connect %s: (%d) %s\n",
1282                                                         su2a(server, sizeof(*server)),
1283                                                         errno, strerror(errno));
1284                                 goto error;
1285                         }
1286                 }
1287         }else{
1288 #endif /* TCP_ASYNC */
1289                 if (tcp_blocking_connect(s, type,  send_flags, &server->s,
1290                                                                         sockaddru_len(*server))<0){
1291                         LM_ERR("tcp_blocking_connect %s failed\n",
1292                                                 su2a(server, sizeof(*server)));
1293                         goto error;
1294                 }
1295 #ifdef TCP_ASYNC
1296         }
1297 #endif /* TCP_ASYNC */
1298         if (from){
1299                 su2ip_addr(&ip, from);
1300                 if (!ip_addr_any(&ip))
1301                         /* we already know the source ip, skip the sys. call */
1302                         goto find_socket;
1303         }
1304         my_name_len=sizeof(my_name);
1305         if (unlikely(getsockname(s, &my_name.s, &my_name_len)!=0)){
1306                 LM_ERR("getsockname failed: %s(%d)\n", strerror(errno), errno);
1307                 *res_si=0;
1308                 goto error;
1309         }
1310         from=&my_name; /* update from with the real "from" address */
1311         su2ip_addr(&ip, &my_name);
1312 find_socket:
1313 #ifdef USE_TLS
1314         if (unlikely(type==PROTO_TLS)) {
1315                 *res_si=find_si(&ip, 0, PROTO_TLS);
1316         } else {
1317                 *res_si=find_si(&ip, 0, PROTO_TCP);
1318         }
1319 #else
1320         *res_si=find_si(&ip, 0, PROTO_TCP);
1321 #endif
1322
1323         if (unlikely(*res_si==0)){
1324                 LM_WARN("%s: could not find corresponding"
1325                                 " listening socket for %s, using default...\n",
1326                                         su2a(server, sizeof(*server)), ip_addr2a(&ip));
1327 #ifdef USE_TLS
1328                 if (unlikely(type==PROTO_TLS)) {
1329                         if (server->s.sa_family==AF_INET) *res_si=sendipv4_tls;
1330                         else *res_si=sendipv6_tls;
1331                 } else {
1332                         if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1333                         else *res_si=sendipv6_tcp;
1334                 }
1335 #else
1336                 if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1337                 else *res_si=sendipv6_tcp;
1338 #endif
1339         }
1340         *res_local_addr=*from;
1341         return s;
1342 error:
1343         if (s!=-1) tcp_safe_close(s);
1344         return -1;
1345 }
1346
1347
1348
1349 struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
1350                                                                                 union sockaddr_union* from,
1351                                                                                 int type, snd_flags_t* send_flags)
1352 {
1353         int s;
1354         struct socket_info* si;
1355         union sockaddr_union my_name;
1356         struct tcp_connection* con;
1357         enum tcp_conn_states state;
1358
1359         s=-1;
1360
1361         if (*tcp_connections_no >= cfg_get(tcp, tcp_cfg, max_connections)){
1362                 LM_ERR("maximum number of connections exceeded (%d/%d)\n",
1363                                         *tcp_connections_no,
1364                                         cfg_get(tcp, tcp_cfg, max_connections));
1365                 goto error;
1366         }
1367         if (unlikely(type==PROTO_TLS)) {
1368                 if (*tls_connections_no >= cfg_get(tcp, tcp_cfg, max_tls_connections)){
1369                         LM_ERR("maximum number of tls connections"
1370                                                 " exceeded (%d/%d)\n",
1371                                                 *tls_connections_no,
1372                                                 cfg_get(tcp, tcp_cfg, max_tls_connections));
1373                         goto error;
1374                 }
1375         }
1376
1377         s=tcp_do_connect(server, from, type,  send_flags, &my_name, &si, &state);
1378         if (s==-1){
1379                 LM_ERR("tcp_do_connect %s: failed (%d) %s\n",
1380                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1381                 goto error;
1382         }
1383         con=tcpconn_new(s, server, &my_name, si, type, state);
1384         if (con==0){
1385                 LM_ERR("%s: tcpconn_new failed, closing the "
1386                                         " socket\n", su2a(server, sizeof(*server)));
1387                 goto error;
1388         }
1389         tcpconn_set_send_flags(con, *send_flags);
1390         return con;
1391 error:
1392         if (s!=-1) tcp_safe_close(s); /* close the opened socket */
1393         return 0;
1394 }
1395
1396
1397
#ifdef TCP_CONNECT_WAIT
/* Performs the connect for an already existing connection structure c
 * (towards c->rcv.src_su, optionally from the local address from), then
 * updates the local side information of c (bind address, destination ip
 * and port) and its aliases:
 *  - from==0: the local_ip:0 and local_ip:local_port aliases are added;
 *  - from!=0 but different from the real local address: every alias except
 *    the first one is removed and the two local aliases are added again;
 *  - from equal to the real local address: the aliases are left untouched.
 * Returns the connected socket or -1 on error. */
int tcpconn_finish_connect( struct tcp_connection* c,
												union sockaddr_union* from)
{
	int sock;
	int idx;
	int drop_old_aliases;
	int alias_flags;
	union sockaddr_union local_su;
	struct socket_info* bind_si;
	enum tcp_conn_states conn_state;
	struct tcp_conn_alias* alias;

	sock=tcp_do_connect(&c->rcv.src_su, from, c->type, &c->send_flags,
						&local_su, &bind_si, &conn_state);
	if (unlikely(sock==-1)){
		LM_ERR("%s: tcp_do_connect for %p failed\n",
					su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
		return -1;
	}
	c->rcv.bind_address=bind_si;
	su2ip_addr(&c->rcv.dst_ip, &local_su);
	c->rcv.dst_port=su_getport(&local_su);

	/* decide whether the aliases need an update */
	if (likely(from==0)){
		drop_old_aliases=0;
	}else if (su_cmp(from, &local_su)!=1){
		drop_old_aliases=1;
	}else{
		/* the requested source address is the one in use: nothing to do */
		return sock;
	}

	alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
	TCPCONN_LOCK;
	if (drop_old_aliases && c->aliases > 1){
		/* unlink every alias but the first one, they are re-added below */
		idx=1;
		while (idx<c->aliases){
			alias=&c->con_aliases[idx];
			tcpconn_listrm(tcpconn_aliases_hash[alias->hash],
							alias, next, prev);
			idx++;
		}
		c->aliases=1;
	}
	/* add the local_ip:0 and local_ip:local_port aliases */
	_tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
								alias_flags);
	_tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
								c->rcv.dst_port, alias_flags);
	TCPCONN_UNLOCK;

	return sock;
}
#endif /* TCP_CONNECT_WAIT */
1455
1456
1457
1458 /* adds a tcp connection to the tcpconn hashes
1459  * Note: it's called _only_ from the tcp_main process */
1460 inline static struct tcp_connection*  tcpconn_add(struct tcp_connection *c)
1461 {
1462         struct ip_addr zero_ip;
1463         int new_conn_alias_flags;
1464
1465         if (likely(c)){
1466                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1467                 c->id_hash=tcp_id_hash(c->id);
1468                 c->aliases=0;
1469                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1470                 TCPCONN_LOCK;
1471                 c->flags|=F_CONN_HASHED;
1472                 /* add it at the begining of the list*/
1473                 tcpconn_listadd(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1474                 /* set the aliases */
1475                 /* first alias is for (peer_ip, peer_port, 0 ,0) -- for finding
1476                  *  any connection to peer_ip, peer_port
1477                  * the second alias is for (peer_ip, peer_port, local_addr, 0) -- for
1478                  *  finding any conenction to peer_ip, peer_port from local_addr 
1479                  * the third alias is for (peer_ip, peer_port, local_addr, local_port) 
1480                  *   -- for finding if a fully specified connection exists */
1481                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &zero_ip, 0,
1482                                                                                                         new_conn_alias_flags);
1483                 if (likely(c->rcv.dst_ip.af && ! ip_addr_any(&c->rcv.dst_ip))){
1484                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1485                                                                                                         new_conn_alias_flags);
1486                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1487                                                                         c->rcv.dst_port, new_conn_alias_flags);
1488                 }
1489                 /* ignore add_alias errors, there are some valid cases when one
1490                  *  of the add_alias would fail (e.g. first add_alias for 2 connections
1491                  *   with the same destination but different src. ip*/
1492                 TCPCONN_UNLOCK;
1493                 LM_DBG("hashes: %d:%d:%d, %d\n",
1494                                                                                                 c->con_aliases[0].hash,
1495                                                                                                 c->con_aliases[1].hash,
1496                                                                                                 c->con_aliases[2].hash,
1497                                                                                                 c->id_hash);
1498                 return c;
1499         }else{
1500                 LM_CRIT("null connection pointer\n");
1501                 return 0;
1502         }
1503 }
1504
1505
1506 static inline void _tcpconn_detach(struct tcp_connection *c)
1507 {
1508         int r;
1509         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1510         /* remove all the aliases */
1511         for (r=0; r<c->aliases; r++)
1512                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1513                                                 &c->con_aliases[r], next, prev);
1514         c->aliases = 0;
1515 }
1516
1517
1518
1519 static inline void _tcpconn_free(struct tcp_connection* c)
1520 {
1521 #ifdef TCP_ASYNC
1522         if (unlikely(_wbufq_non_empty(c)))
1523                 _wbufq_destroy(&c->wbuf_q);
1524 #endif
1525         lock_destroy(&c->write_lock);
1526 #ifdef USE_TLS
1527         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) tls_tcpconn_clean(c);
1528 #endif
1529         shm_free(c);
1530 }
1531
1532
1533
/* Lock-less variant of tcpconn_rm(): unlinks c from the connection hashes
 * (_tcpconn_detach()) and then frees it (_tcpconn_free()).
 * TCPCONN_LOCK is not taken here. */
void _tcpconn_rm(struct tcp_connection* c)
{
	_tcpconn_detach(c);
	_tcpconn_free(c);
}
1540
1541
1542
1543 void tcpconn_rm(struct tcp_connection* c)
1544 {
1545         int r;
1546         TCPCONN_LOCK;
1547         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1548         /* remove all the aliases */
1549         for (r=0; r<c->aliases; r++)
1550                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1551                                                 &c->con_aliases[r], next, prev);
1552         c->aliases = 0;
1553         TCPCONN_UNLOCK;
1554         lock_destroy(&c->write_lock);
1555 #ifdef USE_TLS
1556         if ((c->type==PROTO_TLS || c->type==PROTO_WSS)&&(c->extra_data)) tls_tcpconn_clean(c);
1557 #endif
1558         shm_free(c);
1559 }
1560
1561
1562 /* finds a connection, if id=0 uses the ip addr, port, local_ip and local port
1563  *  (host byte order) and tries to find the connection that matches all of
1564  *   them. Wild cards can be used for local_ip and local_port (a 0 filled
1565  *   ip address and/or a 0 local port).
1566  * WARNING: unprotected (locks) use tcpconn_get unless you really
1567  * know what you are doing */
1568 struct tcp_connection* _tcpconn_find(int id, struct ip_addr* ip, int port,
1569                                                                                 struct ip_addr* l_ip, int l_port)
1570 {
1571
1572         struct tcp_connection *c;
1573         struct tcp_conn_alias* a;
1574         unsigned hash;
1575         int is_local_ip_any;
1576         
1577 #ifdef EXTRA_DEBUG
1578         LM_DBG("%d  port %d\n", id, port);
1579         if (ip) print_ip("tcpconn_find: ip ", ip, "\n");
1580 #endif
1581         if (likely(id)){
1582                 hash=tcp_id_hash(id);
1583                 for (c=tcpconn_id_hash[hash]; c; c=c->id_next){
1584 #ifdef EXTRA_DEBUG
1585                         LM_DBG("c=%p, c->id=%d, port=%d\n", c, c->id, c->rcv.src_port);
1586                         print_ip("ip=", &c->rcv.src_ip, "\n");
1587 #endif
1588                         if ((id==c->id)&&(c->state!=S_CONN_BAD)) {
1589                                 LM_DBG("found connection by id: %d\n", id);
1590                                 return c;
1591                         }
1592                 }
1593         }else if (likely(ip)){
1594                 hash=tcp_addr_hash(ip, port, l_ip, l_port);
1595                 is_local_ip_any=ip_addr_any(l_ip);
1596                 for (a=tcpconn_aliases_hash[hash]; a; a=a->next){
1597 #ifdef EXTRA_DEBUG
1598                         LM_DBG("a=%p, c=%p, c->id=%d, alias port= %d port=%d\n", a, a->parent,
1599                                         a->parent->id, a->port, a->parent->rcv.src_port);
1600                         print_ip("ip=",&a->parent->rcv.src_ip,"\n");
1601 #endif
1602                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1603                                         ((l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1604                                         (ip_addr_cmp(ip, &a->parent->rcv.src_ip)) &&
1605                                         (is_local_ip_any ||
1606                                                 ip_addr_cmp(l_ip, &a->parent->rcv.dst_ip))
1607                            ) {
1608                                 LM_DBG("found connection by peer address (id: %d)\n",
1609                                                 a->parent->id);
1610                                 return a->parent;
1611                         }
1612                 }
1613         }
1614         return 0;
1615 }
1616
1617
1618 /**
1619  * find if a tcp connection exits by id or remote+local address/port
1620  * - return: 1 if found; 0 if not found
1621  */
1622 int tcpconn_exists(int conn_id, ip_addr_t* peer_ip, int peer_port,
1623                                                 ip_addr_t* local_ip, int local_port)
1624 {
1625         tcp_connection_t* c;
1626
1627         TCPCONN_LOCK;
1628         c=_tcpconn_find(conn_id, peer_ip, peer_port, local_ip, local_port);
1629         TCPCONN_UNLOCK;
1630         if (c) {
1631                 return 1;
1632         }
1633         return 0;
1634
1635 }
1636
1637 /* TCP connection find with locks and timeout
1638  * - local_addr contains the desired local ip:port. If null any local address
1639  * will be used. IN*ADDR_ANY or 0 port are wild cards.
1640  * - try_local_port makes the search use it first, instead of port from local_addr
1641  * If found, the connection's reference counter will be incremented, you might
1642  * want to decrement it after use.
1643  */
1644 struct tcp_connection* tcpconn_lookup(int id, struct ip_addr* ip, int port,
1645                 union sockaddr_union* local_addr, int try_local_port, ticks_t timeout)
1646 {
1647         struct tcp_connection* c;
1648         struct ip_addr local_ip;
1649         int local_port;
1650
1651         local_port=0;
1652         c = NULL;
1653         if (likely(ip)){
1654                 if (unlikely(local_addr)){
1655                         su2ip_addr(&local_ip, local_addr);
1656                         local_port=su_getport(local_addr);
1657                 }else{
1658                         ip_addr_mk_any(ip->af, &local_ip);
1659                         local_port=0;
1660                 }
1661         }
1662         TCPCONN_LOCK;
1663         if(likely(try_local_port!=0) && likely(local_port==0)) {
1664                 c=_tcpconn_find(id, ip, port, &local_ip, try_local_port);
1665         }
1666         if(unlikely(c==NULL)) {
1667                 c=_tcpconn_find(id, ip, port, &local_ip, local_port);
1668         }
1669         if (likely(c)) {
1670                         atomic_inc(&c->refcnt);
1671                         /* update the timeout only if the connection is not handled
1672                          * by a tcp reader _and_the timeout is non-zero  (the tcp
1673                          * reader process uses c->timeout for its own internal
1674                          * timeout and c->timeout will be overwritten * anyway on
1675                          * return to tcp_main) */
1676                         if (likely(c->reader_pid==0 && timeout != 0))
1677                                 c->timeout=get_ticks_raw()+timeout;
1678         }
1679         TCPCONN_UNLOCK;
1680         return c;
1681 }
1682
1683 /* TCP connection find with locks and timeout
1684  * - local_addr contains the desired local ip:port. If null any local address
1685  * will be used.  IN*ADDR_ANY or 0 port are wild cards.
1686  * If found, the connection's reference counter will be incremented, you might
1687  * want to decrement it after use.
1688  */
1689 struct tcp_connection* tcpconn_get(int id, struct ip_addr* ip, int port,
1690                                                                         union sockaddr_union* local_addr,
1691                                                                         ticks_t timeout)
1692 {
1693         return tcpconn_lookup(id, ip, port, local_addr, 0, timeout);
1694 }
1695
1696
1697 /* add c->dst:port, local_addr as an alias for the "id" connection, 
1698  * flags: TCP_ALIAS_FORCE_ADD  - add an alias even if a previous one exists
1699  *        TCP_ALIAS_REPLACE    - if a prev. alias exists, replace it with the
1700  *                                new one
1701  * returns 0 on success, <0 on failure ( -1  - null c, -2 too many aliases,
1702  *  -3 alias already present and pointing to another connection)
1703  * WARNING: must be called with TCPCONN_LOCK held */
1704 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
1705                                                                                 struct ip_addr* l_ip, int l_port,
1706                                                                                 int flags)
1707 {
1708         unsigned hash;
1709         struct tcp_conn_alias* a;
1710         struct tcp_conn_alias* nxt;
1711         struct tcp_connection* p;
1712         int is_local_ip_any;
1713         int i;
1714         int r;
1715         
1716         a=0;
1717         is_local_ip_any=ip_addr_any(l_ip);
1718         if (likely(c)){
1719                 hash=tcp_addr_hash(&c->rcv.src_ip, port, l_ip, l_port);
1720                 /* search the aliases for an already existing one */
1721                 for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
1722                         nxt=a->next;
1723                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1724                                         ( (l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1725                                         (ip_addr_cmp(&c->rcv.src_ip, &a->parent->rcv.src_ip)) &&
1726                                         ( is_local_ip_any || 
1727                                           ip_addr_cmp(&a->parent->rcv.dst_ip, l_ip))
1728                                         ){
1729                                 /* found */
1730                                 if (unlikely(a->parent!=c)){
1731                                         if (flags & TCP_ALIAS_FORCE_ADD)
1732                                                 /* still have to walk the whole list to check if
1733                                                  * the alias was not already added */
1734                                                 continue;
1735                                         else if (flags & TCP_ALIAS_REPLACE){
1736                                                 /* remove the alias =>
1737                                                  * remove the current alias and all the following
1738                                                  *  ones from the corresponding connection, shift the 
1739                                                  *  connection aliases array and re-add the other 
1740                                                  *  aliases (!= current one) */
1741                                                 p=a->parent;
1742                                                 for (i=0; (i<p->aliases) && (&(p->con_aliases[i])!=a);
1743                                                                 i++);
1744                                                 if (unlikely(i==p->aliases)){
1745                                                         LM_CRIT("alias %p not found in con %p (id %d)\n",
1746                                                                         a, p, p->id);
1747                                                         goto error_not_found;
1748                                                 }
1749                                                 for (r=i; r<p->aliases; r++){
1750                                                         tcpconn_listrm(
1751                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash],
1752                                                                 &p->con_aliases[r], next, prev);
1753                                                 }
1754                                                 if (likely((i+1)<p->aliases)){
1755                                                         memmove(&p->con_aliases[i], &p->con_aliases[i+1],
1756                                                                                         (p->aliases-i-1)*
1757                                                                                                 sizeof(p->con_aliases[0]));
1758                                                 }
1759                                                 p->aliases--;
1760                                                 /* re-add the remaining aliases */
1761                                                 for (r=i; r<p->aliases; r++){
1762                                                         tcpconn_listadd(
1763                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash], 
1764                                                                 &p->con_aliases[r], next, prev);
1765                                                 }
1766                                         }else
1767                                                 goto error_sec;
1768                                 }else goto ok;
1769                         }
1770                 }
1771                 if (unlikely(c->aliases>=TCP_CON_MAX_ALIASES)) goto error_aliases;
1772                 c->con_aliases[c->aliases].parent=c;
1773                 c->con_aliases[c->aliases].port=port;
1774                 c->con_aliases[c->aliases].hash=hash;
1775                 tcpconn_listadd(tcpconn_aliases_hash[hash], 
1776                                                                 &c->con_aliases[c->aliases], next, prev);
1777                 c->aliases++;
1778         }else goto error_not_found;
1779 ok:
1780 #ifdef EXTRA_DEBUG
1781         if (a) LM_DBG("alias already present\n");
1782         else   LM_DBG("alias port %d for hash %d, id %d\n",
1783                         port, hash, c->id);
1784 #endif
1785         return 0;
1786 error_aliases:
1787         /* too many aliases */
1788         return -2;
1789 error_not_found:
1790         /* null connection */
1791         return -1;
1792 error_sec:
1793         /* alias already present and pointing to a different connection
1794          * (hijack attempt?) */
1795         return -3;
1796 }
1797
1798
1799
1800 /* add port as an alias for the "id" connection, 
1801  * returns 0 on success,-1 on failure */
1802 int tcpconn_add_alias(int id, int port, int proto)
1803 {
1804         struct tcp_connection* c;
1805         int ret;
1806         struct ip_addr zero_ip;
1807         int r;
1808         int alias_flags;
1809         
1810         /* fix the port */
1811         port=port?port:((proto==PROTO_TLS)?SIPS_PORT:SIP_PORT);
1812         TCPCONN_LOCK;
1813         /* check if alias already exists */
1814         c=_tcpconn_find(id, 0, 0, 0, 0);
1815         if (likely(c)){
1816                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1817                 alias_flags=cfg_get(tcp, tcp_cfg, alias_flags);
1818                 /* alias src_ip:port, 0, 0 */
1819                 ret=_tcpconn_add_alias_unsafe(c, port,  &zero_ip, 0, 
1820                                                                                 alias_flags);
1821                 if (ret<0 && ret!=-3) goto error;
1822                 /* alias src_ip:port, local_ip, 0 */
1823                 ret=_tcpconn_add_alias_unsafe(c, port,  &c->rcv.dst_ip, 0, 
1824                                                                                 alias_flags);
1825                 if (ret<0 && ret!=-3) goto error;
1826                 /* alias src_ip:port, local_ip, local_port */
1827                 ret=_tcpconn_add_alias_unsafe(c, port, &c->rcv.dst_ip, c->rcv.dst_port,
1828                                                                                 alias_flags);
1829                 if (unlikely(ret<0)) goto error;
1830         }else goto error_not_found;
1831         TCPCONN_UNLOCK;
1832         return 0;
1833 error_not_found:
1834         TCPCONN_UNLOCK;
1835         LM_ERR("no connection found for id %d\n",id);
1836         return -1;
1837 error:
1838         TCPCONN_UNLOCK;
1839         switch(ret){
1840                 case -2:
1841                         LM_ERR("too many aliases (%d) for connection %p (id %d) %s:%d <- %d\n",
1842                                         c->aliases, c, c->id, ip_addr2a(&c->rcv.src_ip),
1843                                         c->rcv.src_port, port);
1844                         for (r=0; r<c->aliases; r++){
1845                                 LM_ERR("alias %d: for %p (%d) %s:%d <-%d hash %x\n",  r, c, c->id, 
1846                                                 ip_addr2a(&c->rcv.src_ip), c->rcv.src_port, 
1847                                                 c->con_aliases[r].port, c->con_aliases[r].hash);
1848                         }
1849                         break;
1850                 case -3:
1851                         LM_ERR("possible port hijack attempt\n");
1852                         LM_ERR("alias for %d port %d already"
1853                                                 " present and points to another connection \n",
1854                                                 c->id, port);
1855                         break;
1856                 default:
1857                         LM_ERR("unknown error %d\n", ret);
1858         }
1859         return -1;
1860 }
1861
1862
1863
1864 #ifdef TCP_FD_CACHE
1865
1866 static void tcp_fd_cache_init(void)
1867 {
1868         int r;
1869         for (r=0; r<TCP_FD_CACHE_SIZE; r++)
1870                 fd_cache[r].fd=-1;
1871 }
1872
1873
1874 inline static struct fd_cache_entry* tcp_fd_cache_get(struct tcp_connection *c)
1875 {
1876         int h;
1877         
1878         h=c->id%TCP_FD_CACHE_SIZE;
1879         if ((fd_cache[h].fd>0) && (fd_cache[h].id==c->id) && (fd_cache[h].con==c))
1880                 return &fd_cache[h];
1881         return 0;
1882 }
1883
1884
1885 inline static void tcp_fd_cache_rm(struct fd_cache_entry* e)
1886 {
1887         e->fd=-1;
1888 }
1889
1890
1891 inline static void tcp_fd_cache_add(struct tcp_connection *c, int fd)
1892 {
1893         int h;
1894         
1895         h=c->id%TCP_FD_CACHE_SIZE;
1896         if (likely(fd_cache[h].fd>0))
1897                 tcp_safe_close(fd_cache[h].fd);
1898         fd_cache[h].fd=fd;
1899         fd_cache[h].id=c->id;
1900         fd_cache[h].con=c;
1901 }
1902
1903 #endif /* TCP_FD_CACHE */
1904
1905
1906
1907 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn);
1908
1909 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
1910                                                         unsigned len, snd_flags_t send_flags);
1911 static int tcpconn_do_send(int fd, struct tcp_connection* c,
1912                                                         const char* buf, unsigned len,
1913                                                         snd_flags_t send_flags, long* resp, int locked);
1914
1915 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
1916                                                         const char* buf, unsigned len,
1917                                                         snd_flags_t send_flags, long* resp, int locked);
1918
1919 /* finds a tcpconn & sends on it
1920  * uses the dst members to, proto (TCP|TLS) and id and tries to send
1921  *  from the "from" address (if non null and id==0)
1922  * returns: number of bytes written (>=0) on success
1923  *          <0 on error */
1924 int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1925                                         const char* buf, unsigned len)
1926 {
1927         struct tcp_connection *c;
1928         struct ip_addr ip;
1929         int port;
1930         int fd;
1931         long response[2];
1932         int n;
1933         ticks_t con_lifetime;
1934         int try_local_port;
1935 #ifdef USE_TLS
1936         const char* rest_buf;
1937         const char* t_buf;
1938         unsigned rest_len, t_len;
1939         long resp;
1940         snd_flags_t t_send_flags;
1941 #endif /* USE_TLS */
1942
1943         if(unlikely(dst==NULL)) {
1944                 LM_ERR("no destination address provided\n");
1945                 return -1;
1946         }
1947
1948         port=su_getport(&dst->to);
1949         try_local_port = (dst->send_sock)?dst->send_sock->port_no:0;
1950         con_lifetime=cfg_get(tcp, tcp_cfg, con_lifetime);
1951         if (likely(port)){
1952                 su2ip_addr(&ip, &dst->to);
1953                 if(tcp_connection_match==TCPCONN_MATCH_STRICT) {
1954                         c=tcpconn_lookup(dst->id, &ip, port, from, try_local_port, con_lifetime);
1955                 } else {
1956                         c=tcpconn_get(dst->id, &ip, port, from, con_lifetime);
1957                 }
1958         }else if (likely(dst->id)){
1959                 c=tcpconn_get(dst->id, 0, 0, 0, con_lifetime);
1960         }else{
1961                 LM_CRIT("null id & to\n");
1962                 return -1;
1963         }
1964
1965         if (likely(dst->id)){
1966                 if (unlikely(c==0)) {
1967                         if (likely(port)){
1968                                 /* try again w/o id */
1969                                 if(tcp_connection_match==TCPCONN_MATCH_STRICT) {
1970                                         c=tcpconn_lookup(0, &ip, port, from, try_local_port, con_lifetime);
1971                                 } else {
1972                                         c=tcpconn_get(0, &ip, port, from, con_lifetime);
1973                                 }
1974                         }else{
1975                                 LM_ERR("id %d not found, dropping\n", dst->id);
1976                                 return -1;
1977                         }
1978                 }
1979         }
1980         /* connection not found or unusable => open a new one and send on it */
1981         if (unlikely((c==0) || tcpconn_close_after_send(c))){
1982                 if (unlikely(c)){
1983                         /* can't use c if it's marked as close-after-send  =>
1984                          * release it and try opening new one */
1985                         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
1986                         c=0;
1987                 }
1988                 /* check if connect() is disabled */
1989                 if (unlikely((dst->send_flags.f & SND_F_FORCE_CON_REUSE) ||
1990                                                 cfg_get(tcp, tcp_cfg, no_connect)))
1991                         return -1;
1992                 LM_DBG("no open tcp connection found, opening new one\n");
1993                 /* create tcp connection */
1994                 if (likely(from==0)){
1995                         /* check to see if we have to use a specific source addr. */
1996                         switch (dst->to.s.sa_family) {
1997                                 case AF_INET:
1998                                                 from = tcp_source_ipv4;
1999                                         break;
2000                                 case AF_INET6:
2001                                                 from = tcp_source_ipv6;
2002                                         break;
2003                                 default:
2004                                         /* error, bad af, ignore ... */
2005                                         break;
2006                         }
2007                 }
2008 #if defined(TCP_CONNECT_WAIT) && defined(TCP_ASYNC)
2009                 if (likely(cfg_get(tcp, tcp_cfg, tcp_connect_wait) &&
2010                                         cfg_get(tcp, tcp_cfg, async) )){
2011                         if (unlikely(*tcp_connections_no >=
2012                                                         cfg_get(tcp, tcp_cfg, max_connections))){
2013                                 LM_ERR("%s: maximum number of connections exceeded (%d/%d)\n",
2014                                                         su2a(&dst->to, sizeof(dst->to)),
2015                                                         *tcp_connections_no,
2016                                                         cfg_get(tcp, tcp_cfg, max_connections));
2017                                 return -1;
2018                         }
2019                         if (unlikely(dst->proto==PROTO_TLS)) {
2020                                 if (unlikely(*tls_connections_no >=
2021                                                         cfg_get(tcp, tcp_cfg, max_tls_connections))){
2022                                         LM_ERR("%s: maximum number of tls connections exceeded (%d/%d)\n",
2023                                                         su2a(&dst->to, sizeof(dst->to)),
2024                                                         *tls_connections_no,
2025                                                         cfg_get(tcp, tcp_cfg, max_tls_connections));
2026                                         return -1;
2027                                 }
2028                         }
2029                         c=tcpconn_new(-1, &dst->to, from, 0, dst->proto,
2030                                                         S_CONN_CONNECT);
2031                         if (unlikely(c==0)){
2032                                 LM_ERR("%s: could not create new connection\n",
2033                                                 su2a(&dst->to, sizeof(dst->to)));
2034                                 return -1;
2035                         }
2036                         c->flags|=F_CONN_PENDING|F_CONN_FD_CLOSED;
2037                         tcpconn_set_send_flags(c, dst->send_flags);
2038                         atomic_set(&c->refcnt, 2); /* ref from here and from main hash
2039                                                                                 * table */
2040                         /* add it to id hash and aliases */
2041                         if (unlikely(tcpconn_add(c)==0)){
2042                                 LM_ERR("%s: could not add connection %p\n",
2043                                                 su2a(&dst->to, sizeof(dst->to)), c);
2044                                 _tcpconn_free(c);
2045                                 n=-1;
2046                                 goto end_no_conn;
2047                         }
2048                         /* do connect and if src ip or port changed, update the
2049                          * aliases */
2050                         if (unlikely((fd=tcpconn_finish_connect(c, from))<0)){
2051                                 /* tcpconn_finish_connect will automatically blacklist
2052                                  * on error => no need to do it here */
2053                                 LM_ERR("%s: tcpconn_finish_connect(%p) failed\n",
2054                                                 su2a(&dst->to, sizeof(dst->to)), c);
2055                                 goto conn_wait_error;
2056                         }
2057                         if(c->flags & F_CONN_NOSEND) {
2058                                 /* connection marked as no-send data
2059                                  * (e.g., drop() from tls event route)*/
2060                                 LM_INFO("%s: connection marked for no-send (%p)\n",
2061                                                 su2a(&dst->to, sizeof(dst->to)), c);
2062                                 goto conn_wait_error;
2063                         }
2064                         /* ? TODO: it might be faster just to queue the write directly
2065                          *  and send to main CONN_NEW_PENDING_WRITE */
2066                         /* delay sending the fd to main after the send */
2067
2068                         /* NOTE: no lock here, because the connection is marked as
2069                          * pending and nobody else will try to write on it. However
2070                          * this might produce out-of-order writes. If this is not
2071                          * desired either lock before the write or use
2072                          * _wbufq_insert(...)
2073                          * NOTE2: _wbufq_insert() is used now (no out-of-order).
2074                          */
2075 #ifdef USE_TLS
2076                         if (unlikely(c->type==PROTO_TLS)) {
2077                                 /* for TLS the TLS processing and the send must happen
2078                                  * atomically w/ respect to other sends on the same connection
2079                                  * (otherwise reordering might occur which would break TLS) =>
2080                                  * lock. However in this case this send will always be the first.
2081                                  * We can have the send() outside the lock only if this is the
2082                                  * first and only send (tls_encode is not called again), or
2083                                  * this is the last send for a tls_encode() loop and all the
2084                                  * previous ones did return CONN_NEW_COMPLETE or CONN_EOF.
2085                                  */
2086                                 response[1] = CONN_NOP;
2087                                 t_buf = buf;
2088                                 t_len = len;
2089                                 lock_get(&c->write_lock);
2090 redo_tls_encode:
2091                                         t_send_flags = dst->send_flags;
2092                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2093                                                                         &t_send_flags);
2094                                         /* There are 4 cases:
2095                                          *  1. entire buffer consumed from the first try
2096                                          *    (rest_len == rest_buf == 0)
2097                                          *  2. rest_buf & first call
2098                                          *  3. rest_buf & not first call
2099                                          *        3a. CONN_NEW_COMPLETE or CONN_EOF
2100                                          *        3b. CONN_NEW_PENDING_WRITE
2101                                          *  4. entire buffer consumed, but not first call
2102                                          *      4a. CONN_NEW_COMPLETE or CONN_EOF
2103                                          *         4b. CONN_NEW_PENDING_WRITE
2104                                          *      We misuse response[1] == CONN_NOP to test for the
2105                                          *      first call.
2106                                          */
2107                                         if (unlikely(n < 0)) {
2108                                                 lock_release(&c->write_lock);
2109                                                 goto conn_wait_error;
2110                                         }
2111                                         if (likely(rest_len == 0)) {
2112                                                 /* 1 or 4*: CONN_NEW_COMPLETE, CONN_EOF,  CONN_NOP
2113                                                  * or CONN_NEW_PENDING_WRITE (*rest_len == 0) */
2114                                                 if (likely(response[1] != CONN_NEW_PENDING_WRITE)) {
2115                                                         /* 1 or 4a => it's safe to do the send outside the
2116                                                          * lock (it will either send directly or
2117                                                          * wbufq_insert())
2118                                                          */
2119                                                         lock_release(&c->write_lock);
2120                                                         if (likely(t_len != 0)) {
2121                                                                 n=tcpconn_1st_send(fd, c, t_buf, t_len,
2122                                                                                                         t_send_flags,
2123                                                                                                         &response[1], 0);
2124                                                         } else { /* t_len == 0 */
2125                                                                 if (response[1] == CONN_NOP) {
2126                                                                         /* nothing to send (e.g  parallel send
2127                                                                          * tls_encode queues some data and then
2128                                                                          * WANT_READ => this tls_encode will queue
2129                                                                          * the cleartext too and will have nothing
2130                                                                          * to send right now) and initial send =>
2131                                                                          * behave as if the send was successful
2132                                                                          * (but never return EOF here) */
2133                                                                         response[1] = CONN_NEW_COMPLETE;
2134                                                                 }
2135                                                         }
2136                                                         /* exit */
2137                                                 } else {
2138                                                         /* CONN_NEW_PENDING_WRITE:  4b: it was a
2139                                                          * repeated tls_encode() (or otherwise we would
2140                                                          * have here CONN_NOP) => add to the queue */
2141                                                         if (unlikely(t_len &&
2142                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
2143                                                                 response[1] = CONN_ERROR;
2144                                                                 n = -1;
2145                                                         }
2146                                                         lock_release(&c->write_lock);
2147                                                         /* exit (no send) */
2148                                                 }
2149                                         } else {  /* rest_len != 0 */
2150                                                 /* 2 or 3*: if tls_encode hasn't finished, we have to
2151                                                  * call tcpconn_1st_send() under lock (otherwise if it
2152                                                  * returns CONN_NEW_PENDING_WRITE, there is no way
2153                                                  * to find the right place to add the new queued
2154                                                  * data from the 2nd tls_encode()) */
2155                                                 if (likely((response[1] == CONN_NOP /*2*/ ||
2156                                                                         response[1] == CONN_NEW_COMPLETE /*3a*/ ||
2157                                                                         response[1] == CONN_EOF /*3a*/) && t_len))
2158                                                         n = tcpconn_1st_send(fd, c, t_buf, t_len,
2159                                                                                                         t_send_flags,
2160                                                                                                         &response[1], 1);
2161                                                 else if (unlikely(t_len &&
2162                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
2163                                                         /*3b: CONN_NEW_PENDING_WRITE*/
2164                                                         response[1] = CONN_ERROR;
2165                                                         n = -1;
2166                                                 }
2167                                                 if (likely(n >= 0)) {
2168                                                         /* if t_len == 0 => nothing was sent => previous
2169                                                          * response will be kept */
2170                                                         t_buf = rest_buf;
2171                                                         t_len = rest_len;
2172                                                         goto redo_tls_encode;
2173                                                 } else {
2174                                                         lock_release(&c->write_lock);
2175                                                         /* error exit */
2176                                                 }
2177                                         }
2178                         } else
2179 #endif /* USE_TLS */
2180                                 n=tcpconn_1st_send(fd, c, buf, len, dst->send_flags,
2181                                                                         &response[1], 0);
2182                         if (unlikely(n<0)) /* this will catch CONN_ERROR too */
2183                                 goto conn_wait_error;
2184                         if (unlikely(response[1]==CONN_EOF)){
2185                                 /* if close-after-send requested, don't bother
2186                                  * sending the fd back to tcp_main, try closing it
2187                                  * immediately (no other tcp_send should use it,
2188                                  * because it is marked as close-after-send before
2189                                  * being added to the hash) */
2190                                 goto conn_wait_close;
2191                         }
2192                         /* send to tcp_main */
2193                         response[0]=(long)c;
2194                         if (unlikely(send_fd(unix_tcp_sock, response,
2195                                                                         sizeof(response), fd) <= 0)){
2196                                 LM_ERR("%s: %ld for %p failed:" " %s (%d)\n",
2197                                                         su2a(&dst->to, sizeof(dst->to)),
2198                                                         response[1], c, strerror(errno), errno);
2199                                 goto conn_wait_error;
2200                         }
2201                         goto conn_wait_success;
2202                 }
2203 #endif /* TCP_CONNECT_WAIT  && TCP_ASYNC */
2204                 if (unlikely((c=tcpconn_connect(&dst->to, from, dst->proto,
2205                                                                                 &dst->send_flags))==0)){
2206                         LM_ERR("%s: connect failed\n", su2a(&dst->to, sizeof(dst->to)));
2207                         return -1;
2208                 }
2209                 if(c->flags & F_CONN_NOSEND) {
2210                         /* connection marked as no-send data
2211                          * (e.g., drop() from tls event route)*/
2212                         LM_INFO("%s: connection marked for no-send (%p)\n",
2213                                         su2a(&dst->to, sizeof(dst->to)), c);
2214                         /* we can safely delete it, it's not referenced by anybody */
2215                         _tcpconn_free(c);
2216                         n=-1;
2217                         goto end_no_conn;
2218                 }
2219                 tcpconn_set_send_flags(c, dst->send_flags);
2220                 if (likely(c->state==S_CONN_OK))
2221                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2222                 atomic_set(&c->refcnt, 2); /* ref. from here and it will also
2223                                                                         * be added in the tcp_main hash */
2224                 fd=c->s;
2225                 c->flags|=F_CONN_FD_CLOSED; /* not yet opened in main */
2226                 /* ? TODO: it might be faster just to queue the write and
2227                  * send to main a CONN_NEW_PENDING_WRITE */
2228
2229                 /* send the new tcpconn to "tcp main" */
2230                 response[0]=(long)c;
2231                 response[1]=CONN_NEW;
2232                 n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
2233                 if (unlikely(n<=0)){
2234                         LM_ERR("%s: failed send_fd: %s (%d)\n",
2235                                         su2a(&dst->to, sizeof(dst->to)),
2236                                         strerror(errno), errno);
2237                         /* we can safely delete it, it's not referenced by anybody */
2238                         _tcpconn_free(c);
2239                         n=-1;
2240                         goto end_no_conn;
2241                 }
2242                 /* new connection => send on it directly */
2243 #ifdef USE_TLS
2244                 if (unlikely(c->type==PROTO_TLS)) {
2245                         /* for TLS the TLS processing and the send must happen
2246                          * atomically w/ respect to other sends on the same connection
2247                          * (otherwise reordering might occur which would break TLS) =>
2248                          * lock.
2249                         */
2250                         response[1] = CONN_NOP;
2251                         t_buf = buf;
2252                         t_len = len;
2253                         lock_get(&c->write_lock);
2254                                 do {
2255                                         t_send_flags = dst->send_flags;
2256                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2257                                                                         &t_send_flags);
2258                                         if (likely(n > 0)) {
2259                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2260                                                                                                 &resp, 1);
2261                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2262                                                                         resp == CONN_ERROR))
2263                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2264                                                          * unless error */
2265                                                         response[1] = resp;
2266                                         } else  if (unlikely(n < 0)) {
2267                                                 response[1] = CONN_ERROR;
2268                                                 break;
2269                                         }
2270                                         /* else do nothing for n (t_len) == 0, keep
2271                                          * the last response */
2272                                         t_buf = rest_buf;
2273                                         t_len = rest_len;
2274                                 } while(unlikely(rest_len && n > 0));
2275                         lock_release(&c->write_lock);
2276                 } else
2277 #endif /* USE_TLS */
2278                         n = tcpconn_do_send(fd, c, buf, len, dst->send_flags,
2279                                                                         &response[1], 0);
2280                 if (unlikely(response[1] != CONN_NOP)) {
2281                         response[0]=(long)c;
2282                         if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2283                                 BUG("tcp_main command %ld sending failed (write):"
2284                                                 "%s (%d)\n", response[1], strerror(errno), errno);
2285                                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2286                                  * (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec
2287                                  * refcnt => if sending the command fails we have to
2288                                  * dec. refcnt by hand */
2289                                 tcpconn_chld_put(c); /* deref. it manually */
2290                                 n=-1;
2291                         }
2292                         /* here refcnt for c is already decremented => c contents can
2293                          * no longer be used and refcnt _must_ _not_ be decremented
2294                          * again on exit */
2295                         if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2296                                 /* on error or eof, close fd */
2297                                 tcp_safe_close(fd);
2298                         } else if (response[1] == CONN_QUEUED_WRITE) {
2299 #ifdef TCP_FD_CACHE
2300                                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2301                                         tcp_fd_cache_add(c, fd);
2302                                 } else
2303 #endif /* TCP_FD_CACHE */
2304                                         tcp_safe_close(fd);
2305                         } else {
2306                                 BUG("unexpected tcpconn_do_send() return & response:"
2307                                                 " %d, %ld\n", n, response[1]);
2308                         }
2309                         goto end_no_deref;
2310                 }
2311 #ifdef TCP_FD_CACHE
2312                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2313                         tcp_fd_cache_add(c, fd);
2314                 }else
2315 #endif /* TCP_FD_CACHE */
2316                         tcp_safe_close(fd);
2317         /* here we can have only commands that _do_ _not_ dec refcnt.
2318          * (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE are all treated above) */
2319                 goto release_c;
2320         } /* if (c==0 or unusable) new connection */
2321         /* existing connection, send on it */
2322         n = tcpconn_send_put(c, buf, len, dst->send_flags);
2323         /* no deref needed (automatically done inside tcpconn_send_put() */
2324         return n;
2325 #ifdef TCP_CONNECT_WAIT
2326 conn_wait_success:
2327 #ifdef TCP_FD_CACHE
2328         if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2329                 tcp_fd_cache_add(c, fd);
2330         } else
2331 #endif /* TCP_FD_CACHE */
2332                 if (unlikely (tcp_safe_close(fd) < 0))
2333                         LM_ERR("closing temporary send fd for %p: %s: "
2334                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2335                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2336                                         fd, c->flags, strerror(errno), errno);
2337         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2338         return n;
2339 conn_wait_error:
2340         n=-1;
2341 conn_wait_close:
2342         /* connect or send failed or immediate close-after-send was requested on
2343          * newly created connection which was not yet sent to tcp_main (but was
2344          * already hashed) => don't send to main, unhash and destroy directly
2345          * (if refcnt>2 it will be destroyed when the last sender releases the
2346          * connection (tcpconn_chld_put(c))) or when tcp_main receives a
2347          * CONN_ERROR for it */
2348         c->state=S_CONN_BAD;
2349         /* we are here only if we opened a new fd (and not reused a cached or
2350          * a reader one) => if the connect was successful close the fd */
2351         if (fd>=0) {
2352                 if (unlikely(tcp_safe_close(fd) < 0 ))
2353                         LM_ERR("closing temporary send fd for %p: %s: "
2354                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2355                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2356                                         fd, c->flags, strerror(errno), errno);
2357         }
2358         /* here the connection is for sure in the hash (tcp_main will not
2359          * remove it because it's marked as PENDing) and the refcnt is at least 2
2360          */
2361         TCPCONN_LOCK;
2362                 _tcpconn_detach(c);
2363                 c->flags&=~F_CONN_HASHED;
2364                 tcpconn_put(c);
2365         TCPCONN_UNLOCK;
2366         /* dec refcnt -> mark it for destruction */
2367         tcpconn_chld_put(c);
2368         return n;
2369 #endif /* TCP_CONNECT_WAIT */
2370 release_c:
2371         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2372 end_no_deref:
2373 end_no_conn:
2374         return n;
2375 }
2376
2377
2378
2379 /** sends on an existing tcpconn and automatically decrements its ref. counter.
2380  * As opposed to tcp_send(), this function requires an existing
2381  * tcp connection.
2382  * WARNING: the caller's reference to the tcp_connection will be released.
2383  * @param c - existing tcp connection pointer.
2384  * @param buf - data to be sent.
2385  * @param len - data length.
2386  * @return >=0 on success, -1 on error.
2387  */
2388 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
2389                                                                 unsigned len, snd_flags_t send_flags)
2390 {
2391         struct tcp_connection *tmp;
2392         int fd;
2393         long response[2];
2394         int n;
2395         int do_close_fd;
2396 #ifdef USE_TLS
2397         const char* rest_buf;
2398         const char* t_buf;
2399         unsigned rest_len, t_len;
2400         long resp;
2401         snd_flags_t t_send_flags;
2402 #endif /* USE_TLS */
2403 #ifdef TCP_FD_CACHE
2404         struct fd_cache_entry* fd_cache_e;
2405         int use_fd_cache;
2406         
2407         use_fd_cache=cfg_get(tcp, tcp_cfg, fd_cache);
2408         fd_cache_e=0;
2409 #endif /* TCP_FD_CACHE */
2410         do_close_fd=1; /* close the fd on exit */
2411         response[1] = CONN_NOP;
2412 #ifdef TCP_ASYNC
2413         /* if data is already queued, we don't need the fd */
2414 #ifdef TCP_CONNECT_WAIT
2415                 if (unlikely(cfg_get(tcp, tcp_cfg, async) &&
2416                                                 (_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)) ))
2417 #else /* ! TCP_CONNECT_WAIT */
2418                 if (unlikely(cfg_get(tcp, tcp_cfg, async) && (_wbufq_non_empty(c)) ))
2419 #endif /* TCP_CONNECT_WAIT */
2420                 {
2421                         lock_get(&c->write_lock);
2422 #ifdef TCP_CONNECT_WAIT
2423                                 if (likely(_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)))
2424 #else /* ! TCP_CONNECT_WAIT */
2425                                 if (likely(_wbufq_non_empty(c)))
2426 #endif /* TCP_CONNECT_WAIT */
2427                                 {
2428                                         do_close_fd=0;
2429 #ifdef USE_TLS
2430                                         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2431                                                 t_buf = buf;
2432                                                 t_len = len;
2433                                                 do {
2434                                                         t_send_flags = send_flags;
2435                                                         n = tls_encode(c, &t_buf, &t_len,
2436                                                                                         &rest_buf, &rest_len,
2437                                                                                         &t_send_flags);
2438                                                         if (unlikely((n < 0) || (t_len &&
2439                                                                          (_wbufq_add(c, t_buf, t_len) < 0)))) {
2440                                                                 lock_release(&c->write_lock);
2441                                                                 n=-1;
2442                                                                 response[1] = CONN_ERROR;
2443                                                                 c->state=S_CONN_BAD;
2444                                                                 c->timeout=get_ticks_raw(); /* force timeout */
2445                                                                 goto error;
2446                                                         }
2447                                                         t_buf = rest_buf;
2448                                                         t_len = rest_len;
2449                                                 } while(unlikely(rest_len && n > 0));
2450                                         } else
2451 #endif /* USE_TLS */
2452                                                 if (unlikely(len && (_wbufq_add(c, buf, len)<0))){
2453                                                         lock_release(&c->write_lock);
2454                                                         n=-1;
2455                                                         response[1] = CONN_ERROR;
2456                                                         c->state=S_CONN_BAD;
2457                                                         c->timeout=get_ticks_raw(); /* force timeout */
2458                                                         goto error;
2459                                                 }
2460                                         n=len;
2461                                         lock_release(&c->write_lock);
2462                                         goto release_c;
2463                                 }
2464                         lock_release(&c->write_lock);
2465                 }
2466 #endif /* TCP_ASYNC */
2467                 /* check if this is not the same reader process holding
2468                  *  c  and if so send directly on c->fd */
2469                 if (c->reader_pid==my_pid()){
2470                         LM_DBG("send from reader (%d (%d)), reusing fd\n",
2471                                         my_pid(), process_no);
2472                         fd=c->fd;
2473                         do_close_fd=0; /* don't close the fd on exit, it's in use */
2474 #ifdef TCP_FD_CACHE
2475                         use_fd_cache=0; /* don't cache: problems would arise due to the
2476                                                            close() on cache eviction (if the fd is still 
2477                                                            used). If it has to be cached then dup() _must_ 
2478                                                            be used */
2479                 }else if (likely(use_fd_cache && 
2480                                                         ((fd_cache_e=tcp_fd_cache_get(c))!=0))){
2481                         fd=fd_cache_e->fd;
2482                         do_close_fd=0;
2483                         LM_DBG("found fd in cache (%d, %p, %d)\n", fd, c, fd_cache_e->id);
2484 #endif /* TCP_FD_CACHE */
2485                 }else{
2486                         LM_DBG("tcp connection found (%p), acquiring fd\n", c);
2487                         /* get the fd */
2488                         response[0]=(long)c;
2489                         response[1]=CONN_GET_FD;
2490                         n=send_all(unix_tcp_sock, response, sizeof(response));
2491                         if (unlikely(n<=0)){
2492                                 LM_ERR("failed to get fd(write):%s (%d)\n", strerror(errno), errno);
2493                                 n=-1;
2494                                 goto release_c;
2495                         }
2496                         LM_DBG("c=%p, n=%d\n", c, n);
2497                         n=receive_fd(unix_tcp_sock, &tmp, sizeof(tmp), &fd, MSG_WAITALL);
2498                         if (unlikely(n<=0)){
2499                                 LM_ERR("failed to get fd(receive_fd): %s (%d)\n",
2500                                                 strerror(errno), errno);
2501                                 n=-1;
2502                                 do_close_fd=0;
2503                                 goto release_c;
2504                         }
2505                         /* handle fd closed or bad connection/error
2506                                 (it's possible that this happened in the time between
2507                                 we found the intial connection and the time when we get
2508                                 we found the initial connection and the time when we get
2509                          */
2510                         if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2511                                 if (unlikely(c!=tmp && tmp!=0))
2512                                         BUG("tcp_send: get_fd: got different connection:"
2513                                                 "  %p (id= %d, refcnt=%d state=%d) != "
2514                                                 "  %p (n=%d)\n",
2515                                                   c,   c->id,   atomic_get(&c->refcnt),   c->state,
2516                                                   tmp, n
2517                                                 );
2518                                 n=-1; /* fail */
2519                                 /* don't cache fd & close it */
2520                                 do_close_fd = (fd==-1)?0:1;
2521 #ifdef TCP_FD_CACHE
2522                                 use_fd_cache = 0;
2523 #endif /* TCP_FD_CACHE */
2524                                 goto end;
2525                         }
2526                         LM_DBG("after receive_fd: c= %p n=%d fd=%d\n",c, n, fd);
2527                 }
2528         
2529 #ifdef USE_TLS
2530                 if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2531                         /* for TLS the TLS processing and the send must happen
2532                            atomically w/ respect to other sends on the same connection
2533                            (otherwise reordering might occur which would break TLS) =>
2534                            lock.
2535                         */
2536                         response[1] = CONN_NOP;
2537                         t_buf = buf;
2538                         t_len = len;
2539                         lock_get(&c->write_lock);
2540                                 do {
2541                                         t_send_flags = send_flags;
2542                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2543                                                                         &t_send_flags);
2544                                         if (likely(n > 0)) {
2545                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2546                                                                                                 &resp, 1);
2547                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2548                                                                         resp == CONN_ERROR))
2549                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2550                                                            unless error */
2551                                                         response[1] = resp;
2552                                         } else if (unlikely(n < 0)) {
2553                                                 response[1] = CONN_ERROR;
2554                                                 break;
2555                                         }
2556                                         /* else do nothing for n (t_len) == 0, keep
2557                                            the last response */
2558                                         t_buf = rest_buf;
2559                                         t_len = rest_len;
2560                                 } while(unlikely(rest_len && n > 0));
2561                         lock_release(&c->write_lock);
2562                 } else
2563 #endif
2564                         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 0);
2565         if (unlikely(response[1] != CONN_NOP)) {
2566 error:
2567                 response[0]=(long)c;
2568                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2569                         BUG("tcp_main command %ld sending failed (write):%s (%d)\n",
2570                                         response[1], strerror(errno), errno);
2571                         /* all commands != CONN_NOP returned by tcpconn_do_send()
2572                            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2573                            => if sending the command fails we have to dec. refcnt by hand
2574                          */
2575                         tcpconn_chld_put(c); /* deref. it manually */
2576                         n=-1;
2577                 }
2578                 /* here refcnt for c is already decremented => c contents can no
2579                    longer be used and refcnt _must_ _not_ be decremented again
2580                    on exit */
2581                 if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2582                         /* on error or eof, remove from cache or close fd */
2583 #ifdef TCP_FD_CACHE
2584                         if (unlikely(fd_cache_e)){
2585                                 tcp_fd_cache_rm(fd_cache_e);
2586                                 fd_cache_e = 0;
2587                                 tcp_safe_close(fd);
2588                         }else
2589 #endif /* TCP_FD_CACHE */
2590                                 if (do_close_fd) tcp_safe_close(fd);
2591                 } else if (response[1] == CONN_QUEUED_WRITE) {
2592 #ifdef TCP_FD_CACHE
2593                         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2594                                 tcp_fd_cache_add(c, fd);
2595                         }else
2596 #endif /* TCP_FD_CACHE */
2597                                 if (do_close_fd) tcp_safe_close(fd);
2598                 } else {
2599                         BUG("unexpected tcpconn_do_send() return & response: %d, %ld\n",
2600                                         n, response[1]);
2601                 }
2602                 return n; /* no tcpconn_put */
2603         }
2604 end:
2605 #ifdef TCP_FD_CACHE
2606         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2607                 tcp_fd_cache_add(c, fd);
2608         }else
2609 #endif /* TCP_FD_CACHE */
2610         if (do_close_fd) {
2611                 if (unlikely(tcp_safe_close(fd) < 0))
2612                         LM_ERR("closing temporary send fd for %p: %s: "
2613                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2614                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2615                                         fd, c->flags, strerror(errno), errno);
2616         }
2617         /* here we can have only commands that _do_ _not_ dec refcnt.
2618            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE are all treated above) */
2619 release_c:
2620         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2621         return n;
2622 }
2623
2624
2625
2626 /* unsafe send on a known tcp connection.
2627  * Directly send on a known tcp connection with a given fd.
2628  * It is assumed that the connection locks are already held.
2629  * Side effects: if needed it will send state update commands to
2630  *  tcp_main (e.g. CON_EOF, CON_ERROR, CON_QUEUED_WRITE).
2631  * @param fd - fd used for sending.
2632  * @param c - existing tcp connection pointer (state and flags might be
2633  *            changed).
2634  * @param buf - data to be sent.
2635  * @param len - data length.
2636  * @param send_flags
2637  * @return <0 on error, number of bytes sent on success.
2638  */
2639 int tcpconn_send_unsafe(int fd, struct tcp_connection *c,
2640                                                 const char* buf, unsigned len, snd_flags_t send_flags)
2641 {
2642         int n;
2643         long response[2];
2644         
2645         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 1);
2646         if (unlikely(response[1] != CONN_NOP)) {
2647                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2648                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2649                    => increment it (we don't want the connection to be destroyed
2650                    from under us)
2651                  */
2652                 atomic_inc(&c->refcnt);
2653                 response[0]=(long)c;
2654                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2655                         BUG("connection %p command %ld sending failed (write):%s (%d)\n",
2656                                         c, response[1], strerror(errno), errno);
2657                         /* send failed => deref. it back by hand */
2658                         tcpconn_chld_put(c); 
2659                         n=-1;
2660                 }
2661                 /* here refcnt for c is already decremented => c contents can no
2662                    longer be used and refcnt _must_ _not_ be decremented again
2663                    on exit */
2664                 return n;
2665         }
2666         return n;
2667 }
2668
2669
2670
2671 /** lower level send (connection and fd should be known).
2672  * It takes care of possible write-queueing, blacklisting a.s.o.
2673  * It expects a valid tcp connection. It doesn't touch the ref. cnts.
2674  * It will also set the connection flags from send_flags (it's better
2675  * to do it here, because it's guaranteed to be under lock).
2676  * @param fd - fd used for sending.
2677  * @param c - existing tcp connection pointer (state and flags might be
2678  *            changed).
2679  * @param buf - data to be sent.
2680  * @param len - data length.
2681  * @param send_flags
2682  * @param resp - filled with a cmd. for tcp_main:
2683  *                      CONN_NOP - nothing needs to be done (do not send
2684  *                                 anything to tcp_main).
2685  *                      CONN_ERROR - error, connection should be closed.
2686  *                      CONN_EOF - no error, but connection should be closed.
2687  *                      CONN_QUEUED_WRITE - new write queue (connection
2688  *                                 should be watched for write and the wr.
2689  *                                 queue flushed).
2690  * @param locked - if set assume the connection is already locked (call from
2691  *                  tls) and do not lock/unlock the connection.
2692  * @return >=0 on success, < 0 on error && *resp == CON_ERROR.
2693  *
2694  */
2695 static int tcpconn_do_send(int fd, struct tcp_connection* c,
2696                                                         const char* buf, unsigned len,
2697                                                         snd_flags_t send_flags, long* resp,
2698                                                         int locked)
2699 {
2700         int  n;
2701 #ifdef TCP_ASYNC
2702         int enable_write_watch;
2703 #endif /* TCP_ASYNC */
2704
2705         LM_DBG("sending...\n");
2706         *resp = CONN_NOP;
2707         if (likely(!locked)) lock_get(&c->write_lock);
2708         /* update connection send flags with the current ones */
2709         tcpconn_set_send_flags(c, send_flags);
2710 #ifdef TCP_ASYNC
2711         if (likely(cfg_get(tcp, tcp_cfg, async))){
2712                 if (_wbufq_non_empty(c)
2713 #ifdef TCP_CONNECT_WAIT
2714                         || (c->flags&F_CONN_PENDING) 
2715 #endif /* TCP_CONNECT_WAIT */
2716                         ){
2717                         if (unlikely(_wbufq_add(c, buf, len)<0)){
2718                                 if (likely(!locked)) lock_release(&c->write_lock);
2719                                 n=-1;
2720                                 goto error;
2721                         }
2722                         if (likely(!locked)) lock_release(&c->write_lock);
2723                         n=len;
2724                         goto end;
2725                 }
2726                 n=_tcpconn_write_nb(fd, c, buf, len);
2727         }else{
2728 #endif /* TCP_ASYNC */
2729                 n=tsend_stream(fd, buf, len,
2730                                                 TICKS_TO_S(cfg_get(tcp, tcp_cfg, send_timeout)) *
2731                                                 1000);
2732 #ifdef TCP_ASYNC
2733         }
2734 #else /* ! TCP_ASYNC */
2735         if (likely(!locked)) lock_release(&c->write_lock);
2736 #endif /* TCP_ASYNC */
2737         
2738         LM_DBG("after real write: c= %p n=%d fd=%d\n",c, n, fd);
2739         LM_DBG("buf=\n%.*s\n", (int)len, buf);
2740         if (unlikely(n<(int)len)){
2741 #ifdef TCP_ASYNC
2742                 if (cfg_get(tcp, tcp_cfg, async) &&
2743                                 ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK)){
2744                         enable_write_watch=_wbufq_empty(c);
2745                         if (n<0) n=0;
2746                         else if (unlikely(c->state==S_CONN_CONNECT ||
2747                                                 c->state==S_CONN_ACCEPT)){
2748                                 TCP_STATS_ESTABLISHED(c->state);
2749                                 c->state=S_CONN_OK; /* something was written */
2750                         }
2751                         if (unlikely(_wbufq_add(c, buf+n, len-n)<0)){
2752                                 if (likely(!locked)) lock_release(&c->write_lock);
2753                                 n=-1;
2754                                 goto error;
2755                         }
2756                         if (likely(!locked)) lock_release(&c->write_lock);
2757                         n=len;
2758                         if (likely(enable_write_watch))
2759                                 *resp=CONN_QUEUED_WRITE;
2760                         goto end;
2761                 }else{
2762                         if (likely(!locked)) lock_release(&c->write_lock);
2763                 }
2764 #endif /* TCP_ASYNC */
2765                 if (unlikely(c->state==S_CONN_CONNECT)){
2766                         switch(errno){
2767                                 case ENETUNREACH:
2768                                 case EHOSTUNREACH: /* not posix for send() */
2769 #ifdef USE_DST_BLACKLIST
2770                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2771                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2772 #endif /* USE_DST_BLACKLIST */
2773                                         TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2774                                                                         TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2775                                         break;
2776                                 case ECONNREFUSED:
2777                                 case ECONNRESET:
2778 #ifdef USE_DST_BLACKLIST
2779                                         dst_blacklist_su(BLST_ERR_CONNECT, c->rcv.proto,
2780                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2781 #endif /* USE_DST_BLACKLIST */
2782                                         TCP_EV_CONNECT_RST(errno, TCP_LADDR(c), TCP_LPORT(c),
2783                                                                                 TCP_PSU(c), TCP_PROTO(c));
2784                                         break;
2785                                 default:
2786                                         TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c), TCP_LPORT(c),
2787                                                                                 TCP_PSU(c), TCP_PROTO(c));
2788                                 }
2789                         TCP_STATS_CONNECT_FAILED();
2790                 }else{
2791                         switch(errno){
2792                                 case ECONNREFUSED:
2793                                 case ECONNRESET:
2794                                         TCP_STATS_CON_RESET();
2795                                         /* no break */
2796                                 case ENETUNREACH:
2797                                 /*case EHOSTUNREACH: -- not posix */
2798 #ifdef USE_DST_BLACKLIST
2799                                         dst_blacklist_su(BLST_ERR_SEND, c->rcv.proto,
2800                                                                                 &c->rcv.src_su, &c->send_flags, 0);
2801 #endif /* USE_DST_BLACKLIST */
2802                                         break;
2803                         }
2804                 }
2805                 LM_ERR("failed to send on %p (%s:%d->%s): %s (%d)\n",
2806                                         c, ip_addr2a(&c->rcv.dst_ip), c->rcv.dst_port,
2807                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2808                                         strerror(errno), errno);
2809                 n = -1;
2810 #ifdef TCP_ASYNC
2811 error:
2812 #endif /* TCP_ASYNC */
2813                 /* error on the connection , mark it as bad and set 0 timeout */
2814                 c->state=S_CONN_BAD;
2815                 c->timeout=get_ticks_raw();
2816                 /* tell "main" it should drop this (optional it will t/o anyway?)*/
2817                 *resp=CONN_ERROR;
2818                 return n; /* error return, no tcpconn_put */
2819         }
2820         
2821 #ifdef TCP_ASYNC
2822         if (likely(!locked)) lock_release(&c->write_lock);
2823 #endif /* TCP_ASYNC */
2824         /* in non-async mode here we're either in S_CONN_OK or S_CONN_ACCEPT*/
2825         if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
2826                         TCP_STATS_ESTABLISHED(c->state);
2827                         c->state=S_CONN_OK;
2828         }
2829         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2830                 /* close after write => send EOF request to tcp_main */
2831                 c->state=S_CONN_BAD;
2832                 c->timeout=get_ticks_raw();
2833                 /* tell "main" it should drop this*/
2834                 *resp=CONN_EOF;
2835                 return n;
2836         }
2837 end:
2838         return n;
2839 }
2840
2841
2842
2843 /** low level 1st send on a new connection.
2844  * It takes care of possible write-queueing, blacklisting a.s.o.
2845  * It expects a valid just-opened tcp connection. It doesn't touch the 
2846  * ref. counters. It's used only in the async first send case.
2847  * @param fd - fd used for sending.
2848  * @param c - existing tcp connection pointer (state and flags might be
2849  *            changed). The connection must be new (no previous send on it).
2850  * @param buf - data to be sent.
2851  * @param len - data length.
2852  * @param send_flags
2853  * @param resp - filled with a fd sending cmd. for tcp_main on success. It
2854  *                      _must_ be one of the commands listed below:
2855  *                      CONN_NEW_PENDING_WRITE - new connection, first write
2856  *                                 was partially successful (or EAGAIN) and
2857  *                                 was queued (connection should be watched
2858  *                                 for write and the write queue flushed).
2859  *                                 The fd should be sent to tcp_main.
2860  *                      CONN_NEW_COMPLETE - new connection, first write
2861  *                                 completed successfully and no data is
2862  *                                 queued. The fd should be sent to tcp_main.
2863  *                      CONN_EOF - no error, but the connection should be
2864  *                                  closed (e.g. SND_F_CON_CLOSE send flag).
2865  *                      CONN_ERROR - error, _must_ return < 0.
2866  * @param locked - if set assume the connection is already locked (call from
2867  *                  tls) and do not lock/unlock the connection.
2868  * @return >=0 on success, < 0 on error (on error *resp is undefined).
2869  *
2870  */
2871 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
2872                                                         const char* buf, unsigned len,
2873                                                         snd_flags_t send_flags, long* resp,
2874                                                         int locked)
2875 {
2876         int n;
2877
2878         n=_tcpconn_write_nb(fd, c, buf, len);
2879         if (unlikely(n<(int)len)){
2880                 /* on EAGAIN or ENOTCONN return success.
2881                    ENOTCONN appears on newer FreeBSD versions (non-blocking socket,
2882                    connect() & send immediately) */
2883                 if ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK || errno==ENOTCONN){
2884                         if(n<0) {
2885                                 LM_DBG("pending write on new connection %p sock %d "
2886                                         "(%d/%d bytes written) (err: %d - %s)\n", c, fd, n, len,
2887                                         errno, strerror(errno));
2888                         } else {
2889                                 LM_DBG("pending write on new connection %p sock %d "
2890                                         "(%d/%d bytes written)\n", c, fd, n, len);
2891                         }
2892                         if (unlikely(n<0)) n=0;
2893                         else{
2894                                 if (likely(c->state == S_CONN_CONNECT))
2895                                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2896                                 c->state=S_CONN_OK; /* partial write => connect()
2897                                                                                                 ended */
2898                         }
2899                         /* add to the write queue */
2900                         if (likely(!locked)) lock_get(&c->write_lock);
2901                                 if (unlikely(_wbufq_insert(c, buf+n, len-n)<0)){
2902                                         if (likely(!locked)) lock_release(&c->write_lock);
2903                                         n=-1;
2904                                         LM_ERR("%s: EAGAIN and write queue full or failed for %p"
2905                                                         " sock %d\n", su2a(&c->rcv.src_su,
2906                                                                 sizeof(c->rcv.src_su)), c, fd);
2907                                         goto error;
2908                                 }
2909                         if (likely(!locked)) lock_release(&c->write_lock);
2910                         /* send to tcp_main */
2911                         *resp=CONN_NEW_PENDING_WRITE;
2912                         n=len;
2913                         goto end;
2914                 }
2915                 /* n < 0 and not EAGAIN => write error */
2916                 /* if first write failed it's most likely a
2917                    connect error */
2918                 switch(errno){
2919                         case ENETUNREACH:
2920                         case EHOSTUNREACH:  /* not posix for send() */
2921 #ifdef USE_DST_BLACKLIST
2922                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2923                                                                         &c->rcv.src_su, &c->send_flags, 0);
2924 #endif /* USE_DST_BLACKLIST */
2925                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
2926                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2927                                 break;
2928                         case ECONNREFUSED:
2929                         case ECONNRESET:
2930 #ifdef USE_DST_BLACKLIST
2931                                 dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
2932                                                                         &c->rcv.src_su, &c->send_flags, 0);
2933 #endif /* USE_DST_BLACKLIST */
2934                                 TCP_EV_CONNECT_RST(errno, TCP_LADDR(c),
2935                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2936                                 break;
2937                         default:
2938                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
2939                                                                 TCP_LPORT(c), TCP_PSU(c), TCP_PROTO(c));
2940                 }
2941                 /* error: destroy it directly */
2942                 TCP_STATS_CONNECT_FAILED();
2943                 LM_ERR("%s: connect & send for %p (sock %d) failed:" " %s (%d)\n",
2944                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2945                                         c, fd, strerror(errno), errno);
2946                 goto error;
2947         }
2948         LM_INFO("quick connect for %p sock %d\n", c, fd);
2949         if (likely(c->state == S_CONN_CONNECT))
2950                 TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2951         if (unlikely(send_flags.f & SND_F_CON_CLOSE)){
2952                 /* close after write =>  EOF => close immediately */
2953                 c->state=S_CONN_BAD;
2954                 /* tell our caller that it should drop this*/
2955                 *resp=CONN_EOF;
2956         }else{
2957                 c->state=S_CONN_OK;
2958                 /* send to tcp_main */
2959                 *resp=CONN_NEW_COMPLETE;
2960         }
2961 end:
2962         return n; /* >= 0 */
2963 error:
2964         *resp=CONN_ERROR;
2965         return -1;
2966 }
2967
2968
2969
2970 int tcp_init(struct socket_info* sock_info)
2971 {
2972         union sockaddr_union* addr;
2973         int optval;
2974 #ifdef HAVE_TCP_ACCEPT_FILTER
2975         struct accept_filter_arg afa;
2976 #endif /* HAVE_TCP_ACCEPT_FILTER */
2977 #ifdef DISABLE_NAGLE
2978         int flag;
2979         struct protoent* pe;
2980
2981         if (tcp_proto_no==-1){ /* if not already set */
2982                 pe=getprotobyname("tcp");
2983                 if (pe==0){
2984                         LM_ERR("could not get TCP protocol number\n");
2985                         tcp_proto_no=-1;
2986                 }else{
2987                         tcp_proto_no=pe->p_proto;
2988                 }
2989         }
2990 #endif
2991
2992         addr=&sock_info->su;
2993         /* sock_info->proto=PROTO_TCP; */
2994         if (init_su(addr, &sock_info->address, sock_info->port_no)<0){
2995                 LM_ERR("could no init sockaddr_union\n");
2996                 goto error;
2997         }
2998         LM_DBG("added %s\n", su2a(addr, sizeof(*addr)));
2999         sock_info->socket=socket(AF2PF(addr->s.sa_family), SOCK_STREAM, 0);
3000         if (sock_info->socket==-1){
3001                 LM_ERR("tcp_init: socket: %s\n", strerror(errno));
3002                 goto error;
3003         }
3004 #ifdef DISABLE_NAGLE
3005         flag=1;
3006         if ( (tcp_proto_no!=-1) &&
3007                  (setsockopt(sock_info->socket, tcp_proto_no , TCP_NODELAY,
3008                                          &flag, sizeof(flag))<0) ){
3009                 LM_ERR("could not disable Nagle: %s\n", strerror(errno));
3010         }
3011 #endif
3012
3013
3014 #if  !defined(TCP_DONT_REUSEADDR) 
3015         /* Stevens, "Network Programming", Section 7.5, "Generic Socket
3016      * Options": "...server started,..a child continues..on existing
3017          * connection..listening server is restarted...call to bind fails
3018          * ... ALL TCP servers should specify the SO_REUSEADDR option 
3019          * to allow the server to be restarted in this situation
3020          *
3021          * Indeed, without this option, the server can't restart.
3022          *   -jiri
3023          */
3024         optval=1;
3025         if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEADDR,
3026                                 (void*)&optval, sizeof(optval))==-1) {
3027                 LM_ERR("setsockopt %s\n", strerror(errno));
3028                 goto error;
3029         }
3030 #endif
3031
3032 #ifdef SO_REUSEPORT
3033         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
3034                 if (setsockopt(sock_info->socket, SOL_SOCKET, SO_REUSEPORT,
3035                                 (void*)&optval, sizeof(optval))==-1) {
3036                         LM_ERR("setsockopt %s\n", strerror(errno));
3037                 }
3038         }
3039 #endif
3040
3041         /* tos */
3042         optval = tos;
3043         if(sock_info->address.af==AF_INET){
3044                 if (setsockopt(sock_info->socket, IPPROTO_IP, IP_TOS, (void*)&optval,
3045                                         sizeof(optval)) ==-1){
3046                         LM_WARN("setsockopt tos: %s (%d)\n", strerror(errno), tos);
3047                         /* continue since this is not critical */
3048                 }
3049         } else if(sock_info->address.af==AF_INET6){
3050                 if (setsockopt(sock_info->socket, IPPROTO_IPV6, IPV6_TCLASS,
3051                                         (void*)&optval, sizeof(optval)) ==-1) {
3052                         LM_WARN("setsockopt v6 tos: %s (%d)\n", strerror(errno), tos);
3053                         /* continue since this is not critical */
3054                 }
3055                 if(sr_bind_ipv6_link_local!=0) {
3056                         LM_INFO("setting scope of %s\n", sock_info->address_str.s);
3057                         addr->sin6.sin6_scope_id =
3058                                 ipv6_get_netif_scope(sock_info->address_str.s);
3059                 }
3060         }
3061
3062 #if defined(IP_FREEBIND)
3063         /* allow bind to non local address.
3064          * useful when daemon started before network initialized */
3065         if (_sr_ip_free_bind && setsockopt(sock_info->socket, IPPROTO_IP,
3066                                 IP_FREEBIND, (void*)&optval, sizeof(optval)) ==-1) {
3067                 LM_WARN("setsockopt freebind failed: %s\n", strerror(errno));
3068                 /* continue since this is not critical */
3069         }
3070 #endif
3071
3072 #ifdef HAVE_TCP_DEFER_ACCEPT
3073         /* linux only */
3074         if ((optval=cfg_get(tcp, tcp_cfg, defer_accept))){
3075                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_DEFER_ACCEPT,
3076                                         (void*)&optval, sizeof(optval)) ==-1){
3077                         LM_WARN("setsockopt TCP_DEFER_ACCEPT %s\n", strerror(errno));
3078                 /* continue since this is not critical */
3079                 }
3080         }
3081 #endif /* HAVE_TCP_DEFFER_ACCEPT */
3082 #ifdef HAVE_TCP_SYNCNT
3083         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
3084                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_SYNCNT, &optval,
3085                                                 sizeof(optval))<0){
3086                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
3087                 }
3088         }
3089 #endif
3090 #ifdef HAVE_TCP_LINGER2
3091         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
3092                 if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_LINGER2, &optval,
3093                                                 sizeof(optval))<0){
3094                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
3095                 }
3096         }
3097 #endif
3098         init_sock_keepalive(sock_info->socket);
3099         if (bind(sock_info->socket, &addr->s, sockaddru_len(*addr))==-1){
3100                 LM_ERR("bind(%x, %p, %d) on %s:%d : %s\n",
3101                                 sock_info->socket,  &addr->s, 
3102                                 (unsigned)sockaddru_len(*addr),
3103                    &nbs