core: tcp - new core parameter tcp_accept_unique
[kamailio] / src / core / tcp_main.c
1 /*
2  * Copyright (C) 2001-2003 FhG Fokus
3  *
4  * This file is part of Kamailio, a free SIP server.
5  *
6  * Kamailio is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version
10  *
11  * Kamailio is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19  */
20
21 /** Kamailio core: tcp main/dispatcher and tcp send functions.
22  * @file tcp_main.c
23  * @ingroup core
24  * Module: @ref core
25  */
26
27
28 #ifdef USE_TCP
29
30
31 #define HANDLE_IO_INLINE
32 #include "io_wait.h" /* include first to make sure the needed features are
33                                                 turned on (e.g. _GNU_SOURCE for POLLRDHUP) */
34
35 #include <sys/time.h>
36 #include <sys/types.h>
37 #include <sys/select.h>
38 #include <sys/socket.h>
39 #ifdef HAVE_FILIO_H
40 #include <sys/filio.h> /* needed on solaris 2.x for FIONREAD */
41 #elif defined __OS_solaris
42 #define BSD_COMP  /* needed on older solaris for FIONREAD */
43 #endif /* HAVE_FILIO_H / __OS_solaris */
44 #include <sys/ioctl.h>  /* ioctl() used on write error */
45 #include <arpa/inet.h>  /* for inet_pton() */
46 #include <netinet/in.h>
47 #include <netinet/in_systm.h>
48 #include <netinet/ip.h>
49 #include <netinet/tcp.h>
50 #include <sys/uio.h>  /* writev*/
51 #include <netdb.h>
52 #include <stdlib.h> /*exit() */
53 #include <stdint.h> /* UINT32_MAX */
54
55 #include <unistd.h>
56
57 #include <errno.h>
58 #include <string.h>
59
60 #ifdef HAVE_SELECT
61 #include <sys/select.h>
62 #endif
63 #include <poll.h>
64
65
66 #include "ip_addr.h"
67 #include "pass_fd.h"
68 #include "tcp_conn.h"
69 #include "globals.h"
70 #include "pt.h"
71 #include "locking.h"
72 #include "mem/mem.h"
73 #include "mem/shm_mem.h"
74 #include "timer.h"
75 #include "sr_module.h"
76 #include "tcp_server.h"
77 #include "tcp_init.h"
78 #include "tcp_int_send.h"
79 #include "tcp_stats.h"
80 #include "tcp_ev.h"
81 #include "tsend.h"
82 #include "timer_ticks.h"
83 #include "local_timer.h"
84 #ifdef CORE_TLS
85 #include "tls/tls_server.h"
86 #define tls_loaded() 1
87 #else
88 #include "tls_hooks_init.h"
89 #include "tls_hooks.h"
90 #endif /* CORE_TLS*/
91 #ifdef USE_DST_BLACKLIST
92 #include "dst_blacklist.h"
93 #endif /* USE_DST_BLACKLIST */
94
95 #include "tcp_info.h"
96 #include "tcp_options.h"
97 #include "ut.h"
98 #include "cfg/cfg_struct.h"
99
100 #include <fcntl.h> /* must be included after io_wait.h if SIGIO_RT is used */
101
102
/* Fallback for builds that define NO_MSG_DONTWAIT: when the system headers
 * do not provide MSG_DONTWAIT, define it as 0 so that code using the flag
 * still compiles. */
#ifdef NO_MSG_DONTWAIT
#ifndef MSG_DONTWAIT
/* should work inside tcp_main */
#define MSG_DONTWAIT 0
#endif
#endif /*NO_MSG_DONTWAIT */


#define TCP_PASS_NEW_CONNECTION_ON_DATA /* don't pass a new connection
                                           immediately to a child, wait for
                                           some data on it first */
/* NOTE(review): presumably the backlog value handed to listen(); its use is
 * not visible in this part of the file -- confirm */
#define TCP_LISTEN_BACKLOG 1024
#define SEND_FD_QUEUE /* queue send fd requests on EAGAIN, instead of sending 
                         them immediately */
#define TCP_CHILD_NON_BLOCKING 
#ifdef SEND_FD_QUEUE
/* make sure TCP_CHILD_NON_BLOCKING is defined whenever SEND_FD_QUEUE is */
#ifndef TCP_CHILD_NON_BLOCKING
#define TCP_CHILD_NON_BLOCKING
#endif
/* the maximum queue size is the current value of tcp_main_max_fd_no */
#define MAX_SEND_FD_QUEUE_SIZE  tcp_main_max_fd_no
#define SEND_FD_QUEUE_SIZE              128  /* initial size */
#define SEND_FD_QUEUE_TIMEOUT   MS_TO_TICKS(2000)  /* 2 s */
#endif

/* minimum interval local_timer_run() is allowed to run, in ticks */
#define TCPCONN_TIMEOUT_MIN_RUN 1  /* once per tick */
#define TCPCONN_WAIT_TIMEOUT 1 /* 1 tick */

#ifdef TCP_ASYNC
/* total number of bytes held in all connection write queues; the _wbufq_*
 * functions check it against the tcp_wq_max config value and update it with
 * atomic_add_int() */
static unsigned int* tcp_total_wq=0;
#endif
134
135
/* tags for the different kinds of file descriptors used in this file.
 * NOTE(review): only F_SOCKINFO is described by the original source; the
 * meaning of the other values is inferred from their names -- confirm at the
 * places where they are used */
enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
                                F_TCPCONN, F_TCPCHILD, F_PROC };


#ifdef TCP_FD_CACHE

/* number of entries in the fd cache */
#define TCP_FD_CACHE_SIZE 8

/* one fd cache entry: a connection pointer together with an id and a file
 * descriptor.
 * NOTE(review): presumably id is the connection id and fd the descriptor
 * cached for con -- the code that fills the cache is not visible here */
struct fd_cache_entry{
        struct tcp_connection* con;
        int id;
        int fd;
};


static struct fd_cache_entry fd_cache[TCP_FD_CACHE_SIZE];
#endif /* TCP_FD_CACHE */

/* NOTE(review): presumably non-zero only in the tcp main process; the place
 * where it is set is outside this excerpt -- confirm */
static int is_tcp_main=0;
155
156
enum poll_types tcp_poll_method=0; /* by default choose the best method */
int tcp_main_max_fd_no=0; /* also used as MAX_SEND_FD_QUEUE_SIZE */
int tcp_max_connections=DEFAULT_TCP_MAX_CONNECTIONS;
int tls_max_connections=DEFAULT_TLS_MAX_CONNECTIONS;
/* value of the tcp_accept_unique core parameter, 0 by default.
 * NOTE(review): the code that acts on it is not visible in this excerpt */
int tcp_accept_unique=0;

/* source addresses recorded by tcp_set_src_addr(); each pointer stays 0
 * until an address of that family has been set */
static union sockaddr_union tcp_source_ipv4_addr; /* saved bind/src v4 addr. */
static union sockaddr_union* tcp_source_ipv4=0;
static union sockaddr_union tcp_source_ipv6_addr; /* saved bind/src v6 addr. */
static union sockaddr_union* tcp_source_ipv6=0;

static int* tcp_connections_no=0; /* current tcp (+tls) open connections */
static int* tls_connections_no=0; /* current tls open connections */

/* connection hash table (after ip&port) , includes also aliases */
struct tcp_conn_alias** tcpconn_aliases_hash=0;
/* connection hash table (after connection id) */
struct tcp_connection** tcpconn_id_hash=0;
/* NOTE(review): presumably protects the two hash tables above -- confirm */
gen_lock_t* tcpconn_lock=0;

struct tcp_child* tcp_children=0;
static int* connection_id=0; /*  unique for each connection, used for 
                                 quickly finding the corresponding connection
                                 for a reply */
int unix_tcp_sock;

/* stays -1 until a protocol number is stored; init_sock_opt() does not try
 * to set TCP_NODELAY while it is -1 */
static int tcp_proto_no=-1; /* tcp protocol number as returned by
                               getprotobyname */

static io_wait_h io_h;

static struct local_timer tcp_main_ltimer;
static ticks_t tcp_main_prev_ticks;

/* tell if there are tcp workers that should handle only specific socket
 * - used to optimize the search of least loaded worker for a tcp socket
 * - 0 - no workers per tcp sockets have been set
 * - 1 + generic_workers - when there are workers per tcp sockets
 */
static int tcp_sockets_gworkers = 0;

/* forward declarations of functions defined later in the file */
static ticks_t tcpconn_main_timeout(ticks_t , struct timer_ln* , void* );

inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
                                                                                struct ip_addr* l_ip, int l_port,
                                                                                int flags);
203
204
205
206 /* sets source address used when opening new sockets and no source is specified
207  *  (by default the address is choosen by the kernel)
208  * Should be used only on init.
209  * returns -1 on error */
210 int tcp_set_src_addr(struct ip_addr* ip)
211 {
212         switch (ip->af){
213                 case AF_INET:
214                         ip_addr2su(&tcp_source_ipv4_addr, ip, 0);
215                         tcp_source_ipv4=&tcp_source_ipv4_addr;
216                         break;
217                 case AF_INET6:
218                         ip_addr2su(&tcp_source_ipv6_addr, ip, 0);
219                         tcp_source_ipv6=&tcp_source_ipv6_addr;
220                         break;
221                 default:
222                         return -1;
223         }
224         return 0;
225 }
226
227
228
/* Applies the configured TCP keepalive settings to socket s.
 * SO_KEEPALIVE is switched on when the "keepalive" tcp config value is set;
 * the probe interval, the idle time and the probe count are set only when
 * their config values are non-zero. Every option is compiled in only if the
 * corresponding HAVE_* macro is defined.
 * Returns -1 if enabling SO_KEEPALIVE fails and 0 otherwise (failures to set
 * the other options are only logged as warnings). */
static inline int init_sock_keepalive(int s)
{
        int val;

#ifdef HAVE_SO_KEEPALIVE
        if (cfg_get(tcp, tcp_cfg, keepalive)){
                val=1;
                if (setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val))<0){
                        LM_WARN("failed to enable SO_KEEPALIVE: %s\n", strerror(errno));
                        return -1;
                }
        }
#endif
#ifdef HAVE_TCP_KEEPINTVL
        val=cfg_get(tcp, tcp_cfg, keepintvl);
        if (val && setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val))<0){
                LM_WARN("failed to set keepalive probes interval: %s\n", strerror(errno));
        }
#endif
#ifdef HAVE_TCP_KEEPIDLE
        val=cfg_get(tcp, tcp_cfg, keepidle);
        if (val && setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val))<0){
                LM_WARN("failed to set keepalive idle interval: %s\n", strerror(errno));
        }
#endif
#ifdef HAVE_TCP_KEEPCNT
        val=cfg_get(tcp, tcp_cfg, keepcnt);
        if (val && setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val))<0){
                LM_WARN("failed to set maximum keepalive count: %s\n", strerror(errno));
        }
#endif
        return 0;
}
269
270
271
272 /* set all socket/fd options for new sockets (e.g. before connect): 
273  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
274  *
275  * return -1 on error */
276 static int init_sock_opt(int s, int af)
277 {
278         int flags;
279         int optval;
280         
281 #ifdef DISABLE_NAGLE
282         flags=1;
283         if ( (tcp_proto_no!=-1) && (setsockopt(s, tcp_proto_no , TCP_NODELAY,
284                                         &flags, sizeof(flags))<0) ){
285                 LM_WARN("could not disable Nagle: %s\n", strerror(errno));
286         }
287 #endif
288         /* tos*/
289         optval = tos;
290         if(af==AF_INET){
291                 if (setsockopt(s, IPPROTO_IP, IP_TOS, (void*)&optval,
292                                         sizeof(optval)) ==-1){
293                         LM_WARN("setsockopt tos: %s\n", strerror(errno));
294                         /* continue since this is not critical */
295                 }
296         } else if(af==AF_INET6){
297                 if (setsockopt(s, IPPROTO_IPV6, IPV6_TCLASS,
298                                         (void*)&optval, sizeof(optval)) ==-1) {
299                         LM_WARN("setsockopt v6 tos: %s\n", strerror(errno));
300                         /* continue since this is not critical */
301                 }
302         }
303
304 #if  !defined(TCP_DONT_REUSEADDR) 
305         optval=1;
306         if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,
307                                                 (void*)&optval, sizeof(optval))==-1){
308                 LM_ERR("setsockopt SO_REUSEADDR %s\n", strerror(errno));
309                 /* continue, not critical */
310         }
311 #endif /* !TCP_DONT_REUSEADDR */
312
313 #ifdef SO_REUSEPORT
314         if ((optval=cfg_get(tcp, tcp_cfg, reuse_port))) {
315                 if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT,
316                                 (void*)&optval, sizeof(optval))==-1) {
317                         LM_ERR("setsockopt %s\n", strerror(errno));
318                 }
319         }
320 #endif
321
322 #ifdef HAVE_TCP_SYNCNT
323         if ((optval=cfg_get(tcp, tcp_cfg, syncnt))){
324                 if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
325                                                 sizeof(optval))<0){
326                         LM_WARN("failed to set maximum SYN retr. count: %s\n", strerror(errno));
327                 }
328         }
329 #endif
330 #ifdef HAVE_TCP_LINGER2
331         if ((optval=cfg_get(tcp, tcp_cfg, linger2))){
332                 if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
333                                                 sizeof(optval))<0){
334                         LM_WARN("failed to set maximum LINGER2 timeout: %s\n", strerror(errno));
335                 }
336         }
337 #endif
338 #ifdef HAVE_TCP_QUICKACK
339         if (cfg_get(tcp, tcp_cfg, delayed_ack)){
340                 optval=0; /* reset quick ack => delayed ack */
341                 if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
342                                                 sizeof(optval))<0){
343                         LM_WARN("failed to reset TCP_QUICKACK: %s\n", strerror(errno));
344                 }
345         }
346 #endif /* HAVE_TCP_QUICKACK */
347         init_sock_keepalive(s);
348         
349         /* non-blocking */
350         flags=fcntl(s, F_GETFL);
351         if (flags==-1){
352                 LM_ERR("fnctl failed: (%d) %s\n", errno, strerror(errno));
353                 goto error;
354         }
355         if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
356                 LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
357                 goto error;
358         }
359         return 0;
360 error:
361         return -1;
362 }
363
364
365
/* Sets all socket/fd options for "accepted" sockets.
 *  Only non-blocking mode is set since the rest is inherited from the
 *  "parent" (listening) socket.
 *  Note: setting O_NONBLOCK is required on linux but it's not needed on
 *        BSD and possibly solaris (where the flag is inherited from the
 *        parent socket). However since there is no standard document
 *        requiring a specific behaviour in this case it's safer to always set
 *        it (at least for now)  --andrei
 *  TODO: check on which OSes O_NONBLOCK is inherited and make this
 *        function a nop.
 *
 * s - the accepted socket
 * Returns 0 on success, -1 if reading or updating the file status flags
 * fails (the failure is logged). */
static int init_sock_opt_accept(int s)
{
        int flags;
        
        /* non-blocking: add O_NONBLOCK to the current file status flags */
        flags=fcntl(s, F_GETFL);
        if (flags==-1){
                LM_ERR("fcntl failed: (%d) %s\n", errno, strerror(errno));
                return -1;
        }
        if (fcntl(s, F_SETFL, flags|O_NONBLOCK)==-1){
                LM_ERR("fcntl: set non-blocking failed: (%d) %s\n", errno, strerror(errno));
                return -1;
        }
        return 0;
}
396
397
398
/** close a socket, handling errno.
 * close() is repeated for as long as it fails with EINTR.
 * Expected socket level errors are filtered out: success is returned if
 * close() failed with EPIPE, ENOTCONN, ECONNRESET, ECONNREFUSED,
 * ENETUNREACH or EHOSTUNREACH. Note that this happens on *BSDs (on linux
 * close() does not fail for socket level errors).
 * A negative s is not closed and is reported as success.
 * @param s - open valid socket.
 * @return - 0 on success, < 0 on error (whatever close() returns). On error
 *           errno is set.
 */
static int tcp_safe_close(int s)
{
        int rc;

        if (s<0)
                return 0;

        /* retry while the call keeps being interrupted */
        do{
                rc=close(s);
        }while(unlikely(rc<0 && errno==EINTR));

        /* on *BSD we really get these errors at close() time => ignore them */
        if (unlikely(rc<0) &&
                        (errno==EPIPE || errno==ENOTCONN || errno==ECONNRESET ||
                         errno==ECONNREFUSED || errno==ENETUNREACH ||
                         errno==EHOSTUNREACH))
                rc=0;
        return rc;
}
436
437
438
439 /* blocking connect on a non-blocking fd; it will timeout after
440  * tcp_connect_timeout 
441  * if BLOCKING_USE_SELECT and HAVE_SELECT are defined it will internally
442  * use select() instead of poll (bad if fd > FD_SET_SIZE, poll is preferred)
443  */
444 static int tcp_blocking_connect(int fd, int type, snd_flags_t* send_flags,
445                                                                 const struct sockaddr *servaddr,
446                                                                 socklen_t addrlen)
447 {
448         int n;
449 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
450         fd_set sel_set;
451         fd_set orig_set;
452         struct timeval timeout;
453 #else
454         struct pollfd pf;
455 #endif
456         int elapsed;
457         int to;
458         int ticks;
459         int err;
460         unsigned int err_len;
461         int poll_err;
462         
463         poll_err=0;
464         to=cfg_get(tcp, tcp_cfg, connect_timeout_s);
465         ticks=get_ticks();
466 again:
467         n=connect(fd, servaddr, addrlen);
468         if (n==-1){
469                 if (errno==EINTR){
470                         elapsed=(get_ticks()-ticks)*TIMER_TICK;
471                         if (elapsed<to)         goto again;
472                         else goto error_timeout;
473                 }
474                 if (errno!=EINPROGRESS && errno!=EALREADY){
475                         goto error_errno;
476                 }
477         }else goto end;
478         
479         /* poll/select loop */
480 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
481                 FD_ZERO(&orig_set);
482                 FD_SET(fd, &orig_set);
483 #else
484                 pf.fd=fd;
485                 pf.events=POLLOUT;
486 #endif
487         while(1){
488                 elapsed=(get_ticks()-ticks)*TIMER_TICK;
489                 if (elapsed>=to)
490                         goto error_timeout;
491 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
492                 sel_set=orig_set;
493                 timeout.tv_sec=to-elapsed;
494                 timeout.tv_usec=0;
495                 n=select(fd+1, 0, &sel_set, 0, &timeout);
496 #else
497                 n=poll(&pf, 1, (to-elapsed)*1000);
498 #endif
499                 if (n<0){
500                         if (errno==EINTR) continue;
501                         LM_ERR("%s: poll/select failed: (%d) %s\n",
502                                         su2a((union sockaddr_union*)servaddr, addrlen),
503                                         errno, strerror(errno));
504                         goto error;
505                 }else if (n==0) /* timeout */ continue;
506 #if defined(HAVE_SELECT) && defined(BLOCKING_USE_SELECT)
507                 if (FD_ISSET(fd, &sel_set))
508 #else
509                 if (pf.revents&(POLLERR|POLLHUP|POLLNVAL)){ 
510                         LM_ERR("%s: poll error: flags %x\n",
511                                         su2a((union sockaddr_union*)servaddr, addrlen),
512                                         pf.revents);
513                         poll_err=1;
514                 }
515 #endif
516                 {
517                         err_len=sizeof(err);
518                         getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &err_len);
519                         if ((err==0) && (poll_err==0)) goto end;
520                         if (err!=EINPROGRESS && err!=EALREADY){
521                                 LM_ERR("%s: SO_ERROR (%d) %s\n",
522                                                 su2a((union sockaddr_union*)servaddr, addrlen),
523                                                 err, strerror(err));
524                                 errno=err;
525                                 goto error_errno;
526                         }
527                 }
528         }
529 error_errno:
530         switch(errno){
531                 case ENETUNREACH:
532                 case EHOSTUNREACH:
533 #ifdef USE_DST_BLACKLIST
534                         dst_blacklist_su(BLST_ERR_CONNECT, type,
535                                                          (union sockaddr_union*)servaddr, send_flags, 0);
536 #endif /* USE_DST_BLACKLIST */
537                         TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0,
538                                                         (union sockaddr_union*)servaddr, type);
539                         break;
540                 case ETIMEDOUT:
541 #ifdef USE_DST_BLACKLIST
542                         dst_blacklist_su(BLST_ERR_CONNECT, type,
543                                                          (union sockaddr_union*)servaddr, send_flags, 0);
544 #endif /* USE_DST_BLACKLIST */
545                         TCP_EV_CONNECT_TIMEOUT(errno, 0, 0,
546                                                         (union sockaddr_union*)servaddr, type);
547                         break;
548                 case ECONNREFUSED:
549                 case ECONNRESET:
550 #ifdef USE_DST_BLACKLIST
551                         dst_blacklist_su(BLST_ERR_CONNECT, type,
552                                                          (union sockaddr_union*)servaddr, send_flags, 0);
553 #endif /* USE_DST_BLACKLIST */
554                         TCP_EV_CONNECT_RST(errno, 0, 0,
555                                                         (union sockaddr_union*)servaddr, type);
556                         break;
557                 case EAGAIN: /* not posix, but supported on linux and bsd */
558                         TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0,
559                                                         (union sockaddr_union*)servaddr, type);
560                         break;
561                 default:
562                         TCP_EV_CONNECT_ERR(errno, 0, 0,
563                                                                 (union sockaddr_union*)servaddr, type);
564         }
565         LM_ERR("%s: (%d) %s\n",
566                         su2a((union sockaddr_union*)servaddr, addrlen),
567                         errno, strerror(errno));
568         goto error;
569 error_timeout:
570         /* timeout */
571 #ifdef USE_DST_BLACKLIST
572         dst_blacklist_su(BLST_ERR_CONNECT, type,
573                                                 (union sockaddr_union*)servaddr, send_flags, 0);
574 #endif /* USE_DST_BLACKLIST */
575         TCP_EV_CONNECT_TIMEOUT(0, 0, 0, (union sockaddr_union*)servaddr, type);
576         LM_ERR("%s: timeout %d s elapsed from %d s\n",
577                                 su2a((union sockaddr_union*)servaddr, addrlen),
578                                 elapsed, cfg_get(tcp, tcp_cfg, connect_timeout_s));
579 error:
580         TCP_STATS_CONNECT_FAILED();
581         return -1;
582 end:
583         return 0;
584 }
585
586
587
588 #ifdef TCP_ASYNC
589
590
/* unsafe version (the queue is read without any locking done here):
 * true when the write buffer queue of con holds no buffer */
#define _wbufq_empty(con) ((con)->wbuf_q.first==0)
/* unsafe version (the queue is read without any locking done here):
 * true when the write buffer queue of con holds at least one buffer */
#define _wbufq_non_empty(con) ((con)->wbuf_q.first!=0)
595
596
597 /* unsafe version, call while holding the connection write lock */
598 inline static int _wbufq_add(struct  tcp_connection* c, const char* data, 
599                                                         unsigned int size)
600 {
601         struct tcp_wbuffer_queue* q;
602         struct tcp_wbuffer* wb;
603         unsigned int last_free;
604         unsigned int wb_size;
605         unsigned int crt_size;
606         ticks_t t;
607         
608         q=&c->wbuf_q;
609         t=get_ticks_raw();
610         if (unlikely(   ((q->queued+size)>cfg_get(tcp, tcp_cfg, tcpconn_wq_max)) ||
611                                         ((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max)) ||
612                                         (q->first &&
613                                         TICKS_LT(q->wr_timeout, t)) )){
614                 LM_ERR("(%d bytes): write queue full or timeout "
615                                         " (%d, total %d, last write %d s ago)\n",
616                                         size, q->queued, *tcp_total_wq,
617                                         TICKS_TO_S(t-(q->wr_timeout-
618                                                                 cfg_get(tcp, tcp_cfg, send_timeout))));
619                 if (q->first && TICKS_LT(q->wr_timeout, t)){
620                         if (unlikely(c->state==S_CONN_CONNECT)){
621 #ifdef USE_DST_BLACKLIST
622                                 (void)dst_blacklist_su( BLST_ERR_CONNECT, c->rcv.proto,
623                                                                                 &c->rcv.src_su, &c->send_flags, 0);
624 #endif /* USE_DST_BLACKLIST */
625                                 TCP_EV_CONNECT_TIMEOUT(0, TCP_LADDR(c), TCP_LPORT(c),
626                                                                                         TCP_PSU(c), TCP_PROTO(c));
627                                 TCP_STATS_CONNECT_FAILED();
628                         }else{
629 #ifdef USE_DST_BLACKLIST
630                                 (void)dst_blacklist_su( BLST_ERR_SEND, c->rcv.proto,
631                                                                         &c->rcv.src_su, &c->send_flags, 0);
632 #endif /* USE_DST_BLACKLIST */
633                                 TCP_EV_SEND_TIMEOUT(0, &c->rcv);
634                                 TCP_STATS_SEND_TIMEOUT();
635                         }
636                 }else{
637                         /* if it's not a timeout => queue full */
638                         TCP_EV_SENDQ_FULL(0, &c->rcv);
639                         TCP_STATS_SENDQ_FULL();
640                 }
641                 goto error;
642         }
643         
644         if (unlikely(q->last==0)){
645                 wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
646                 wb=shm_malloc(sizeof(*wb)+wb_size-1);
647                 if (unlikely(wb==0)) {
648                         SHM_MEM_ERROR;
649                         goto error;
650                 }
651                 wb->b_size=wb_size;
652                 wb->next=0;
653                 q->last=wb;
654                 q->first=wb;
655                 q->last_used=0;
656                 q->offset=0;
657                 q->wr_timeout=get_ticks_raw()+
658                         ((c->state==S_CONN_CONNECT)?
659                                         S_TO_TICKS(cfg_get(tcp, tcp_cfg, connect_timeout_s)):
660                                         cfg_get(tcp, tcp_cfg, send_timeout));
661         }else{
662                 wb=q->last;
663         }
664         
665         while(size){
666                 last_free=wb->b_size-q->last_used;
667                 if (last_free==0){
668                         wb_size=MAX_unsigned(cfg_get(tcp, tcp_cfg, wq_blk_size), size);
669                         wb=shm_malloc(sizeof(*wb)+wb_size-1);
670                         if (unlikely(wb==0)) {
671                                 SHM_MEM_ERROR;
672                                 goto error;
673                         }
674                         wb->b_size=wb_size;
675                         wb->next=0;
676                         q->last->next=wb;
677                         q->last=wb;
678                         q->last_used=0;
679                         last_free=wb->b_size;
680                 }
681                 crt_size=MIN_unsigned(last_free, size);
682                 memcpy(wb->buf+q->last_used, data, crt_size);
683                 q->last_used+=crt_size;
684                 size-=crt_size;
685                 data+=crt_size;
686                 q->queued+=crt_size;
687                 atomic_add_int((int*)tcp_total_wq, crt_size);
688         }
689         return 0;
690 error:
691         return -1;
692 }
693
694
695
696 /* unsafe version, call while holding the connection write lock
697  * inserts data at the beginning, it ignores the max queue size checks and
698  * the timeout (use sparingly)
699  * Note: it should never be called on a write buffer after wbufq_run() */
700 inline static int _wbufq_insert(struct  tcp_connection* c, const char* data, 
701                                                         unsigned int size)
702 {
703         struct tcp_wbuffer_queue* q;
704         struct tcp_wbuffer* wb;
705         
706         q=&c->wbuf_q;
707         if (likely(q->first==0)) /* if empty, use wbufq_add */
708                 return _wbufq_add(c, data, size);
709         
710         if (unlikely((*tcp_total_wq+size)>cfg_get(tcp, tcp_cfg, tcp_wq_max))){
711                 LM_ERR("(%d bytes): write queue full"
712                                         " (%d, total %d, last write %d s ago)\n",
713                                         size, q->queued, *tcp_total_wq,
714                                         TICKS_TO_S(get_ticks_raw()-q->wr_timeout-
715                                                                         cfg_get(tcp, tcp_cfg, send_timeout)));
716                 goto error;
717         }
718         if (unlikely(q->offset)){
719                 LM_CRIT("non-null offset %d (bad call, should"
720                                 "never be called after the wbufq_run())\n", q->offset);
721                 goto error;
722         }
723         if ((q->first==q->last) && ((q->last->b_size-q->last_used)>=size)){
724                 /* one block with enough space in it for size bytes */
725                 memmove(q->first->buf+size, q->first->buf, q->last_used);
726                 memcpy(q->first->buf, data, size);
727                 q->last_used+=size;
728         }else{
729                 /* create a size bytes block directly */
730                 wb=shm_malloc(sizeof(*wb)+size-1);
731                 if (unlikely(wb==0)) {
732                         SHM_MEM_ERROR;
733                         goto error;
734                 }
735                 wb->b_size=size;
736                 /* insert it */
737                 wb->next=q->first;
738                 q->first=wb;
739                 memcpy(wb->buf, data, size);
740         }
741         
742         q->queued+=size;
743         atomic_add_int((int*)tcp_total_wq, size);
744         return 0;
745 error:
746         return -1;
747 }
748
749
750
751 /* unsafe version, call while holding the connection write lock */
752 inline static void _wbufq_destroy( struct  tcp_wbuffer_queue* q)
753 {
754         struct tcp_wbuffer* wb;
755         struct tcp_wbuffer* next_wb;
756         int unqueued;
757         
758         unqueued=0;
759         if (likely(q->first)){
760                 wb=q->first;
761                 do{
762                         next_wb=wb->next;
763                         unqueued+=(wb==q->last)?q->last_used:wb->b_size;
764                         if (wb==q->first)
765                                 unqueued-=q->offset;
766                         shm_free(wb);
767                         wb=next_wb;
768                 }while(wb);
769         }
770         memset(q, 0, sizeof(*q));
771         atomic_add_int((int*)tcp_total_wq, -unqueued);
772 }
773
774
775
776 /* tries to empty the queue  (safe version, c->write_lock must not be hold)
777  * returns -1 on error, bytes written on success (>=0) 
778  * if the whole queue is emptied => sets *empty*/
779 inline static int wbufq_run(int fd, struct tcp_connection* c, int* empty)
780 {
781         struct tcp_wbuffer_queue* q;
782         struct tcp_wbuffer* wb;
783         int n;
784         int ret;
785         int block_size;
786         char* buf;
787         
788         *empty=0;
789         ret=0;
790         lock_get(&c->write_lock);
791         q=&c->wbuf_q;
792         while(q->first){
793                 block_size=((q->first==q->last)?q->last_used:q->first->b_size)-
794                                                 q->offset;
795                 buf=q->first->buf+q->offset;
796                 n=_tcpconn_write_nb(fd, c, buf, block_size);
797                 if (likely(n>0)){
798                         ret+=n;
799                         if (likely(n==block_size)){
800                                 wb=q->first;
801                                 q->first=q->first->next; 
802                                 shm_free(wb);
803                                 q->offset=0;
804                                 q->queued-=block_size;
805                                 atomic_add_int((int*)tcp_total_wq, -block_size);
806                         }else{
807                                 q->offset+=n;
808                                 q->queued-=n;
809                                 atomic_add_int((int*)tcp_total_wq, -n);
810                                 break;
811                         }
812                 }else{
813                         if (n<0){
814                                 /* EINTR is handled inside _tcpconn_write_nb */
815                                 if (!(errno==EAGAIN || errno==EWOULDBLOCK)){
816                                         if (unlikely(c->state==S_CONN_CONNECT)){
817                                                 switch(errno){
818                                                         case ENETUNREACH:
819                                                         case EHOSTUNREACH: /* not posix for send() */
820 #ifdef USE_DST_BLACKLIST
821                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
822                                                                                                         c->rcv.proto,
823                                                                                                         &c->rcv.src_su,
824                                                                                                         &c->send_flags, 0);
825 #endif /* USE_DST_BLACKLIST */
826                                                                 TCP_EV_CONNECT_UNREACHABLE(errno, TCP_LADDR(c),
827                                                                                                         TCP_LPORT(c), TCP_PSU(c),
828                                                                                                         TCP_PROTO(c));
829                                                                 break;
830                                                         case ECONNREFUSED:
831                                                         case ECONNRESET:
832 #ifdef USE_DST_BLACKLIST
833                                                                 dst_blacklist_su(BLST_ERR_CONNECT,
834                                                                                                         c->rcv.proto,
835                                                                                                         &c->rcv.src_su,
836                                                                                                         &c->send_flags, 0);
837 #endif /* USE_DST_BLACKLIST */
838                                                                 TCP_EV_CONNECT_RST(0, TCP_LADDR(c),
839                                                                                                         TCP_LPORT(c), TCP_PSU(c),
840                                                                                                         TCP_PROTO(c));
841                                                                 break;
842                                                         default:
843                                                                 TCP_EV_CONNECT_ERR(errno, TCP_LADDR(c),
844                                                                                                         TCP_LPORT(c), TCP_PSU(c),
845                                                                                                         TCP_PROTO(c));
846                                                 }
847                                                 TCP_STATS_CONNECT_FAILED();
848                                         }else{
849                                                 switch(errno){
850                                                         case ECONNREFUSED:
851                                                         case ECONNRESET:
852                                                                 TCP_STATS_CON_RESET();
853                                                                 /* no break */
854                                                         case ENETUNREACH:
855                                                         case EHOSTUNREACH: /* not posix for send() */
856 #ifdef USE_DST_BLACKLIST
857                                                                 dst_blacklist_su(BLST_ERR_SEND,
858                                                                                                         c->rcv.proto,
859                                                                                                         &c->rcv.src_su,
860                                                                                                         &c->send_flags, 0);
861 #endif /* USE_DST_BLACKLIST */
862                                                                 break;
863                                                 }
864                                         }
865                                         ret=-1;
866                                         LM_ERR("%s [%d]\n", strerror(errno), errno);
867                                 }
868                         }
869                         break;
870                 }
871         }
872         if (likely(q->first==0)){
873                 q->last=0;
874                 q->last_used=0;
875                 q->offset=0;
876                 *empty=1;
877         }
878         lock_release(&c->write_lock);
879         if (likely(ret>0)){
880                 q->wr_timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, send_timeout);
881                 if (unlikely(c->state==S_CONN_CONNECT || c->state==S_CONN_ACCEPT)){
882                         TCP_STATS_ESTABLISHED(c->state);
883                         c->state=S_CONN_OK;
884                 }
885         }
886         return ret;
887 }
888
889 #endif /* TCP_ASYNC */
890
891
892
893 #if 0
894 /* blocking write even on non-blocking sockets 
895  * if TCP_TIMEOUT will return with error */
896 static int tcp_blocking_write(struct tcp_connection* c, int fd, char* buf,
897                                                                 unsigned int len)
898 {
899         int n;
900         fd_set sel_set;
901         struct timeval timeout;
902         int ticks;
903         int initial_len;
904         
905         initial_len=len;
906 again:
907         
908         n=send(fd, buf, len,
909 #ifdef HAVE_MSG_NOSIGNAL
910                         MSG_NOSIGNAL
911 #else
912                         0
913 #endif
914                 );
915         if (n<0){
916                 if (errno==EINTR)       goto again;
917                 else if (errno!=EAGAIN && errno!=EWOULDBLOCK){
918                         LM_ERR("failed to send: (%d) %s\n", errno, strerror(errno));
919                         TCP_EV_SEND_TIMEOUT(errno, &c->rcv);
920                         TCP_STATS_SEND_TIMEOUT();
921                         goto error;
922                 }
923         }else if (n<len){
924                 /* partial write */
925                 buf+=n;
926                 len-=n;
927         }else{
928                 /* success: full write */
929                 goto end;
930         }
931         while(1){
932                 FD_ZERO(&sel_set);
933                 FD_SET(fd, &sel_set);
934                 timeout.tv_sec=tcp_send_timeout;
935                 timeout.tv_usec=0;
936                 ticks=get_ticks();
937                 n=select(fd+1, 0, &sel_set, 0, &timeout);
938                 if (n<0){
939                         if (errno==EINTR) continue; /* signal, ignore */
940                         LM_ERR("select failed: (%d) %s\n", errno, strerror(errno));
941                         goto error;
942                 }else if (n==0){
943                         /* timeout */
944                         if (get_ticks()-ticks>=tcp_send_timeout){
945                                 LM_ERR("send timeout (%d)\n", tcp_send_timeout);
946                                 goto error;
947                         }
948                         continue;
949                 }
950                 if (FD_ISSET(fd, &sel_set)){
951                         /* we can write again */
952                         goto again;
953                 }
954         }
955 error:
956                 return -1;
957 end:
958                 return initial_len;
959 }
960 #endif
961
962 /* Attempt to extract real connection information from an upstream load
963  * balancer or reverse proxy. This should be called right after accept()ing the
964  * connection, and before TLS negotiation.
965  *
966  * Returns:
967  *    -1 on parsing error (connection should be closed)
968  *    0 on parser success, and connection information was extracted
969  *    1 on parser success, but no connection information was provided by the
970  *      upstream load balancer or reverse proxy.
971  */
972 int tcpconn_read_haproxy(struct tcp_connection *c) {
973         int bytes, retval = 0;
974         uint32_t size, port;
975         char *p, *end;
976         struct ip_addr *src_ip, *dst_ip;
977
978         const char v2sig[12] = "\x0D\x0A\x0D\x0A\x00\x0D\x0A\x51\x55\x49\x54\x0A";
979
980         // proxy header union
981         union {
982                 // v1 struct
983                 struct {
984                         char line[108];
985                 } v1;
986
987                 // v2 struct
988                 struct {
989                         uint8_t sig[12];
990                         uint8_t ver_cmd;
991                         uint8_t fam;
992                         uint16_t len;
993
994                         union {
995                                 struct { /* for TCP/UDP over IPv4, len = 12 */
996                                         uint32_t src_addr;
997                                         uint32_t dst_addr;
998                                         uint16_t src_port;
999                                         uint16_t dst_port;
1000                                 } ip4;
1001
1002                                 struct { /* for TCP/UDP over IPv6, len = 36 */
1003                                          uint8_t  src_addr[16];
1004                                          uint8_t  dst_addr[16];
1005                                          uint16_t src_port;
1006                                          uint16_t dst_port;
1007                                 } ip6;
1008
1009                                 struct { /* for AF_UNIX sockets, len = 216 */
1010                                          uint8_t src_addr[108];
1011                                          uint8_t dst_addr[108];
1012                                 } unx;
1013                         } addr;
1014                 } v2;
1015
1016         } hdr;
1017
1018         do {
1019                 bytes = recv(c->s, &hdr, sizeof(hdr), MSG_PEEK);
1020         } while (bytes == -1 && (errno == EINTR || errno == EAGAIN));
1021
1022         src_ip = &c->rcv.src_ip;
1023         dst_ip = &c->rcv.dst_ip;
1024
1025         if (bytes >= 16 && memcmp(&hdr.v2, v2sig, 12) == 0 &&
1026                 (hdr.v2.ver_cmd & 0xF0) == 0x20) {
1027                 LM_DBG("received PROXY protocol v2 header\n");
1028                 size = 16 + ntohs(hdr.v2.len);
1029
1030                 if (bytes < size) {
1031                         return -1; /* truncated or too large header */
1032                 }
1033
1034                 switch (hdr.v2.ver_cmd & 0xF) {
1035                         case 0x01: /* PROXY command */
1036                                 switch (hdr.v2.fam) {
1037                                         case 0x11: /* TCPv4 */
1038                                                 src_ip->af = AF_INET;
1039                                                 src_ip->len = 4;
1040                                                 src_ip->u.addr32[0] =
1041                                                         hdr.v2.addr.ip4.src_addr;
1042                                                 c->rcv.src_port =
1043                                                         hdr.v2.addr.ip4.src_port;
1044
1045                                                 dst_ip->af = AF_INET;
1046                                                 dst_ip->len = 4;
1047                                                 dst_ip->u.addr32[0] =
1048                                                         hdr.v2.addr.ip4.dst_addr;
1049                                                 c->rcv.dst_port =
1050                                                         hdr.v2.addr.ip4.dst_port;
1051
1052                                                 goto done;
1053
1054                                         case 0x21: /* TCPv6 */
1055                                                 src_ip->af = AF_INET6;
1056                                                 src_ip->len = 16;
1057                                                 memcpy(src_ip->u.addr,
1058                                                         hdr.v2.addr.ip6.src_addr, 16);
1059                                                 c->rcv.src_port =
1060                                                         hdr.v2.addr.ip6.src_port;
1061
1062                                                 dst_ip->af = AF_INET6;
1063                                                 dst_ip->len = 16;
1064                                                 memcpy(dst_ip->u.addr,
1065                                                         hdr.v2.addr.ip6.src_addr, 16);
1066                                                 c->rcv.dst_port =
1067                                                         hdr.v2.addr.ip6.dst_port;
1068
1069                                                 goto done;
1070
1071                                         default: /* unsupported protocol */
1072                                                 return -1;
1073                                 }
1074
1075                         case 0x00: /* LOCAL command */
1076                                 retval = 1; /* keep local connection address for LOCAL */
1077                                 goto done;
1078
1079                         default:
1080                                 return -1; /* not a supported command */
1081                 }
1082         }
1083         else if (bytes >= 8 && memcmp(hdr.v1.line, "PROXY", 5) == 0) {
1084                 LM_DBG("received PROXY protocol v1 header\n");
1085                 end = memchr(hdr.v1.line, '\r', bytes - 1);
1086                 if (!end || end[1] != '\n') {
1087                         return -1; /* partial or invalid header */
1088                 }
1089                 *end = '\0'; /* terminate the string to ease parsing */
1090                 size = end + 2 - hdr.v1.line;
1091                 p = hdr.v1.line + 5;
1092
1093                 if (strncmp(p, " TCP", 4) == 0) {
1094                         switch (p[4]) {
1095                                 case '4':
1096                                         src_ip->af  = dst_ip->af  = AF_INET;
1097                                         src_ip->len = dst_ip->len = 4;
1098                                         break;
1099                                 case '6':
1100                                         src_ip->af  = dst_ip->af  = AF_INET6;
1101                                         src_ip->len = dst_ip->len = 16;
1102                                         break;
1103                                 default:
1104                                         return -1; /* unknown TCP version */
1105                         }
1106
1107                         if (p[5] != ' ') {
1108                                 return -1; /* misformatted header */
1109                         }
1110                         p += 6; /* skip over the already-parsed bytes */
1111
1112                         /* Parse the source IP address */
1113                         end = strchr(p, ' ');
1114                         if (!end) {
1115                                 return -1; /* truncated header */
1116                         }
1117                         *end = '\0'; /* mark the end of the IP address */
1118                         if (inet_pton(src_ip->af, p, src_ip->u.addr) != 1) {
1119                                 return -1; /* missing IP address */
1120                         }
1121                         p = end + 1;
1122
1123                         /* Parse the destination IP address */
1124                         end = strchr(p, ' ');
1125                         if (!end) {
1126                                 return -1;
1127                         }
1128                         *end = '\0'; /* mark the end of the IP address */
1129                         if (inet_pton(dst_ip->af, p, dst_ip->u.addr) != 1) {
1130                                 return -1;
1131                         }
1132                         p = end + 1;
1133
1134                         /* Parse the source port */
1135                         port = strtoul(p, &end, 10);
1136                         if (port == UINT32_MAX || port == 0 || port >= (1 << 16)) {
1137                                 return -1; /* invalid port number */
1138                         }
1139                         c->rcv.src_port = port;
1140
1141                         if (*end != ' ') {
1142                                 return -1; /* invalid header */
1143                         }
1144                         p = end + 1;
1145
1146                         /* Parse the destination port */
1147                         port = strtoul(p, NULL, 10);
1148                         if (port == UINT32_MAX || port == 0 || port >= (1 << 16)) {
1149                                 return -1; /* invalid port number */
1150                         }
1151                         c->rcv.dst_port = port;
1152
1153                         goto done;
1154                 }
1155                 else if (strncmp(p, " UNKNOWN", 8) == 0) {
1156                         /* We know that the sender speaks the correct PROXY protocol with the
1157                          * appropriate version, and we SHOULD accept the connection and use the
1158                          * real connection's parameters as if there were no PROXY protocol header
1159                          * on the wire.
1160                          */
1161                         retval = 1; /* PROXY protocol parsed, but no IP override */
1162                         goto done;
1163                 }
1164                 else {
1165                         return -1; /* invalid header */
1166                 }
1167         } else if (bytes == 0) {
1168                 return 1; /* EOF? Return "no IP change" in any case */
1169         }
1170         else {
1171                 /* Wrong protocol */
1172                 return -1;
1173         }
1174
1175 done:
1176         /* we need to consume the appropriate amount of data from the socket */
1177         do {
1178                 bytes = recv(c->s, &hdr, size, 0);
1179         } while (bytes == -1 && errno == EINTR);
1180
1181         return (bytes >= 0) ? retval : -1;
1182 }
1183
1184 struct tcp_connection* tcpconn_new(int sock, union sockaddr_union* su,
1185                                                                         union sockaddr_union* local_addr,
1186                                                                         struct socket_info* ba, int type,
1187                                                                         int state)
1188 {
1189         struct tcp_connection *c;
1190         int rd_b_size, ret;
1191
1192         rd_b_size=cfg_get(tcp, tcp_cfg, rd_buf_size);
1193         c=shm_malloc(sizeof(struct tcp_connection) + rd_b_size);
1194         if (c==0){
1195                 SHM_MEM_ERROR;
1196                 goto error;
1197         }
1198         memset(c, 0, sizeof(struct tcp_connection)); /* zero init (skip rd buf)*/
1199         c->s=sock;
1200         c->fd=-1; /* not initialized */
1201         if (lock_init(&c->write_lock)==0){
1202                 LM_ERR("init lock failed\n");
1203                 goto error;
1204         }
1205
1206         c->rcv.src_su=*su;
1207
1208         atomic_set(&c->refcnt, 0);
1209         local_timer_init(&c->timer, tcpconn_main_timeout, c, 0);
1210         if (unlikely(ksr_tcp_accept_haproxy && state == S_CONN_ACCEPT)) {
1211                 ret = tcpconn_read_haproxy(c);
1212
1213                 if (ret == -1) {
1214                         LM_ERR("invalid PROXY protocol header\n");
1215                         goto error;
1216                 } else if (ret == 1) {
1217                         LM_DBG("PROXY protocol did not override IP addresses\n");
1218                         goto read_ip_info;
1219                 }
1220         } else {
1221 read_ip_info:
1222                 su2ip_addr(&c->rcv.src_ip, su);
1223                 c->rcv.src_port=su_getport(su);
1224                 if (likely(local_addr)){
1225                         su2ip_addr(&c->rcv.dst_ip, local_addr);
1226                         c->rcv.dst_port=su_getport(local_addr);
1227                 }else if (ba){
1228                         c->rcv.dst_ip=ba->address;
1229                         c->rcv.dst_port=ba->port_no;
1230                 }
1231         }
1232         c->rcv.bind_address=ba;
1233         print_ip("tcpconn_new: new tcp connection: ", &c->rcv.src_ip, "\n");
1234         LM_DBG("on port %d, type %d\n", c->rcv.src_port, type);
1235         init_tcp_req(&c->req, (char*)c+sizeof(struct tcp_connection), rd_b_size);
1236         c->id=(*connection_id)++;
1237         c->rcv.proto_reserved1=0; /* this will be filled before receive_message*/
1238         c->rcv.proto_reserved2=0;
1239         c->state=state;
1240         c->extra_data=0;
1241 #ifdef USE_TLS
1242         if (type==PROTO_TLS){
1243                 if (tls_tcpconn_init(c, sock)==-1) goto error;
1244         }else
1245 #endif /* USE_TLS*/
1246         {
1247                 c->type=PROTO_TCP;
1248                 c->rcv.proto=PROTO_TCP;
1249                 c->timeout=get_ticks_raw()+cfg_get(tcp, tcp_cfg, con_lifetime);
1250                 c->lifetime = cfg_get(tcp, tcp_cfg, con_lifetime);
1251         }
1252
1253         return c;
1254
1255 error:
1256         if (c) shm_free(c);
1257         return 0;
1258 }
1259
1260
1261
1262 /* do the actual connect, set sock. options a.s.o
1263  * returns socket on success, -1 on error
1264  * sets also *res_local_addr, res_si and state (S_CONN_CONNECT for an
1265  * unfinished connect and S_CONN_OK for a finished one)*/
1266 inline static int tcp_do_connect(       union sockaddr_union* server,
1267                                                                         union sockaddr_union* from,
1268                                                                         int type,
1269                                                                         snd_flags_t* send_flags,
1270                                                                         union sockaddr_union* res_local_addr,
1271                                                                         struct socket_info** res_si,
1272                                                                         enum tcp_conn_states *state
1273                                                                         )
1274 {
1275         int s;
1276         union sockaddr_union my_name;
1277         socklen_t my_name_len;
1278         struct ip_addr ip;
1279 #ifdef TCP_ASYNC
1280         int n;
1281 #endif /* TCP_ASYNC */
1282
1283         s=socket(AF2PF(server->s.sa_family), SOCK_STREAM, 0);
1284         if (unlikely(s==-1)){
1285                 LM_ERR("%s: socket: (%d) %s\n",
1286                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1287                 goto error;
1288         }
1289         if (init_sock_opt(s, server->s.sa_family)<0){
1290                 LM_ERR("%s: init_sock_opt failed\n",
1291                                         su2a(server, sizeof(*server)));
1292                 goto error;
1293         }
1294         
1295         if (unlikely(from && bind(s, &from->s, sockaddru_len(*from)) != 0)){
1296                 LM_WARN("binding to source address %s failed: %s [%d]\n",
1297                                         su2a(from, sizeof(*from)),
1298                                         strerror(errno), errno);
1299         }
1300         *state=S_CONN_OK;
1301 #ifdef TCP_ASYNC
1302         if (likely(cfg_get(tcp, tcp_cfg, async))){
1303 again:
1304                 n=connect(s, &server->s, sockaddru_len(*server));
1305                 if (likely(n==-1)){ /*non-blocking => most probable EINPROGRESS*/
1306                         if (likely(errno==EINPROGRESS))
1307                                 *state=S_CONN_CONNECT;
1308                         else if (errno==EINTR) goto again;
1309                         else if (errno!=EALREADY){
1310                                 switch(errno){
1311                                         case ENETUNREACH:
1312                                         case EHOSTUNREACH:
1313 #ifdef USE_DST_BLACKLIST
1314                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1315                                                                                         send_flags, 0);
1316 #endif /* USE_DST_BLACKLIST */
1317                                                 TCP_EV_CONNECT_UNREACHABLE(errno, 0, 0, server, type);
1318                                                 break;
1319                                         case ETIMEDOUT:
1320 #ifdef USE_DST_BLACKLIST
1321                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1322                                                                                         send_flags, 0);
1323 #endif /* USE_DST_BLACKLIST */
1324                                                 TCP_EV_CONNECT_TIMEOUT(errno, 0, 0, server, type);
1325                                                 break;
1326                                         case ECONNREFUSED:
1327                                         case ECONNRESET:
1328 #ifdef USE_DST_BLACKLIST
1329                                                 dst_blacklist_su(BLST_ERR_CONNECT, type, server,
1330                                                                                         send_flags, 0);
1331 #endif /* USE_DST_BLACKLIST */
1332                                                 TCP_EV_CONNECT_RST(errno, 0, 0, server, type);
1333                                                 break;
1334                                         case EAGAIN:/* not posix, but supported on linux and bsd */
1335                                                 TCP_EV_CONNECT_NO_MORE_PORTS(errno, 0, 0, server,type);
1336                                                 break;
1337                                         default:
1338                                                 TCP_EV_CONNECT_ERR(errno, 0, 0, server, type);
1339                                 }
1340                                 TCP_STATS_CONNECT_FAILED();
1341                                 LM_ERR("connect %s: (%d) %s\n",
1342                                                         su2a(server, sizeof(*server)),
1343                                                         errno, strerror(errno));
1344                                 goto error;
1345                         }
1346                 }
1347         }else{
1348 #endif /* TCP_ASYNC */
1349                 if (tcp_blocking_connect(s, type,  send_flags, &server->s,
1350                                                                         sockaddru_len(*server))<0){
1351                         LM_ERR("tcp_blocking_connect %s failed\n",
1352                                                 su2a(server, sizeof(*server)));
1353                         goto error;
1354                 }
1355 #ifdef TCP_ASYNC
1356         }
1357 #endif /* TCP_ASYNC */
1358         if (from){
1359                 su2ip_addr(&ip, from);
1360                 if (!ip_addr_any(&ip))
1361                         /* we already know the source ip, skip the sys. call */
1362                         goto find_socket;
1363         }
1364         my_name_len=sizeof(my_name);
1365         if (unlikely(getsockname(s, &my_name.s, &my_name_len)!=0)){
1366                 LM_ERR("getsockname failed: %s(%d)\n", strerror(errno), errno);
1367                 *res_si=0;
1368                 goto error;
1369         }
1370         from=&my_name; /* update from with the real "from" address */
1371         su2ip_addr(&ip, &my_name);
1372 find_socket:
1373 #ifdef USE_TLS
1374         if (unlikely(type==PROTO_TLS))
1375                 *res_si=find_si(&ip, 0, PROTO_TLS);
1376         else
1377 #endif
1378                 *res_si=find_si(&ip, 0, PROTO_TCP);
1379         
1380         if (unlikely(*res_si==0)){
1381                 LM_WARN("%s: could not find corresponding"
1382                                 " listening socket for %s, using default...\n",
1383                                         su2a(server, sizeof(*server)), ip_addr2a(&ip));
1384                 if (server->s.sa_family==AF_INET) *res_si=sendipv4_tcp;
1385                 else *res_si=sendipv6_tcp;
1386         }
1387         *res_local_addr=*from;
1388         return s;
1389 error:
1390         if (s!=-1) tcp_safe_close(s);
1391         return -1;
1392 }
1393
1394
1395
1396 struct tcp_connection* tcpconn_connect( union sockaddr_union* server,
1397                                                                                 union sockaddr_union* from,
1398                                                                                 int type, snd_flags_t* send_flags)
1399 {
1400         int s;
1401         struct socket_info* si;
1402         union sockaddr_union my_name;
1403         struct tcp_connection* con;
1404         enum tcp_conn_states state;
1405
1406         s=-1;
1407
1408         if (*tcp_connections_no >= cfg_get(tcp, tcp_cfg, max_connections)){
1409                 LM_ERR("maximum number of connections exceeded (%d/%d)\n",
1410                                         *tcp_connections_no,
1411                                         cfg_get(tcp, tcp_cfg, max_connections));
1412                 goto error;
1413         }
1414         if (unlikely(type==PROTO_TLS)) {
1415                 if (*tls_connections_no >= cfg_get(tcp, tcp_cfg, max_tls_connections)){
1416                         LM_ERR("maximum number of tls connections"
1417                                                 " exceeded (%d/%d)\n",
1418                                                 *tls_connections_no,
1419                                                 cfg_get(tcp, tcp_cfg, max_tls_connections));
1420                         goto error;
1421                 }
1422         }
1423
1424         s=tcp_do_connect(server, from, type,  send_flags, &my_name, &si, &state);
1425         if (s==-1){
1426                 LM_ERR("tcp_do_connect %s: failed (%d) %s\n",
1427                                 su2a(server, sizeof(*server)), errno, strerror(errno));
1428                 goto error;
1429         }
1430         con=tcpconn_new(s, server, &my_name, si, type, state);
1431         if (con==0){
1432                 LM_ERR("%s: tcpconn_new failed, closing the "
1433                                         " socket\n", su2a(server, sizeof(*server)));
1434                 goto error;
1435         }
1436         tcpconn_set_send_flags(con, *send_flags);
1437         return con;
1438 error:
1439         if (s!=-1) tcp_safe_close(s); /* close the opened socket */
1440         return 0;
1441 }
1442
1443
1444
#ifdef TCP_CONNECT_WAIT
/* Performs the actual connect for an already created connection "c"
 * (destination taken from c->rcv.src_su) and records the local side of the
 * socket in c->rcv (bind_address, dst_ip, dst_port).
 *
 * Afterwards the connection aliases are brought in line with the real
 * local address:
 *  - from==0: the (local_ip, 0) and (local_ip, local_port) aliases are added;
 *  - from!=0 but different from the address actually used: every alias
 *    except the first one is dropped and the two local-address aliases
 *    are added again;
 *  - from equal to the address actually used: aliases are left untouched.
 *
 * Returns the connected socket, or -1 if tcp_do_connect() failed. */
int tcpconn_finish_connect( struct tcp_connection* c,
							union sockaddr_union* from)
{
	union sockaddr_union bound_su;
	struct socket_info* bind_si;
	enum tcp_conn_states con_state;
	struct tcp_conn_alias* old_alias;
	int alias_flags;
	int rebuild;
	int sock;
	int idx;

	sock=tcp_do_connect(&c->rcv.src_su, from, c->type, &c->send_flags,
						&bound_su, &bind_si, &con_state);
	if (unlikely(sock==-1)){
		LM_ERR("%s: tcp_do_connect for %p failed\n",
					su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)), c);
		return -1;
	}
	c->rcv.bind_address=bind_si;
	su2ip_addr(&c->rcv.dst_ip, &bound_su);
	c->rcv.dst_port=su_getport(&bound_su);

	/* a requested source address that matches the one really used means
	 * the existing aliases are already correct */
	rebuild=(from!=0);
	if (rebuild && su_cmp(from, &bound_su)==1)
		return sock;

	alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
	TCPCONN_LOCK;
	if (rebuild && (c->aliases > 1)){
		/* unhash everything but the first alias (there shouldn't be more
		 * than the 3 default aliases at this stage) */
		for (idx=1; idx<c->aliases; idx++){
			old_alias=&c->con_aliases[idx];
			tcpconn_listrm(tcpconn_aliases_hash[old_alias->hash],
							old_alias, next, prev);
		}
		c->aliases=1;
	}
	/* local_ip:0 alias followed by the local_ip:local_port one */
	_tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
								alias_flags);
	_tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
								c->rcv.dst_port, alias_flags);
	TCPCONN_UNLOCK;

	return sock;
}
#endif /* TCP_CONNECT_WAIT */
1502
1503
1504
1505 /* adds a tcp connection to the tcpconn hashes
1506  * Note: it's called _only_ from the tcp_main process */
1507 inline static struct tcp_connection*  tcpconn_add(struct tcp_connection *c)
1508 {
1509         struct ip_addr zero_ip;
1510         int new_conn_alias_flags;
1511
1512         if (likely(c)){
1513                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1514                 c->id_hash=tcp_id_hash(c->id);
1515                 c->aliases=0;
1516                 new_conn_alias_flags=cfg_get(tcp, tcp_cfg, new_conn_alias_flags);
1517                 TCPCONN_LOCK;
1518                 c->flags|=F_CONN_HASHED;
1519                 /* add it at the begining of the list*/
1520                 tcpconn_listadd(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1521                 /* set the aliases */
1522                 /* first alias is for (peer_ip, peer_port, 0 ,0) -- for finding
1523                  *  any connection to peer_ip, peer_port
1524                  * the second alias is for (peer_ip, peer_port, local_addr, 0) -- for
1525                  *  finding any conenction to peer_ip, peer_port from local_addr 
1526                  * the third alias is for (peer_ip, peer_port, local_addr, local_port) 
1527                  *   -- for finding if a fully specified connection exists */
1528                 _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &zero_ip, 0,
1529                                                                                                         new_conn_alias_flags);
1530                 if (likely(c->rcv.dst_ip.af && ! ip_addr_any(&c->rcv.dst_ip))){
1531                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip, 0,
1532                                                                                                         new_conn_alias_flags);
1533                         _tcpconn_add_alias_unsafe(c, c->rcv.src_port, &c->rcv.dst_ip,
1534                                                                         c->rcv.dst_port, new_conn_alias_flags);
1535                 }
1536                 /* ignore add_alias errors, there are some valid cases when one
1537                  *  of the add_alias would fail (e.g. first add_alias for 2 connections
1538                  *   with the same destination but different src. ip*/
1539                 TCPCONN_UNLOCK;
1540                 LM_DBG("hashes: %d:%d:%d, %d\n",
1541                                                                                                 c->con_aliases[0].hash,
1542                                                                                                 c->con_aliases[1].hash,
1543                                                                                                 c->con_aliases[2].hash,
1544                                                                                                 c->id_hash);
1545                 return c;
1546         }else{
1547                 LM_CRIT("null connection pointer\n");
1548                 return 0;
1549         }
1550 }
1551
1552
1553 static inline void _tcpconn_detach(struct tcp_connection *c)
1554 {
1555         int r;
1556         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1557         /* remove all the aliases */
1558         for (r=0; r<c->aliases; r++)
1559                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1560                                                 &c->con_aliases[r], next, prev);
1561         c->aliases = 0;
1562 }
1563
1564
1565
1566 static inline void _tcpconn_free(struct tcp_connection* c)
1567 {
1568 #ifdef TCP_ASYNC
1569         if (unlikely(_wbufq_non_empty(c)))
1570                 _wbufq_destroy(&c->wbuf_q);
1571 #endif
1572         lock_destroy(&c->write_lock);
1573 #ifdef USE_TLS
1574         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) tls_tcpconn_clean(c);
1575 #endif
1576         shm_free(c);
1577 }
1578
1579
1580
/* Lock-free variant of tcpconn_rm(): detaches "c" from the id and alias
 * hashes, then frees it. No locking is performed here. */
void _tcpconn_rm(struct tcp_connection* c)
{
	_tcpconn_detach(c); /* unhash first ... */
	_tcpconn_free(c);   /* ... then release the connection */
}
1587
1588
1589
1590 void tcpconn_rm(struct tcp_connection* c)
1591 {
1592         int r;
1593         TCPCONN_LOCK;
1594         tcpconn_listrm(tcpconn_id_hash[c->id_hash], c, id_next, id_prev);
1595         /* remove all the aliases */
1596         for (r=0; r<c->aliases; r++)
1597                 tcpconn_listrm(tcpconn_aliases_hash[c->con_aliases[r].hash], 
1598                                                 &c->con_aliases[r], next, prev);
1599         c->aliases = 0;
1600         TCPCONN_UNLOCK;
1601         lock_destroy(&c->write_lock);
1602 #ifdef USE_TLS
1603         if ((c->type==PROTO_TLS || c->type==PROTO_WSS)&&(c->extra_data)) tls_tcpconn_clean(c);
1604 #endif
1605         shm_free(c);
1606 }
1607
1608
1609 /* finds a connection, if id=0 uses the ip addr, port, local_ip and local port
1610  *  (host byte order) and tries to find the connection that matches all of
1611  *   them. Wild cards can be used for local_ip and local_port (a 0 filled
1612  *   ip address and/or a 0 local port).
1613  * WARNING: unprotected (locks) use tcpconn_get unless you really
1614  * know what you are doing */
1615 struct tcp_connection* _tcpconn_find(int id, struct ip_addr* ip, int port,
1616                                                                                 struct ip_addr* l_ip, int l_port)
1617 {
1618
1619         struct tcp_connection *c;
1620         struct tcp_conn_alias* a;
1621         unsigned hash;
1622         int is_local_ip_any;
1623         
1624 #ifdef EXTRA_DEBUG
1625         LM_DBG("%d  port %d\n",id, port);
1626         if (ip) print_ip("tcpconn_find: ip ", ip, "\n");
1627 #endif
1628         if (likely(id)){
1629                 hash=tcp_id_hash(id);
1630                 for (c=tcpconn_id_hash[hash]; c; c=c->id_next){
1631 #ifdef EXTRA_DEBUG
1632                         LM_DBG("c=%p, c->id=%d, port=%d\n", c, c->id, c->rcv.src_port);
1633                         print_ip("ip=", &c->rcv.src_ip, "\n");
1634 #endif
1635                         if ((id==c->id)&&(c->state!=S_CONN_BAD)) return c;
1636                 }
1637         }else if (likely(ip)){
1638                 hash=tcp_addr_hash(ip, port, l_ip, l_port);
1639                 is_local_ip_any=ip_addr_any(l_ip);
1640                 for (a=tcpconn_aliases_hash[hash]; a; a=a->next){
1641 #ifdef EXTRA_DEBUG
1642                         LM_DBG("a=%p, c=%p, c->id=%d, alias port= %d port=%d\n", a, a->parent,
1643                                         a->parent->id, a->port, a->parent->rcv.src_port);
1644                         print_ip("ip=",&a->parent->rcv.src_ip,"\n");
1645 #endif
1646                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1647                                         ((l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1648                                         (ip_addr_cmp(ip, &a->parent->rcv.src_ip)) &&
1649                                         (is_local_ip_any ||
1650                                                 ip_addr_cmp(l_ip, &a->parent->rcv.dst_ip))
1651                                 )
1652                                 return a->parent;
1653                 }
1654         }
1655         return 0;
1656 }
1657
1658
1659 /**
1660  * find if a tcp connection exits by id or remote+local address/port
1661  * - return: 1 if found; 0 if not found
1662  */
1663 int tcpconn_exists(int conn_id, ip_addr_t* peer_ip, int peer_port,
1664                                                 ip_addr_t* local_ip, int local_port)
1665 {
1666         tcp_connection_t* c;
1667
1668         TCPCONN_LOCK;
1669         c=_tcpconn_find(conn_id, peer_ip, peer_port, local_ip, local_port);
1670         TCPCONN_UNLOCK;
1671         if (c) {
1672                 return 1;
1673         }
1674         return 0;
1675
1676 }
1677
1678 /* _tcpconn_find with locks and timeout
1679  * local_addr contains the desired local ip:port. If null any local address 
1680  * will be used.  IN*ADDR_ANY or 0 port are wild cards.
1681  * If found, the connection's reference counter will be incremented, you might
1682  * want to decrement it after use.
1683  */
1684 struct tcp_connection* tcpconn_get(int id, struct ip_addr* ip, int port,
1685                                                                         union sockaddr_union* local_addr,
1686                                                                         ticks_t timeout)
1687 {
1688         struct tcp_connection* c;
1689         struct ip_addr local_ip;
1690         int local_port;
1691         
1692         local_port=0;
1693         if (likely(ip)){
1694                 if (unlikely(local_addr)){
1695                         su2ip_addr(&local_ip, local_addr);
1696                         local_port=su_getport(local_addr);
1697                 }else{
1698                         ip_addr_mk_any(ip->af, &local_ip);
1699                         local_port=0;
1700                 }
1701         }
1702         TCPCONN_LOCK;
1703         c=_tcpconn_find(id, ip, port, &local_ip, local_port);
1704         if (likely(c)){ 
1705                         atomic_inc(&c->refcnt);
1706                         /* update the timeout only if the connection is not handled
1707                          * by a tcp reader _and_the timeout is non-zero  (the tcp
1708                          * reader process uses c->timeout for its own internal
1709                          * timeout and c->timeout will be overwritten * anyway on
1710                          * return to tcp_main) */
1711                         if (likely(c->reader_pid==0 && timeout != 0))
1712                                 c->timeout=get_ticks_raw()+timeout;
1713         }
1714         TCPCONN_UNLOCK;
1715         return c;
1716 }
1717
1718
1719
1720 /* add c->dst:port, local_addr as an alias for the "id" connection, 
1721  * flags: TCP_ALIAS_FORCE_ADD  - add an alias even if a previous one exists
1722  *        TCP_ALIAS_REPLACE    - if a prev. alias exists, replace it with the
1723  *                                new one
1724  * returns 0 on success, <0 on failure ( -1  - null c, -2 too many aliases,
1725  *  -3 alias already present and pointing to another connection)
1726  * WARNING: must be called with TCPCONN_LOCK held */
1727 inline static int _tcpconn_add_alias_unsafe(struct tcp_connection* c, int port,
1728                                                                                 struct ip_addr* l_ip, int l_port,
1729                                                                                 int flags)
1730 {
1731         unsigned hash;
1732         struct tcp_conn_alias* a;
1733         struct tcp_conn_alias* nxt;
1734         struct tcp_connection* p;
1735         int is_local_ip_any;
1736         int i;
1737         int r;
1738         
1739         a=0;
1740         is_local_ip_any=ip_addr_any(l_ip);
1741         if (likely(c)){
1742                 hash=tcp_addr_hash(&c->rcv.src_ip, port, l_ip, l_port);
1743                 /* search the aliases for an already existing one */
1744                 for (a=tcpconn_aliases_hash[hash], nxt=0; a; a=nxt){
1745                         nxt=a->next;
1746                         if ( (a->parent->state!=S_CONN_BAD) && (port==a->port) &&
1747                                         ( (l_port==0) || (l_port==a->parent->rcv.dst_port)) &&
1748                                         (ip_addr_cmp(&c->rcv.src_ip, &a->parent->rcv.src_ip)) &&
1749                                         ( is_local_ip_any || 
1750                                           ip_addr_cmp(&a->parent->rcv.dst_ip, l_ip))
1751                                         ){
1752                                 /* found */
1753                                 if (unlikely(a->parent!=c)){
1754                                         if (flags & TCP_ALIAS_FORCE_ADD)
1755                                                 /* still have to walk the whole list to check if
1756                                                  * the alias was not already added */
1757                                                 continue;
1758                                         else if (flags & TCP_ALIAS_REPLACE){
1759                                                 /* remove the alias =>
1760                                                  * remove the current alias and all the following
1761                                                  *  ones from the corresponding connection, shift the 
1762                                                  *  connection aliases array and re-add the other 
1763                                                  *  aliases (!= current one) */
1764                                                 p=a->parent;
1765                                                 for (i=0; (i<p->aliases) && (&(p->con_aliases[i])!=a);
1766                                                                 i++);
1767                                                 if (unlikely(i==p->aliases)){
1768                                                         LM_CRIT("alias %p not found in con %p (id %d)\n",
1769                                                                         a, p, p->id);
1770                                                         goto error_not_found;
1771                                                 }
1772                                                 for (r=i; r<p->aliases; r++){
1773                                                         tcpconn_listrm(
1774                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash],
1775                                                                 &p->con_aliases[r], next, prev);
1776                                                 }
1777                                                 if (likely((i+1)<p->aliases)){
1778                                                         memmove(&p->con_aliases[i], &p->con_aliases[i+1],
1779                                                                                         (p->aliases-i-1)*
1780                                                                                                 sizeof(p->con_aliases[0]));
1781                                                 }
1782                                                 p->aliases--;
1783                                                 /* re-add the remaining aliases */
1784                                                 for (r=i; r<p->aliases; r++){
1785                                                         tcpconn_listadd(
1786                                                                 tcpconn_aliases_hash[p->con_aliases[r].hash], 
1787                                                                 &p->con_aliases[r], next, prev);
1788                                                 }
1789                                         }else
1790                                                 goto error_sec;
1791                                 }else goto ok;
1792                         }
1793                 }
1794                 if (unlikely(c->aliases>=TCP_CON_MAX_ALIASES)) goto error_aliases;
1795                 c->con_aliases[c->aliases].parent=c;
1796                 c->con_aliases[c->aliases].port=port;
1797                 c->con_aliases[c->aliases].hash=hash;
1798                 tcpconn_listadd(tcpconn_aliases_hash[hash], 
1799                                                                 &c->con_aliases[c->aliases], next, prev);
1800                 c->aliases++;
1801         }else goto error_not_found;
1802 ok:
1803 #ifdef EXTRA_DEBUG
1804         if (a) LM_DBG("alias already present\n");
1805         else   LM_DBG("alias port %d for hash %d, id %d\n",
1806                         port, hash, c->id);
1807 #endif
1808         return 0;
1809 error_aliases:
1810         /* too many aliases */
1811         return -2;
1812 error_not_found:
1813         /* null connection */
1814         return -1;
1815 error_sec:
1816         /* alias already present and pointing to a different connection
1817          * (hijack attempt?) */
1818         return -3;
1819 }
1820
1821
1822
1823 /* add port as an alias for the "id" connection, 
1824  * returns 0 on success,-1 on failure */
1825 int tcpconn_add_alias(int id, int port, int proto)
1826 {
1827         struct tcp_connection* c;
1828         int ret;
1829         struct ip_addr zero_ip;
1830         int r;
1831         int alias_flags;
1832         
1833         /* fix the port */
1834         port=port?port:((proto==PROTO_TLS)?SIPS_PORT:SIP_PORT);
1835         TCPCONN_LOCK;
1836         /* check if alias already exists */
1837         c=_tcpconn_find(id, 0, 0, 0, 0);
1838         if (likely(c)){
1839                 ip_addr_mk_any(c->rcv.src_ip.af, &zero_ip);
1840                 alias_flags=cfg_get(tcp, tcp_cfg, alias_flags);
1841                 /* alias src_ip:port, 0, 0 */
1842                 ret=_tcpconn_add_alias_unsafe(c, port,  &zero_ip, 0, 
1843                                                                                 alias_flags);
1844                 if (ret<0 && ret!=-3) goto error;
1845                 /* alias src_ip:port, local_ip, 0 */
1846                 ret=_tcpconn_add_alias_unsafe(c, port,  &c->rcv.dst_ip, 0, 
1847                                                                                 alias_flags);
1848                 if (ret<0 && ret!=-3) goto error;
1849                 /* alias src_ip:port, local_ip, local_port */
1850                 ret=_tcpconn_add_alias_unsafe(c, port, &c->rcv.dst_ip, c->rcv.dst_port,
1851                                                                                 alias_flags);
1852                 if (unlikely(ret<0)) goto error;
1853         }else goto error_not_found;
1854         TCPCONN_UNLOCK;
1855         return 0;
1856 error_not_found:
1857         TCPCONN_UNLOCK;
1858         LM_ERR("no connection found for id %d\n",id);
1859         return -1;
1860 error:
1861         TCPCONN_UNLOCK;
1862         switch(ret){
1863                 case -2:
1864                         LM_ERR("too many aliases (%d) for connection %p (id %d) %s:%d <- %d\n",
1865                                         c->aliases, c, c->id, ip_addr2a(&c->rcv.src_ip),
1866                                         c->rcv.src_port, port);
1867                         for (r=0; r<c->aliases; r++){
1868                                 LM_ERR("alias %d: for %p (%d) %s:%d <-%d hash %x\n",  r, c, c->id, 
1869                                                 ip_addr2a(&c->rcv.src_ip), c->rcv.src_port, 
1870                                                 c->con_aliases[r].port, c->con_aliases[r].hash);
1871                         }
1872                         break;
1873                 case -3:
1874                         LM_ERR("possible port hijack attempt\n");
1875                         LM_ERR("alias for %d port %d already"
1876                                                 " present and points to another connection \n",
1877                                                 c->id, port);
1878                         break;
1879                 default:
1880                         LM_ERR("unknown error %d\n", ret);
1881         }
1882         return -1;
1883 }
1884
1885
1886
1887 #ifdef TCP_FD_CACHE
1888
1889 static void tcp_fd_cache_init(void)
1890 {
1891         int r;
1892         for (r=0; r<TCP_FD_CACHE_SIZE; r++)
1893                 fd_cache[r].fd=-1;
1894 }
1895
1896
1897 inline static struct fd_cache_entry* tcp_fd_cache_get(struct tcp_connection *c)
1898 {
1899         int h;
1900         
1901         h=c->id%TCP_FD_CACHE_SIZE;
1902         if ((fd_cache[h].fd>0) && (fd_cache[h].id==c->id) && (fd_cache[h].con==c))
1903                 return &fd_cache[h];
1904         return 0;
1905 }
1906
1907
1908 inline static void tcp_fd_cache_rm(struct fd_cache_entry* e)
1909 {
1910         e->fd=-1;
1911 }
1912
1913
1914 inline static void tcp_fd_cache_add(struct tcp_connection *c, int fd)
1915 {
1916         int h;
1917         
1918         h=c->id%TCP_FD_CACHE_SIZE;
1919         if (likely(fd_cache[h].fd>0))
1920                 tcp_safe_close(fd_cache[h].fd);
1921         fd_cache[h].fd=fd;
1922         fd_cache[h].id=c->id;
1923         fd_cache[h].con=c;
1924 }
1925
1926 #endif /* TCP_FD_CACHE */
1927
1928
1929
1930 inline static int tcpconn_chld_put(struct tcp_connection* tcpconn);
1931
1932 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
1933                                                         unsigned len, snd_flags_t send_flags);
1934 static int tcpconn_do_send(int fd, struct tcp_connection* c,
1935                                                         const char* buf, unsigned len,
1936                                                         snd_flags_t send_flags, long* resp, int locked);
1937
1938 static int tcpconn_1st_send(int fd, struct tcp_connection* c,
1939                                                         const char* buf, unsigned len,
1940                                                         snd_flags_t send_flags, long* resp, int locked);
1941
1942 /* finds a tcpconn & sends on it
1943  * uses the dst members to, proto (TCP|TLS) and id and tries to send
1944  *  from the "from" address (if non null and id==0)
1945  * returns: number of bytes written (>=0) on success
1946  *          <0 on error */
1947 int tcp_send(struct dest_info* dst, union sockaddr_union* from,
1948                                         const char* buf, unsigned len)
1949 {
1950         struct tcp_connection *c;
1951         struct ip_addr ip;
1952         int port;
1953         int fd;
1954         long response[2];
1955         int n;
1956         ticks_t con_lifetime;
1957 #ifdef USE_TLS
1958         const char* rest_buf;
1959         const char* t_buf;
1960         unsigned rest_len, t_len;
1961         long resp;
1962         snd_flags_t t_send_flags;
1963 #endif /* USE_TLS */
1964
1965         port=su_getport(&dst->to);
1966         con_lifetime=cfg_get(tcp, tcp_cfg, con_lifetime);
1967         if (likely(port)){
1968                 su2ip_addr(&ip, &dst->to);
1969                 c=tcpconn_get(dst->id, &ip, port, from, con_lifetime);
1970         }else if (likely(dst->id)){
1971                 c=tcpconn_get(dst->id, 0, 0, 0, con_lifetime);
1972         }else{
1973                 LM_CRIT("null id & to\n");
1974                 return -1;
1975         }
1976
1977         if (likely(dst->id)){
1978                 if (unlikely(c==0)) {
1979                         if (likely(port)){
1980                                 /* try again w/o id */
1981                                 c=tcpconn_get(0, &ip, port, from, con_lifetime);
1982                         }else{
1983                                 LM_ERR("id %d not found, dropping\n", dst->id);
1984                                 return -1;
1985                         }
1986                 }
1987         }
1988         /* connection not found or unusable => open a new one and send on it */
1989         if (unlikely((c==0) || tcpconn_close_after_send(c))){
1990                 if (unlikely(c)){
1991                         /* can't use c if it's marked as close-after-send  =>
1992                          * release it and try opening new one */
1993                         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
1994                         c=0;
1995                 }
1996                 /* check if connect() is disabled */
1997                 if (unlikely((dst->send_flags.f & SND_F_FORCE_CON_REUSE) ||
1998                                                 cfg_get(tcp, tcp_cfg, no_connect)))
1999                         return -1;
2000                 LM_DBG("no open tcp connection found, opening new one\n");
2001                 /* create tcp connection */
2002                 if (likely(from==0)){
2003                         /* check to see if we have to use a specific source addr. */
2004                         switch (dst->to.s.sa_family) {
2005                                 case AF_INET:
2006                                                 from = tcp_source_ipv4;
2007                                         break;
2008                                 case AF_INET6:
2009                                                 from = tcp_source_ipv6;
2010                                         break;
2011                                 default:
2012                                         /* error, bad af, ignore ... */
2013                                         break;
2014                         }
2015                 }
2016 #if defined(TCP_CONNECT_WAIT) && defined(TCP_ASYNC)
2017                 if (likely(cfg_get(tcp, tcp_cfg, tcp_connect_wait) &&
2018                                         cfg_get(tcp, tcp_cfg, async) )){
2019                         if (unlikely(*tcp_connections_no >=
2020                                                         cfg_get(tcp, tcp_cfg, max_connections))){
2021                                 LM_ERR("%s: maximum number of connections exceeded (%d/%d)\n",
2022                                                         su2a(&dst->to, sizeof(dst->to)),
2023                                                         *tcp_connections_no,
2024                                                         cfg_get(tcp, tcp_cfg, max_connections));
2025                                 return -1;
2026                         }
2027                         if (unlikely(dst->proto==PROTO_TLS)) {
2028                                 if (unlikely(*tls_connections_no >=
2029                                                         cfg_get(tcp, tcp_cfg, max_tls_connections))){
2030                                         LM_ERR("%s: maximum number of tls connections exceeded (%d/%d)\n",
2031                                                         su2a(&dst->to, sizeof(dst->to)),
2032                                                         *tls_connections_no,
2033                                                         cfg_get(tcp, tcp_cfg, max_tls_connections));
2034                                         return -1;
2035                                 }
2036                         }
2037                         c=tcpconn_new(-1, &dst->to, from, 0, dst->proto,
2038                                                         S_CONN_CONNECT);
2039                         if (unlikely(c==0)){
2040                                 LM_ERR("%s: could not create new connection\n",
2041                                                 su2a(&dst->to, sizeof(dst->to)));
2042                                 return -1;
2043                         }
2044                         c->flags|=F_CONN_PENDING|F_CONN_FD_CLOSED;
2045                         tcpconn_set_send_flags(c, dst->send_flags);
2046                         atomic_set(&c->refcnt, 2); /* ref from here and from main hash
2047                                                                                 * table */
2048                         /* add it to id hash and aliases */
2049                         if (unlikely(tcpconn_add(c)==0)){
2050                                 LM_ERR("%s: could not add connection %p\n",
2051                                                 su2a(&dst->to, sizeof(dst->to)), c);
2052                                 _tcpconn_free(c);
2053                                 n=-1;
2054                                 goto end_no_conn;
2055                         }
2056                         /* do connect and if src ip or port changed, update the
2057                          * aliases */
2058                         if (unlikely((fd=tcpconn_finish_connect(c, from))<0)){
2059                                 /* tcpconn_finish_connect will automatically blacklist
2060                                  * on error => no need to do it here */
2061                                 LM_ERR("%s: tcpconn_finish_connect(%p) failed\n",
2062                                                 su2a(&dst->to, sizeof(dst->to)), c);
2063                                 goto conn_wait_error;
2064                         }
2065                         if(c->flags & F_CONN_NOSEND) {
2066                                 /* connection marked as no-send data
2067                                  * (e.g., drop() from tls event route)*/
2068                                 LM_INFO("%s: connection marked for no-send (%p)\n",
2069                                                 su2a(&dst->to, sizeof(dst->to)), c);
2070                                 goto conn_wait_error;
2071                         }
2072                         /* ? TODO: it might be faster just to queue the write directly
2073                          *  and send to main CONN_NEW_PENDING_WRITE */
2074                         /* delay sending the fd to main after the send */
2075
2076                         /* NOTE: no lock here, because the connection is marked as
2077                          * pending and nobody else will try to write on it. However
2078                          * this might produce out-of-order writes. If this is not
2079                          * desired either lock before the write or use
2080                          * _wbufq_insert(...)
2081                          * NOTE2: _wbufq_insert() is used now (no out-of-order).
2082                          */
2083 #ifdef USE_TLS
2084                         if (unlikely(c->type==PROTO_TLS)) {
2085                                 /* for TLS the TLS processing and the send must happen
2086                                  * atomically w/ respect to other sends on the same connection
2087                                  * (otherwise reordering might occur which would break TLS) =>
2088                                  * lock. However in this case this send will always be the first.
2089                                  * We can have the send() outside the lock only if this is the
2090                                  * first and only send (tls_encode is not called again), or
2091                                  * this is the last send for a tls_encode() loop and all the
2092                                  * previous ones did return CONN_NEW_COMPLETE or CONN_EOF.
2093                                  */
2094                                 response[1] = CONN_NOP;
2095                                 t_buf = buf;
2096                                 t_len = len;
2097                                 lock_get(&c->write_lock);
2098 redo_tls_encode:
2099                                         t_send_flags = dst->send_flags;
2100                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2101                                                                         &t_send_flags);
2102                                         /* There are 4 cases:
2103                                          *  1. entire buffer consumed from the first try
2104                                          *    (rest_len == rest_buf == 0)
2105                                          *  2. rest_buf & first call
2106                                          *  3. rest_buf & not first call
2107                                          *        3a. CONN_NEW_COMPLETE or CONN_EOF
2108                                          *        3b. CONN_NEW_PENDING_WRITE
2109                                          *  4. entire buffer consumed, but not first call
2110                                          *      4a. CONN_NEW_COMPLETE or CONN_EOF
2111                                          *         4b. CONN_NEW_PENDING_WRITE
2112                                          *      We misuse response[1] == CONN_NOP to test for the
2113                                          *      first call.
2114                                          */
2115                                         if (unlikely(n < 0)) {
2116                                                 lock_release(&c->write_lock);
2117                                                 goto conn_wait_error;
2118                                         }
2119                                         if (likely(rest_len == 0)) {
2120                                                 /* 1 or 4*: CONN_NEW_COMPLETE, CONN_EOF,  CONN_NOP
2121                                                  * or CONN_NEW_PENDING_WRITE (*rest_len == 0) */
2122                                                 if (likely(response[1] != CONN_NEW_PENDING_WRITE)) {
2123                                                         /* 1 or 4a => it's safe to do the send outside the
2124                                                          * lock (it will either send directly or
2125                                                          * wbufq_insert())
2126                                                          */
2127                                                         lock_release(&c->write_lock);
2128                                                         if (likely(t_len != 0)) {
2129                                                                 n=tcpconn_1st_send(fd, c, t_buf, t_len,
2130                                                                                                         t_send_flags,
2131                                                                                                         &response[1], 0);
2132                                                         } else { /* t_len == 0 */
2133                                                                 if (response[1] == CONN_NOP) {
2134                                                                         /* nothing to send (e.g  parallel send
2135                                                                          * tls_encode queues some data and then
2136                                                                          * WANT_READ => this tls_encode will queue
2137                                                                          * the cleartext too and will have nothing
2138                                                                          * to send right now) and initial send =>
2139                                                                          * behave as if the send was successful
2140                                                                          * (but never return EOF here) */
2141                                                                         response[1] = CONN_NEW_COMPLETE;
2142                                                                 }
2143                                                         }
2144                                                         /* exit */
2145                                                 } else {
2146                                                         /* CONN_NEW_PENDING_WRITE:  4b: it was a
2147                                                          * repeated tls_encode() (or otherwise we would
2148                                                          * have here CONN_NOP) => add to the queue */
2149                                                         if (unlikely(t_len &&
2150                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
2151                                                                 response[1] = CONN_ERROR;
2152                                                                 n = -1;
2153                                                         }
2154                                                         lock_release(&c->write_lock);
2155                                                         /* exit (no send) */
2156                                                 }
2157                                         } else {  /* rest_len != 0 */
2158                                                 /* 2 or 3*: if tls_encode hasn't finished, we have to
2159                                                  * call tcpconn_1st_send() under lock (otherwise if it
2160                                                  * returns CONN_NEW_PENDING_WRITE, there is no way
2161                                                  * to find the right place to add the new queued
2162                                                  * data from the 2nd tls_encode()) */
2163                                                 if (likely((response[1] == CONN_NOP /*2*/ ||
2164                                                                         response[1] == CONN_NEW_COMPLETE /*3a*/ ||
2165                                                                         response[1] == CONN_EOF /*3a*/) && t_len))
2166                                                         n = tcpconn_1st_send(fd, c, t_buf, t_len,
2167                                                                                                         t_send_flags,
2168                                                                                                         &response[1], 1);
2169                                                 else if (unlikely(t_len &&
2170                                                                                         _wbufq_add(c, t_buf, t_len) < 0)) {
2171                                                         /*3b: CONN_NEW_PENDING_WRITE*/
2172                                                         response[1] = CONN_ERROR;
2173                                                         n = -1;
2174                                                 }
2175                                                 if (likely(n >= 0)) {
2176                                                         /* if t_len == 0 => nothing was sent => previous
2177                                                          * response will be kept */
2178                                                         t_buf = rest_buf;
2179                                                         t_len = rest_len;
2180                                                         goto redo_tls_encode;
2181                                                 } else {
2182                                                         lock_release(&c->write_lock);
2183                                                         /* error exit */
2184                                                 }
2185                                         }
2186                         } else
2187 #endif /* USE_TLS */
2188                                 n=tcpconn_1st_send(fd, c, buf, len, dst->send_flags,
2189                                                                         &response[1], 0);
2190                         if (unlikely(n<0)) /* this will catch CONN_ERROR too */
2191                                 goto conn_wait_error;
2192                         if (unlikely(response[1]==CONN_EOF)){
2193                                 /* if close-after-send requested, don't bother
2194                                  * sending the fd back to tcp_main, try closing it
2195                                  * immediately (no other tcp_send should use it,
2196                                  * because it is marked as close-after-send before
2197                                  * being added to the hash) */
2198                                 goto conn_wait_close;
2199                         }
2200                         /* send to tcp_main */
2201                         response[0]=(long)c;
2202                         if (unlikely(send_fd(unix_tcp_sock, response,
2203                                                                         sizeof(response), fd) <= 0)){
2204                                 LM_ERR("%s: %ld for %p failed:" " %s (%d)\n",
2205                                                         su2a(&dst->to, sizeof(dst->to)),
2206                                                         response[1], c, strerror(errno), errno);
2207                                 goto conn_wait_error;
2208                         }
2209                         goto conn_wait_success;
2210                 }
2211 #endif /* TCP_CONNECT_WAIT  && TCP_ASYNC */
2212                 if (unlikely((c=tcpconn_connect(&dst->to, from, dst->proto,
2213                                                                                 &dst->send_flags))==0)){
2214                         LM_ERR("%s: connect failed\n", su2a(&dst->to, sizeof(dst->to)));
2215                         return -1;
2216                 }
2217                 if(c->flags & F_CONN_NOSEND) {
2218                         /* connection marked as no-send data
2219                          * (e.g., drop() from tls event route)*/
2220                         LM_INFO("%s: connection marked for no-send (%p)\n",
2221                                         su2a(&dst->to, sizeof(dst->to)), c);
2222                         /* we can safely delete it, it's not referenced by anybody */
2223                         _tcpconn_free(c);
2224                         n=-1;
2225                         goto end_no_conn;
2226                 }
2227                 tcpconn_set_send_flags(c, dst->send_flags);
2228                 if (likely(c->state==S_CONN_OK))
2229                         TCP_STATS_ESTABLISHED(S_CONN_CONNECT);
2230                 atomic_set(&c->refcnt, 2); /* ref. from here and it will also
2231                                                                         * be added in the tcp_main hash */
2232                 fd=c->s;
2233                 c->flags|=F_CONN_FD_CLOSED; /* not yet opened in main */
2234                 /* ? TODO: it might be faster just to queue the write and
2235                  * send to main a CONN_NEW_PENDING_WRITE */
2236
2237                 /* send the new tcpconn to "tcp main" */
2238                 response[0]=(long)c;
2239                 response[1]=CONN_NEW;
2240                 n=send_fd(unix_tcp_sock, response, sizeof(response), c->s);
2241                 if (unlikely(n<=0)){
2242                         LM_ERR("%s: failed send_fd: %s (%d)\n",
2243                                         su2a(&dst->to, sizeof(dst->to)),
2244                                         strerror(errno), errno);
2245                         /* we can safely delete it, it's not referenced by anybody */
2246                         _tcpconn_free(c);
2247                         n=-1;
2248                         goto end_no_conn;
2249                 }
2250                 /* new connection => send on it directly */
2251 #ifdef USE_TLS
2252                 if (unlikely(c->type==PROTO_TLS)) {
2253                         /* for TLS the TLS processing and the send must happen
2254                          * atomically w/ respect to other sends on the same connection
2255                          * (otherwise reordering might occur which would break TLS) =>
2256                          * lock.
2257                         */
2258                         response[1] = CONN_NOP;
2259                         t_buf = buf;
2260                         t_len = len;
2261                         lock_get(&c->write_lock);
2262                                 do {
2263                                         t_send_flags = dst->send_flags;
2264                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2265                                                                         &t_send_flags);
2266                                         if (likely(n > 0)) {
2267                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2268                                                                                                 &resp, 1);
2269                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2270                                                                         resp == CONN_ERROR))
2271                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2272                                                          * unless error */
2273                                                         response[1] = resp;
2274                                         } else  if (unlikely(n < 0)) {
2275                                                 response[1] = CONN_ERROR;
2276                                                 break;
2277                                         }
2278                                         /* else do nothing for n (t_len) == 0, keep
2279                                          * the last response */
2280                                         t_buf = rest_buf;
2281                                         t_len = rest_len;
2282                                 } while(unlikely(rest_len && n > 0));
2283                         lock_release(&c->write_lock);
2284                 } else
2285 #endif /* USE_TLS */
2286                         n = tcpconn_do_send(fd, c, buf, len, dst->send_flags,
2287                                                                         &response[1], 0);
2288                 if (unlikely(response[1] != CONN_NOP)) {
2289                         response[0]=(long)c;
2290                         if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2291                                 BUG("tcp_main command %ld sending failed (write):"
2292                                                 "%s (%d)\n", response[1], strerror(errno), errno);
2293                                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2294                                  * (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec
2295                                  * refcnt => if sending the command fails we have to
2296                                  * dec. refcnt by hand */
2297                                 tcpconn_chld_put(c); /* deref. it manually */
2298                                 n=-1;
2299                         }
2300                         /* here refcnt for c is already decremented => c contents can
2301                          * no longer be used and refcnt _must_ _not_ be decremented
2302                          * again on exit */
2303                         if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2304                                 /* on error or eof, close fd */
2305                                 tcp_safe_close(fd);
2306                         } else if (response[1] == CONN_QUEUED_WRITE) {
2307 #ifdef TCP_FD_CACHE
2308                                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2309                                         tcp_fd_cache_add(c, fd);
2310                                 } else
2311 #endif /* TCP_FD_CACHE */
2312                                         tcp_safe_close(fd);
2313                         } else {
2314                                 BUG("unexpected tcpconn_do_send() return & response:"
2315                                                 " %d, %ld\n", n, response[1]);
2316                         }
2317                         goto end_no_deref;
2318                 }
2319 #ifdef TCP_FD_CACHE
2320                 if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2321                         tcp_fd_cache_add(c, fd);
2322                 }else
2323 #endif /* TCP_FD_CACHE */
2324                         tcp_safe_close(fd);
2325         /* here we can have only commands that _do_ _not_ dec refcnt.
2326          * (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE are all treated above) */
2327                 goto release_c;
2328         } /* if (c==0 or unusable) new connection */
2329         /* existing connection, send on it */
2330         n = tcpconn_send_put(c, buf, len, dst->send_flags);
2331         /* no deref needed (automatically done inside tcpconn_send_put()) */
2332         return n;
2333 #ifdef TCP_CONNECT_WAIT
2334 conn_wait_success:
2335 #ifdef TCP_FD_CACHE
2336         if (cfg_get(tcp, tcp_cfg, fd_cache)) {
2337                 tcp_fd_cache_add(c, fd);
2338         } else
2339 #endif /* TCP_FD_CACHE */
2340                 if (unlikely (tcp_safe_close(fd) < 0))
2341                         LM_ERR("closing temporary send fd for %p: %s: "
2342                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2343                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2344                                         fd, c->flags, strerror(errno), errno);
2345         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2346         return n;
2347 conn_wait_error:
2348         n=-1;
2349 conn_wait_close:
2350         /* connect or send failed or immediate close-after-send was requested on
2351          * newly created connection which was not yet sent to tcp_main (but was
2352          * already hashed) => don't send to main, unhash and destroy directly
2353          * (if refcnt>2 it will be destroyed when the last sender releases the
2354          * connection (tcpconn_chld_put(c))) or when tcp_main receives a
2355          * CONN_ERROR for it */
2356         c->state=S_CONN_BAD;
2357         /* we are here only if we opened a new fd (and not reused a cached or
2358          * a reader one) => if the connect was successful close the fd */
2359         if (fd>=0) {
2360                 if (unlikely(tcp_safe_close(fd) < 0 ))
2361                         LM_ERR("closing temporary send fd for %p: %s: "
2362                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2363                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2364                                         fd, c->flags, strerror(errno), errno);
2365         }
2366         /* here the connection is for sure in the hash (tcp_main will not
2367          * remove it because it's marked as PENDing) and the refcnt is at least 2
2368          */
2369         TCPCONN_LOCK;
2370                 _tcpconn_detach(c);
2371                 c->flags&=~F_CONN_HASHED;
2372                 tcpconn_put(c);
2373         TCPCONN_UNLOCK;
2374         /* dec refcnt -> mark it for destruction */
2375         tcpconn_chld_put(c);
2376         return n;
2377 #endif /* TCP_CONNECT_WAIT */
2378 release_c:
2379         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2380 end_no_deref:
2381 end_no_conn:
2382         return n;
2383 }
2384
2385
2386
2387 /** Sends on an existing tcpconn and automatically decrements its ref counter.
2388  * As opposed to tcp_send(), this function requires an existing
2389  * tcp connection.
2390  * WARNING: the tcp_connection will be de-referenced.
2391  * @param c - existing tcp connection pointer.
2392  * @param buf - data to be sent.
2393  * @param len - data length.
2394  * @return >=0 on success, -1 on error.
2395  */
2396 static int tcpconn_send_put(struct tcp_connection* c, const char* buf,
2397                                                                 unsigned len, snd_flags_t send_flags)
2398 {
2399         struct tcp_connection *tmp;
2400         int fd;
2401         long response[2];
2402         int n;
2403         int do_close_fd;
2404 #ifdef USE_TLS
2405         const char* rest_buf;
2406         const char* t_buf;
2407         unsigned rest_len, t_len;
2408         long resp;
2409         snd_flags_t t_send_flags;
2410 #endif /* USE_TLS */
2411 #ifdef TCP_FD_CACHE
2412         struct fd_cache_entry* fd_cache_e;
2413         int use_fd_cache;
2414         
2415         use_fd_cache=cfg_get(tcp, tcp_cfg, fd_cache);
2416         fd_cache_e=0;
2417 #endif /* TCP_FD_CACHE */
2418         do_close_fd=1; /* close the fd on exit */
2419         response[1] = CONN_NOP;
2420 #ifdef TCP_ASYNC
2421         /* if data is already queued, we don't need the fd */
2422 #ifdef TCP_CONNECT_WAIT
2423                 if (unlikely(cfg_get(tcp, tcp_cfg, async) &&
2424                                                 (_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)) ))
2425 #else /* ! TCP_CONNECT_WAIT */
2426                 if (unlikely(cfg_get(tcp, tcp_cfg, async) && (_wbufq_non_empty(c)) ))
2427 #endif /* TCP_CONNECT_WAIT */
2428                 {
2429                         lock_get(&c->write_lock);
2430 #ifdef TCP_CONNECT_WAIT
2431                                 if (likely(_wbufq_non_empty(c) || (c->flags&F_CONN_PENDING)))
2432 #else /* ! TCP_CONNECT_WAIT */
2433                                 if (likely(_wbufq_non_empty(c)))
2434 #endif /* TCP_CONNECT_WAIT */
2435                                 {
2436                                         do_close_fd=0;
2437 #ifdef USE_TLS
2438                                         if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2439                                                 t_buf = buf;
2440                                                 t_len = len;
2441                                                 do {
2442                                                         t_send_flags = send_flags;
2443                                                         n = tls_encode(c, &t_buf, &t_len,
2444                                                                                         &rest_buf, &rest_len,
2445                                                                                         &t_send_flags);
2446                                                         if (unlikely((n < 0) || (t_len &&
2447                                                                          (_wbufq_add(c, t_buf, t_len) < 0)))) {
2448                                                                 lock_release(&c->write_lock);
2449                                                                 n=-1;
2450                                                                 response[1] = CONN_ERROR;
2451                                                                 c->state=S_CONN_BAD;
2452                                                                 c->timeout=get_ticks_raw(); /* force timeout */
2453                                                                 goto error;
2454                                                         }
2455                                                         t_buf = rest_buf;
2456                                                         t_len = rest_len;
2457                                                 } while(unlikely(rest_len && n > 0));
2458                                         } else
2459 #endif /* USE_TLS */
2460                                                 if (unlikely(len && (_wbufq_add(c, buf, len)<0))){
2461                                                         lock_release(&c->write_lock);
2462                                                         n=-1;
2463                                                         response[1] = CONN_ERROR;
2464                                                         c->state=S_CONN_BAD;
2465                                                         c->timeout=get_ticks_raw(); /* force timeout */
2466                                                         goto error;
2467                                                 }
2468                                         n=len;
2469                                         lock_release(&c->write_lock);
2470                                         goto release_c;
2471                                 }
2472                         lock_release(&c->write_lock);
2473                 }
2474 #endif /* TCP_ASYNC */
2475                 /* check if this is not the same reader process holding
2476                  *  c  and if so send directly on c->fd */
2477                 if (c->reader_pid==my_pid()){
2478                         LM_DBG("send from reader (%d (%d)), reusing fd\n",
2479                                         my_pid(), process_no);
2480                         fd=c->fd;
2481                         do_close_fd=0; /* don't close the fd on exit, it's in use */
2482 #ifdef TCP_FD_CACHE
2483                         use_fd_cache=0; /* don't cache: problems would arise due to the
2484                                                            close() on cache eviction (if the fd is still 
2485                                                            used). If it has to be cached then dup() _must_ 
2486                                                            be used */
2487                 }else if (likely(use_fd_cache && 
2488                                                         ((fd_cache_e=tcp_fd_cache_get(c))!=0))){
2489                         fd=fd_cache_e->fd;
2490                         do_close_fd=0;
2491                         LM_DBG("found fd in cache (%d, %p, %d)\n", fd, c, fd_cache_e->id);
2492 #endif /* TCP_FD_CACHE */
2493                 }else{
2494                         LM_DBG("tcp connection found (%p), acquiring fd\n", c);
2495                         /* get the fd */
2496                         response[0]=(long)c;
2497                         response[1]=CONN_GET_FD;
2498                         n=send_all(unix_tcp_sock, response, sizeof(response));
2499                         if (unlikely(n<=0)){
2500                                 LM_ERR("failed to get fd(write):%s (%d)\n", strerror(errno), errno);
2501                                 n=-1;
2502                                 goto release_c;
2503                         }
2504                         LM_DBG("c=%p, n=%d\n", c, n);
2505                         n=receive_fd(unix_tcp_sock, &tmp, sizeof(tmp), &fd, MSG_WAITALL);
2506                         if (unlikely(n<=0)){
2507                                 LM_ERR("failed to get fd(receive_fd): %s (%d)\n",
2508                                                 strerror(errno), errno);
2509                                 n=-1;
2510                                 do_close_fd=0;
2511                                 goto release_c;
2512                         }
2513                         /* handle fd closed or bad connection/error
2514                                 (it's possible that this happened in the time between
2515                                 we found the intial connection and the time when we get
2516                                 we found the initial connection and the time when we get
2517                          */
2518                         if (unlikely(c!=tmp || fd==-1 || c->state==S_CONN_BAD)){
2519                                 if (unlikely(c!=tmp && tmp!=0))
2520                                         BUG("tcp_send: get_fd: got different connection:"
2521                                                 "  %p (id= %d, refcnt=%d state=%d) != "
2522                                                 "  %p (n=%d)\n",
2523                                                   c,   c->id,   atomic_get(&c->refcnt),   c->state,
2524                                                   tmp, n
2525                                                 );
2526                                 n=-1; /* fail */
2527                                 /* don't cache fd & close it */
2528                                 do_close_fd = (fd==-1)?0:1;
2529 #ifdef TCP_FD_CACHE
2530                                 use_fd_cache = 0;
2531 #endif /* TCP_FD_CACHE */
2532                                 goto end;
2533                         }
2534                         LM_DBG("after receive_fd: c= %p n=%d fd=%d\n",c, n, fd);
2535                 }
2536         
2537 #ifdef USE_TLS
2538                 if (unlikely(c->type==PROTO_TLS || c->type==PROTO_WSS)) {
2539                         /* for TLS the TLS processing and the send must happen
2540                            atomically w/ respect to other sends on the same connection
2541                            (otherwise reordering might occur which would break TLS) =>
2542                            lock.
2543                         */
2544                         response[1] = CONN_NOP;
2545                         t_buf = buf;
2546                         t_len = len;
2547                         lock_get(&c->write_lock);
2548                                 do {
2549                                         t_send_flags = send_flags;
2550                                         n = tls_encode(c, &t_buf, &t_len, &rest_buf, &rest_len,
2551                                                                         &t_send_flags);
2552                                         if (likely(n > 0)) {
2553                                                 n = tcpconn_do_send(fd, c, t_buf, t_len, t_send_flags,
2554                                                                                                 &resp, 1);
2555                                                 if (likely(response[1] != CONN_QUEUED_WRITE ||
2556                                                                         resp == CONN_ERROR))
2557                                                         /* don't overwrite a previous CONN_QUEUED_WRITE
2558                                                            unless error */
2559                                                         response[1] = resp;
2560                                         } else if (unlikely(n < 0)) {
2561                                                 response[1] = CONN_ERROR;
2562                                                 break;
2563                                         }
2564                                         /* else do nothing for n (t_len) == 0, keep
2565                                            the last response */
2566                                         t_buf = rest_buf;
2567                                         t_len = rest_len;
2568                                 } while(unlikely(rest_len && n > 0));
2569                         lock_release(&c->write_lock);
2570                 } else
2571 #endif
2572                         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 0);
2573         if (unlikely(response[1] != CONN_NOP)) {
2574 error:
2575                 response[0]=(long)c;
2576                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2577                         BUG("tcp_main command %ld sending failed (write):%s (%d)\n",
2578                                         response[1], strerror(errno), errno);
2579                         /* all commands != CONN_NOP returned by tcpconn_do_send()
2580                            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2581                            => if sending the command fails we have to dec. refcnt by hand
2582                          */
2583                         tcpconn_chld_put(c); /* deref. it manually */
2584                         n=-1;
2585                 }
2586                 /* here refcnt for c is already decremented => c contents can no
2587                    longer be used and refcnt _must_ _not_ be decremented again
2588                    on exit */
2589                 if (unlikely(n < 0 || response[1] == CONN_EOF)) {
2590                         /* on error or eof, remove from cache or close fd */
2591 #ifdef TCP_FD_CACHE
2592                         if (unlikely(fd_cache_e)){
2593                                 tcp_fd_cache_rm(fd_cache_e);
2594                                 fd_cache_e = 0;
2595                                 tcp_safe_close(fd);
2596                         }else
2597 #endif /* TCP_FD_CACHE */
2598                                 if (do_close_fd) tcp_safe_close(fd);
2599                 } else if (response[1] == CONN_QUEUED_WRITE) {
2600 #ifdef TCP_FD_CACHE
2601                         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2602                                 tcp_fd_cache_add(c, fd);
2603                         }else
2604 #endif /* TCP_FD_CACHE */
2605                                 if (do_close_fd) tcp_safe_close(fd);
2606                 } else {
2607                         BUG("unexpected tcpconn_do_send() return & response: %d, %ld\n",
2608                                         n, response[1]);
2609                 }
2610                 return n; /* no tcpconn_put */
2611         }
2612 end:
2613 #ifdef TCP_FD_CACHE
2614         if (unlikely((fd_cache_e==0) && use_fd_cache)){
2615                 tcp_fd_cache_add(c, fd);
2616         }else
2617 #endif /* TCP_FD_CACHE */
2618         if (do_close_fd) {
2619                 if (unlikely(tcp_safe_close(fd) < 0))
2620                         LM_ERR("closing temporary send fd for %p: %s: "
2621                                         "close(%d) failed (flags 0x%x): %s (%d)\n", c,
2622                                         su2a(&c->rcv.src_su, sizeof(c->rcv.src_su)),
2623                                         fd, c->flags, strerror(errno), errno);
2624         }
2625         /* here we can have only commands that _do_ _not_ dec refcnt.
2626            (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE are all treated above) */
2627 release_c:
2628         tcpconn_chld_put(c); /* release c (dec refcnt & free on 0) */
2629         return n;
2630 }
2631
2632
2633
2634 /* unsafe send on a known tcp connection.
2635  * Directly send on a known tcp connection with a given fd.
2636  * It is assumed that the connection locks are already held.
2637  * Side effects: if needed it will send state update commands to
2638  *  tcp_main (e.g. CON_EOF, CON_ERROR, CON_QUEUED_WRITE).
2639  * @param fd - fd used for sending.
2640  * @param c - existing tcp connection pointer (state and flags might be
2641  *            changed).
2642  * @param buf - data to be sent.
2643  * @param len - data length.
2644  * @param send_flags
2645  * @return <0 on error, number of bytes sent on success.
2646  */
2647 int tcpconn_send_unsafe(int fd, struct tcp_connection *c,
2648                                                 const char* buf, unsigned len, snd_flags_t send_flags)
2649 {
2650         int n;
2651         long response[2];
2652         
2653         n = tcpconn_do_send(fd, c, buf, len, send_flags, &response[1], 1);
2654         if (unlikely(response[1] != CONN_NOP)) {
2655                 /* all commands != CONN_NOP returned by tcpconn_do_send()
2656                    (CONN_EOF, CONN_ERROR, CONN_QUEUED_WRITE) will auto-dec refcnt
2657                    => increment it (we don't want the connection to be destroyed
2658                    from under us)
2659                  */
2660                 atomic_inc(&c->refcnt);
2661                 response[0]=(long)c;
2662                 if (send_all(unix_tcp_sock, response, sizeof(response)) <= 0) {
2663                         BUG("connection %p command %ld sending failed (write):%s (%d)\n",
2664                                         c, response[1], strerror(errno), errno);
2665                         /* send failed => deref. it back by hand */
2666                         tcpconn_chld_put(c); 
2667                         n=-1;
2668                 }
2669                 /* here refcnt for c is already decremented => c contents can no
2670                    longer be used and refcnt _must_ _not_ be decremented again
2671                    on exit */
2672                 return n;
2673         }
2674         return n;
2675 }
2676
2677
2678
2679 /** lower level send (connection and fd should be known).
2680  * It takes care of possible write-queueing, blacklisting a.s.o.
2681  * It expects a valid tcp connection. It doesn't touch the ref. cnts.
2682  * It will also set the connection flags from send_flags (it's better
2683  * to do it here, because it's guaranteed to be under lock).
2684  * @param fd - fd used for sending.
2685  * @param c - existing tcp connection pointer (state and flags might be
2686  *            changed).
2687  * @param buf - data to be sent.
2688  * @param len - data length.
2689  * @param send_flags
2690  * @param resp - filled with a cmd. for tcp_main:
2691  *                      CONN_NOP - nothing needs to be done (do not send
2692  *                                 anything to tcp_main).
2693  *                      CONN_ERROR - error, connection should be closed.
2694  *                      CONN_EOF - no error, but connection should be closed.
2695  *                      CONN_QUEUED_WRITE - new write queue (connection
2696  *                                 should be watched for write and the wr.
2697  *                                 queue flushed).
2698  * @param locked - if set assume the connection is already locked (call from
2699  *                  tls) and do not lock/unlock the connection.
2700  * @return >=0 on success, < 0 on error && *resp == CONN_ERROR.
2701  *
2702  */
2703 static int tcpconn_do_send(int fd, struct tcp_connection* c,
2704                                                         const char* buf, unsigned len,
2705                                                         snd_flags_t send_flags, long* resp,
2706                                                         int locked)
2707 {
2708         int  n;
2709 #ifdef TCP_ASYNC
2710         int enable_write_watch;
2711 #endif /* TCP_ASYNC */
2712
2713         LM_DBG("sending...\n");
2714         *resp = CONN_NOP;
2715         if (likely(!locked)) lock_get(&c->write_lock);
2716         /* update connection send flags with the current ones */
2717         tcpconn_set_send_flags(c, send_flags);
2718 #ifdef TCP_ASYNC
2719         if (likely(cfg_get(tcp, tcp_cfg, async))){
2720                 if (_wbufq_non_empty(c)
2721 #ifdef TCP_CONNECT_WAIT
2722                         || (c->flags&F_CONN_PENDING) 
2723 #endif /* TCP_CONNECT_WAIT */
2724                         ){
2725                         if (unlikely(_wbufq_add(c, buf, len)<0)){
2726                                 if (likely(!locked)) lock_release(&c->write_lock);
2727                                 n=-1;
2728                                 goto error;
2729                         }
2730                         if (likely(!locked)) lock_release(&c->write_lock);
2731                         n=len;
2732                         goto end;
2733                 }
2734                 n=_tcpconn_write_nb(fd, c, buf, len);
2735         }else{
2736 #endif /* TCP_ASYNC */
2737                 /* n=tcp_blocking_write(c, fd, buf, len); */
2738                 n=tsend_stream(fd, buf, len,
2739                                                 TICKS_TO_S(cfg_get(tcp, tcp_cfg, send_timeout)) *
2740                                                 1000);
2741 #ifdef TCP_ASYNC
2742         }
2743 #else /* ! TCP_ASYNC */
2744         if (likely(!locked)) lock_release(&c->write_lock);
2745 #endif /* TCP_ASYNC */
2746         
2747         LM_DBG("after real write: c= %p n=%d fd=%d\n",c, n, fd);
2748         LM_DBG("buf=\n%.*s\n", (int)len, buf);
2749         if (unlikely(n<(int)len)){
2750 #ifdef TCP_ASYNC
2751                 if (cfg_get(tcp, tcp_cfg, async) &&
2752                                 ((n>=0) || errno==EAGAIN || errno==EWOULDBLOCK)){
2753                         enable_write_watch=_wbufq_empty(c);
2754                         if (n<0) n=0;
2755                         else if (unlikely(c->state==S_CONN_CONNECT ||
2756                                                 c->state==S_CONN_ACCEPT)){
2757                                 TCP_STATS_ESTABLISHED(c->state);