TCPSocket.cpp 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Singleton.h>
  7. #include <AK/Time.h>
  8. #include <Kernel/Debug.h>
  9. #include <Kernel/Devices/Generic/RandomDevice.h>
  10. #include <Kernel/FileSystem/OpenFileDescription.h>
  11. #include <Kernel/Locking/MutexProtected.h>
  12. #include <Kernel/Net/EthernetFrameHeader.h>
  13. #include <Kernel/Net/IPv4.h>
  14. #include <Kernel/Net/NetworkAdapter.h>
  15. #include <Kernel/Net/NetworkingManagement.h>
  16. #include <Kernel/Net/Routing.h>
  17. #include <Kernel/Net/TCP.h>
  18. #include <Kernel/Net/TCPSocket.h>
  19. #include <Kernel/Security/Random.h>
  20. #include <Kernel/Tasks/Process.h>
  21. #include <Kernel/Time/TimeManagement.h>
  22. namespace Kernel {
  23. void TCPSocket::for_each(Function<void(TCPSocket const&)> callback)
  24. {
  25. sockets_by_tuple().for_each_shared([&](auto const& it) {
  26. callback(*it.value);
  27. });
  28. }
  29. ErrorOr<void> TCPSocket::try_for_each(Function<ErrorOr<void>(TCPSocket const&)> callback)
  30. {
  31. return sockets_by_tuple().with_shared([&](auto const& sockets) -> ErrorOr<void> {
  32. for (auto& it : sockets)
  33. TRY(callback(*it.value));
  34. return {};
  35. });
  36. }
  37. bool TCPSocket::unref() const
  38. {
  39. bool did_hit_zero = sockets_by_tuple().with_exclusive([&](auto& table) {
  40. if (deref_base())
  41. return false;
  42. table.remove(tuple());
  43. const_cast<TCPSocket&>(*this).revoke_weak_ptrs();
  44. return true;
  45. });
  46. if (did_hit_zero) {
  47. const_cast<TCPSocket&>(*this).will_be_destroyed();
  48. delete this;
  49. }
  50. return did_hit_zero;
  51. }
  52. void TCPSocket::set_state(State new_state)
  53. {
  54. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) state moving from {} to {}", this, to_string(m_state), to_string(new_state));
  55. auto was_disconnected = protocol_is_disconnected();
  56. auto previous_role = m_role;
  57. m_state = new_state;
  58. if (new_state == State::Established && m_direction == Direction::Outgoing) {
  59. set_role(Role::Connected);
  60. clear_so_error();
  61. }
  62. if (new_state == State::TimeWait) {
  63. // Once we hit TimeWait, we are only holding the socket in case there
  64. // are packets on the way which we wouldn't want a new socket to get hit
  65. // with, so there's no point in keeping the receive buffer around.
  66. drop_receive_buffer();
  67. auto deadline = TimeManagement::the().current_time(CLOCK_MONOTONIC_COARSE) + maximum_segment_lifetime;
  68. auto timer_was_added = TimerQueue::the().add_timer_without_id(*m_timer, CLOCK_MONOTONIC_COARSE, deadline, [&]() {
  69. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) TimeWait timer elpased", this);
  70. if (m_state == State::TimeWait) {
  71. m_state = State::Closed;
  72. do_state_closed();
  73. }
  74. });
  75. if (!timer_was_added) [[unlikely]] {
  76. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) TimeWait timer deadline is in the past", this);
  77. m_state = State::Closed;
  78. new_state = State::Closed;
  79. }
  80. }
  81. if (new_state == State::Closed)
  82. do_state_closed();
  83. if (previous_role != m_role || was_disconnected != protocol_is_disconnected())
  84. evaluate_block_conditions();
  85. }
  86. void TCPSocket::do_state_closed()
  87. {
  88. if (m_originator)
  89. release_to_originator();
  90. closing_sockets().with_exclusive([&](auto& table) {
  91. table.remove(tuple());
  92. });
  93. }
  94. static Singleton<MutexProtected<HashMap<IPv4SocketTuple, RefPtr<TCPSocket>>>> s_socket_closing;
  95. MutexProtected<HashMap<IPv4SocketTuple, RefPtr<TCPSocket>>>& TCPSocket::closing_sockets()
  96. {
  97. return *s_socket_closing;
  98. }
  99. static Singleton<MutexProtected<HashMap<IPv4SocketTuple, TCPSocket*>>> s_socket_tuples;
  100. MutexProtected<HashMap<IPv4SocketTuple, TCPSocket*>>& TCPSocket::sockets_by_tuple()
  101. {
  102. return *s_socket_tuples;
  103. }
  104. RefPtr<TCPSocket> TCPSocket::from_tuple(IPv4SocketTuple const& tuple)
  105. {
  106. return sockets_by_tuple().with_shared([&](auto const& table) -> RefPtr<TCPSocket> {
  107. auto exact_match = table.get(tuple);
  108. if (exact_match.has_value())
  109. return { *exact_match.value() };
  110. auto address_tuple = IPv4SocketTuple(tuple.local_address(), tuple.local_port(), IPv4Address(), 0);
  111. auto address_match = table.get(address_tuple);
  112. if (address_match.has_value())
  113. return { *address_match.value() };
  114. auto wildcard_tuple = IPv4SocketTuple(IPv4Address(), tuple.local_port(), IPv4Address(), 0);
  115. auto wildcard_match = table.get(wildcard_tuple);
  116. if (wildcard_match.has_value())
  117. return { *wildcard_match.value() };
  118. return {};
  119. });
  120. }
  121. ErrorOr<NonnullRefPtr<TCPSocket>> TCPSocket::try_create_client(IPv4Address const& new_local_address, u16 new_local_port, IPv4Address const& new_peer_address, u16 new_peer_port)
  122. {
  123. auto tuple = IPv4SocketTuple(new_local_address, new_local_port, new_peer_address, new_peer_port);
  124. return sockets_by_tuple().with_exclusive([&](auto& table) -> ErrorOr<NonnullRefPtr<TCPSocket>> {
  125. if (table.contains(tuple))
  126. return EEXIST;
  127. auto receive_buffer = TRY(try_create_receive_buffer());
  128. auto client = TRY(TCPSocket::try_create(protocol(), move(receive_buffer)));
  129. client->set_setup_state(SetupState::InProgress);
  130. client->set_local_address(new_local_address);
  131. client->set_local_port(new_local_port);
  132. client->set_peer_address(new_peer_address);
  133. client->set_peer_port(new_peer_port);
  134. client->set_bound(true);
  135. client->set_direction(Direction::Incoming);
  136. client->set_originator(*this);
  137. m_pending_release_for_accept.set(tuple, client);
  138. client->m_registered_socket_tuple = tuple;
  139. table.set(tuple, client);
  140. return { move(client) };
  141. });
  142. }
  143. void TCPSocket::release_to_originator()
  144. {
  145. VERIFY(!!m_originator);
  146. m_originator.strong_ref()->release_for_accept(*this);
  147. m_originator.clear();
  148. }
  149. void TCPSocket::release_for_accept(NonnullRefPtr<TCPSocket> socket)
  150. {
  151. VERIFY(m_pending_release_for_accept.contains(socket->tuple()));
  152. m_pending_release_for_accept.remove(socket->tuple());
  153. // FIXME: Should we observe this error somehow?
  154. [[maybe_unused]] auto rc = queue_connection_from(move(socket));
  155. }
  156. TCPSocket::TCPSocket(int protocol, NonnullOwnPtr<DoubleBuffer> receive_buffer, NonnullOwnPtr<KBuffer> scratch_buffer, NonnullRefPtr<Timer> timer)
  157. : IPv4Socket(SOCK_STREAM, protocol, move(receive_buffer), move(scratch_buffer))
  158. , m_last_ack_sent_time(TimeManagement::the().monotonic_time())
  159. , m_last_retransmit_time(TimeManagement::the().monotonic_time())
  160. , m_timer(timer)
  161. {
  162. }
  163. TCPSocket::~TCPSocket()
  164. {
  165. dequeue_for_retransmit();
  166. dbgln_if(TCP_SOCKET_DEBUG, "~TCPSocket in state {}", to_string(state()));
  167. }
  168. ErrorOr<NonnullRefPtr<TCPSocket>> TCPSocket::try_create(int protocol, NonnullOwnPtr<DoubleBuffer> receive_buffer)
  169. {
  170. // Note: Scratch buffer is only used for SOCK_STREAM sockets.
  171. auto scratch_buffer = TRY(KBuffer::try_create_with_size("TCPSocket: Scratch buffer"sv, 65536));
  172. auto timer = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) Timer));
  173. return adopt_nonnull_ref_or_enomem(new (nothrow) TCPSocket(protocol, move(receive_buffer), move(scratch_buffer), timer));
  174. }
  175. ErrorOr<size_t> TCPSocket::protocol_size(ReadonlyBytes raw_ipv4_packet)
  176. {
  177. auto& ipv4_packet = *reinterpret_cast<IPv4Packet const*>(raw_ipv4_packet.data());
  178. auto& tcp_packet = *static_cast<TCPPacket const*>(ipv4_packet.payload());
  179. return raw_ipv4_packet.size() - sizeof(IPv4Packet) - tcp_packet.header_size();
  180. }
  181. ErrorOr<size_t> TCPSocket::protocol_receive(ReadonlyBytes raw_ipv4_packet, UserOrKernelBuffer& buffer, size_t buffer_size, [[maybe_unused]] int flags)
  182. {
  183. auto& ipv4_packet = *reinterpret_cast<IPv4Packet const*>(raw_ipv4_packet.data());
  184. auto& tcp_packet = *static_cast<TCPPacket const*>(ipv4_packet.payload());
  185. size_t payload_size = raw_ipv4_packet.size() - sizeof(IPv4Packet) - tcp_packet.header_size();
  186. dbgln_if(TCP_SOCKET_DEBUG, "payload_size {}, will it fit in {}?", payload_size, buffer_size);
  187. VERIFY(buffer_size >= payload_size);
  188. SOCKET_TRY(buffer.write(tcp_packet.payload(), payload_size));
  189. return payload_size;
  190. }
  191. ErrorOr<size_t> TCPSocket::protocol_send(UserOrKernelBuffer const& data, size_t data_length)
  192. {
  193. auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
  194. RoutingDecision routing_decision = route_to(peer_address(), local_address(), adapter);
  195. if (routing_decision.is_zero())
  196. return set_so_error(EHOSTUNREACH);
  197. size_t mss = routing_decision.adapter->mtu() - sizeof(IPv4Packet) - sizeof(TCPPacket);
  198. if (!m_no_delay) {
  199. // RFC 896 (Nagle’s algorithm): https://www.ietf.org/rfc/rfc0896
  200. // "The solution is to inhibit the sending of new TCP segments when
  201. // new outgoing data arrives from the user if any previously
  202. // transmitted data on the connection remains unacknowledged. This
  203. // inhibition is to be unconditional; no timers, tests for size of
  204. // data received, or other conditions are required."
  205. auto has_unacked_data = m_unacked_packets.with_shared([&](auto const& packets) { return packets.size > 0; });
  206. if (has_unacked_data && data_length < mss)
  207. return set_so_error(EAGAIN);
  208. }
  209. data_length = min(data_length, mss);
  210. TRY(send_tcp_packet(TCPFlags::PSH | TCPFlags::ACK, &data, data_length, &routing_decision));
  211. return data_length;
  212. }
  213. ErrorOr<void> TCPSocket::send_ack(bool allow_duplicate)
  214. {
  215. if (!allow_duplicate && m_last_ack_number_sent == m_ack_number)
  216. return {};
  217. return send_tcp_packet(TCPFlags::ACK);
  218. }
  219. ErrorOr<void> TCPSocket::send_tcp_packet(u16 flags, UserOrKernelBuffer const* payload, size_t payload_size, RoutingDecision* user_routing_decision)
  220. {
  221. auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
  222. RoutingDecision routing_decision = user_routing_decision ? *user_routing_decision : route_to(peer_address(), local_address(), adapter);
  223. if (routing_decision.is_zero())
  224. return set_so_error(EHOSTUNREACH);
  225. auto ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
  226. bool const has_mss_option = flags & TCPFlags::SYN;
  227. bool const has_window_scale_option = flags & TCPFlags::SYN;
  228. size_t const options_size = (has_mss_option ? sizeof(TCPOptionMSS) : 0) + (has_window_scale_option ? sizeof(TCPOptionWindowScale) : 0);
  229. size_t const tcp_header_size = sizeof(TCPPacket) + align_up_to(options_size, 4);
  230. size_t const buffer_size = ipv4_payload_offset + tcp_header_size + payload_size;
  231. auto packet = routing_decision.adapter->acquire_packet_buffer(buffer_size);
  232. if (!packet)
  233. return set_so_error(ENOMEM);
  234. routing_decision.adapter->fill_in_ipv4_header(*packet, local_address(),
  235. routing_decision.next_hop, peer_address(), IPv4Protocol::TCP,
  236. buffer_size - ipv4_payload_offset, type_of_service(), ttl());
  237. memset(packet->buffer->data() + ipv4_payload_offset, 0, sizeof(TCPPacket));
  238. auto& tcp_packet = *(TCPPacket*)(packet->buffer->data() + ipv4_payload_offset);
  239. VERIFY(local_port());
  240. tcp_packet.set_source_port(local_port());
  241. tcp_packet.set_destination_port(peer_port());
  242. auto window_size = available_space_in_receive_buffer();
  243. if ((flags & TCPFlags::SYN) == 0 && m_window_scaling_supported)
  244. window_size >>= receive_window_scale();
  245. tcp_packet.set_window_size(min(window_size, NumericLimits<u16>::max()));
  246. tcp_packet.set_sequence_number(m_sequence_number);
  247. tcp_packet.set_data_offset(tcp_header_size / sizeof(u32));
  248. tcp_packet.set_flags(flags);
  249. if (payload) {
  250. if (auto result = payload->read(tcp_packet.payload(), payload_size); result.is_error()) {
  251. routing_decision.adapter->release_packet_buffer(*packet);
  252. return set_so_error(result.release_error());
  253. }
  254. }
  255. if (flags & TCPFlags::ACK) {
  256. m_last_ack_number_sent = m_ack_number;
  257. m_last_ack_sent_time = TimeManagement::the().monotonic_time();
  258. tcp_packet.set_ack_number(m_ack_number);
  259. }
  260. if (flags & TCPFlags::SYN) {
  261. ++m_sequence_number;
  262. } else {
  263. m_sequence_number += payload_size;
  264. }
  265. u8* next_option = packet->buffer->data() + ipv4_payload_offset + sizeof(TCPPacket);
  266. if (has_mss_option) {
  267. u16 mss = routing_decision.adapter->mtu() - sizeof(IPv4Packet) - sizeof(TCPPacket);
  268. TCPOptionMSS mss_option { mss };
  269. memcpy(next_option, &mss_option, sizeof(mss_option));
  270. next_option += sizeof(mss_option);
  271. }
  272. if (has_window_scale_option) {
  273. TCPOptionWindowScale window_scale_option { receive_window_scale() };
  274. memcpy(next_option, &window_scale_option, sizeof(window_scale_option));
  275. next_option += sizeof(window_scale_option);
  276. }
  277. if ((options_size % 4) != 0)
  278. *next_option = to_underlying(TCPOptionKind::End);
  279. tcp_packet.set_checksum(compute_tcp_checksum(local_address(), peer_address(), tcp_packet, payload_size));
  280. bool expect_ack { tcp_packet.has_syn() || payload_size > 0 };
  281. if (expect_ack) {
  282. bool append_failed { false };
  283. m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
  284. auto result = unacked_packets.packets.try_append({ m_sequence_number, packet, ipv4_payload_offset, *routing_decision.adapter });
  285. if (result.is_error()) {
  286. dbgln("TCPSocket: Dropped outbound packet because try_append() failed");
  287. append_failed = true;
  288. return;
  289. }
  290. unacked_packets.size += payload_size;
  291. enqueue_for_retransmit();
  292. });
  293. if (append_failed)
  294. return set_so_error(ENOMEM);
  295. }
  296. m_packets_out++;
  297. m_bytes_out += buffer_size;
  298. routing_decision.adapter->send_packet(packet->bytes());
  299. if (!expect_ack)
  300. routing_decision.adapter->release_packet_buffer(*packet);
  301. return {};
  302. }
  303. void TCPSocket::receive_tcp_packet(TCPPacket const& packet, u16 size)
  304. {
  305. if (packet.has_ack()) {
  306. u32 ack_number = packet.ack_number();
  307. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: receive_tcp_packet: {}", ack_number);
  308. int removed = 0;
  309. m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
  310. while (!unacked_packets.packets.is_empty()) {
  311. auto& packet = unacked_packets.packets.first();
  312. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: iterate: {}", packet.ack_number);
  313. if (packet.ack_number <= ack_number) {
  314. auto old_adapter = packet.adapter.strong_ref();
  315. if (old_adapter)
  316. old_adapter->release_packet_buffer(*packet.buffer);
  317. TCPPacket& tcp_packet = *(TCPPacket*)(packet.buffer->buffer->data() + packet.ipv4_payload_offset);
  318. if (m_send_window_size != tcp_packet.window_size()) {
  319. m_send_window_size = tcp_packet.window_size() << m_send_window_scale;
  320. }
  321. auto payload_size = packet.buffer->buffer->data() + packet.buffer->buffer->size() - (u8*)tcp_packet.payload();
  322. unacked_packets.size -= payload_size;
  323. evaluate_block_conditions();
  324. unacked_packets.packets.take_first();
  325. removed++;
  326. } else {
  327. break;
  328. }
  329. }
  330. if (unacked_packets.packets.is_empty()) {
  331. m_retransmit_attempts = 0;
  332. dequeue_for_retransmit();
  333. }
  334. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket: receive_tcp_packet acknowledged {} packets", removed);
  335. });
  336. }
  337. m_packets_in++;
  338. m_bytes_in += packet.header_size() + size;
  339. }
  340. bool TCPSocket::should_delay_next_ack() const
  341. {
  342. // FIXME: We don't know the MSS here so make a reasonable guess.
  343. size_t const mss = 1500;
  344. // RFC 1122 says we should send an ACK for every two full-sized segments.
  345. if (m_ack_number >= m_last_ack_number_sent + 2 * mss)
  346. return false;
  347. // RFC 1122 says we should not delay ACKs for more than 500 milliseconds.
  348. if (TimeManagement::the().monotonic_time(TimePrecision::Precise) >= m_last_ack_sent_time + Duration::from_milliseconds(500))
  349. return false;
  350. return true;
  351. }
  352. NetworkOrdered<u16> TCPSocket::compute_tcp_checksum(IPv4Address const& source, IPv4Address const& destination, TCPPacket const& packet, u16 payload_size)
  353. {
  354. union PseudoHeader {
  355. struct [[gnu::packed]] {
  356. IPv4Address source;
  357. IPv4Address destination;
  358. u8 zero;
  359. u8 protocol;
  360. NetworkOrdered<u16> payload_size;
  361. } header;
  362. u16 raw[6];
  363. };
  364. static_assert(sizeof(PseudoHeader) == 12);
  365. Checked<u16> packet_size = packet.header_size();
  366. packet_size += payload_size;
  367. VERIFY(!packet_size.has_overflow());
  368. PseudoHeader pseudo_header { .header = { source, destination, 0, (u8)IPv4Protocol::TCP, packet_size.value() } };
  369. u32 checksum = 0;
  370. auto* raw_pseudo_header = pseudo_header.raw;
  371. for (size_t i = 0; i < sizeof(pseudo_header) / sizeof(u16); ++i) {
  372. checksum += AK::convert_between_host_and_network_endian(raw_pseudo_header[i]);
  373. if (checksum > 0xffff)
  374. checksum = (checksum >> 16) + (checksum & 0xffff);
  375. }
  376. auto* raw_packet = bit_cast<u16*>(&packet);
  377. for (size_t i = 0; i < packet.header_size() / sizeof(u16); ++i) {
  378. checksum += AK::convert_between_host_and_network_endian(raw_packet[i]);
  379. if (checksum > 0xffff)
  380. checksum = (checksum >> 16) + (checksum & 0xffff);
  381. }
  382. VERIFY(packet.data_offset() * 4 == packet.header_size());
  383. auto* raw_payload = bit_cast<u16*>(packet.payload());
  384. for (size_t i = 0; i < payload_size / sizeof(u16); ++i) {
  385. checksum += AK::convert_between_host_and_network_endian(raw_payload[i]);
  386. if (checksum > 0xffff)
  387. checksum = (checksum >> 16) + (checksum & 0xffff);
  388. }
  389. if (payload_size & 1) {
  390. u16 expanded_byte = ((u8 const*)packet.payload())[payload_size - 1] << 8;
  391. checksum += expanded_byte;
  392. if (checksum > 0xffff)
  393. checksum = (checksum >> 16) + (checksum & 0xffff);
  394. }
  395. return ~(checksum & 0xffff);
  396. }
  397. ErrorOr<void> TCPSocket::setsockopt(int level, int option, Userspace<void const*> user_value, socklen_t user_value_size)
  398. {
  399. if (level != IPPROTO_TCP)
  400. return IPv4Socket::setsockopt(level, option, user_value, user_value_size);
  401. MutexLocker locker(mutex());
  402. switch (option) {
  403. case TCP_NODELAY:
  404. if (user_value_size < sizeof(int))
  405. return EINVAL;
  406. int value;
  407. TRY(copy_from_user(&value, static_ptr_cast<int const*>(user_value)));
  408. if (value != 0 && value != 1)
  409. return EINVAL;
  410. m_no_delay = value;
  411. return {};
  412. default:
  413. dbgln("setsockopt({}) at IPPROTO_TCP not implemented.", option);
  414. return ENOPROTOOPT;
  415. }
  416. }
  417. ErrorOr<void> TCPSocket::getsockopt(OpenFileDescription& description, int level, int option, Userspace<void*> value, Userspace<socklen_t*> value_size)
  418. {
  419. if (level != IPPROTO_TCP)
  420. return IPv4Socket::getsockopt(description, level, option, value, value_size);
  421. MutexLocker locker(mutex());
  422. socklen_t size;
  423. TRY(copy_from_user(&size, value_size.unsafe_userspace_ptr()));
  424. switch (option) {
  425. case TCP_NODELAY: {
  426. int nodelay = m_no_delay ? 1 : 0;
  427. if (size < sizeof(nodelay))
  428. return EINVAL;
  429. TRY(copy_to_user(static_ptr_cast<int*>(value), &nodelay));
  430. size = sizeof(nodelay);
  431. return copy_to_user(value_size, &size);
  432. }
  433. default:
  434. dbgln("getsockopt({}) at IPPROTO_TCP not implemented.", option);
  435. return ENOPROTOOPT;
  436. }
  437. }
  438. ErrorOr<void> TCPSocket::protocol_bind()
  439. {
  440. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket::protocol_bind(), local_port() is {}", local_port());
  441. // Check that we do have the address we're trying to bind to.
  442. TRY(m_adapter.with([this](auto& adapter) -> ErrorOr<void> {
  443. if (has_specific_local_address() && !adapter) {
  444. adapter = NetworkingManagement::the().from_ipv4_address(local_address());
  445. if (!adapter)
  446. return set_so_error(EADDRNOTAVAIL);
  447. }
  448. return {};
  449. }));
  450. if (local_port() == 0) {
  451. // Allocate an unused ephemeral port.
  452. constexpr u16 first_ephemeral_port = 32768;
  453. constexpr u16 last_ephemeral_port = 60999;
  454. constexpr u16 ephemeral_port_range_size = last_ephemeral_port - first_ephemeral_port;
  455. u16 first_scan_port = first_ephemeral_port + get_good_random<u16>() % ephemeral_port_range_size;
  456. return sockets_by_tuple().with_exclusive([&](auto& table) -> ErrorOr<void> {
  457. u16 port = first_scan_port;
  458. while (true) {
  459. IPv4SocketTuple proposed_tuple(local_address(), port, peer_address(), peer_port());
  460. auto it = table.find(proposed_tuple);
  461. if (it == table.end()) {
  462. set_local_port(port);
  463. m_registered_socket_tuple = proposed_tuple;
  464. table.set(proposed_tuple, this);
  465. dbgln_if(TCP_SOCKET_DEBUG, "...allocated port {}, tuple {}", port, proposed_tuple.to_string());
  466. return {};
  467. }
  468. ++port;
  469. if (port > last_ephemeral_port)
  470. port = first_ephemeral_port;
  471. if (port == first_scan_port)
  472. break;
  473. }
  474. return set_so_error(EADDRINUSE);
  475. });
  476. } else {
  477. // Verify that the user-supplied port is not already used by someone else.
  478. bool ok = sockets_by_tuple().with_exclusive([&](auto& table) -> bool {
  479. if (table.contains(tuple()))
  480. return false;
  481. auto socket_tuple = tuple();
  482. m_registered_socket_tuple = socket_tuple;
  483. table.set(socket_tuple, this);
  484. return true;
  485. });
  486. if (!ok)
  487. return set_so_error(EADDRINUSE);
  488. return {};
  489. }
  490. }
  491. ErrorOr<void> TCPSocket::protocol_listen()
  492. {
  493. set_direction(Direction::Passive);
  494. set_state(State::Listen);
  495. set_setup_state(SetupState::Completed);
  496. return {};
  497. }
  498. ErrorOr<void> TCPSocket::protocol_connect(OpenFileDescription& description)
  499. {
  500. MutexLocker locker(mutex());
  501. auto routing_decision = route_to(peer_address(), local_address());
  502. if (routing_decision.is_zero())
  503. return set_so_error(EHOSTUNREACH);
  504. if (!has_specific_local_address())
  505. set_local_address(routing_decision.adapter->ipv4_address());
  506. TRY(ensure_bound());
  507. if (m_registered_socket_tuple.has_value() && m_registered_socket_tuple != tuple()) {
  508. // If the socket was manually bound (using bind(2)) instead of implicitly using connect,
  509. // it will already be registered in the TCPSocket sockets_by_tuple table, under the previous
  510. // socket tuple. We replace the entry in the table to ensure it is also properly removed on
  511. // socket deletion, to prevent a dangling reference.
  512. TRY(sockets_by_tuple().with_exclusive([this](auto& table) -> ErrorOr<void> {
  513. auto removed = table.remove(*m_registered_socket_tuple);
  514. VERIFY(removed);
  515. if (table.contains(tuple()))
  516. return set_so_error(EADDRINUSE);
  517. table.set(tuple(), this);
  518. return {};
  519. }));
  520. m_registered_socket_tuple = tuple();
  521. }
  522. m_sequence_number = get_good_random<u32>();
  523. m_ack_number = 0;
  524. set_setup_state(SetupState::InProgress);
  525. TRY(send_tcp_packet(TCPFlags::SYN));
  526. m_state = State::SynSent;
  527. set_role(Role::Connecting);
  528. m_direction = Direction::Outgoing;
  529. evaluate_block_conditions();
  530. if (description.is_blocking()) {
  531. locker.unlock();
  532. auto unblock_flags = Thread::FileBlocker::BlockFlags::None;
  533. if (Thread::current()->block<Thread::ConnectBlocker>({}, description, unblock_flags).was_interrupted())
  534. return set_so_error(EINTR);
  535. locker.lock();
  536. VERIFY(setup_state() == SetupState::Completed);
  537. if (has_error()) { // TODO: check unblock_flags
  538. set_role(Role::None);
  539. if (error() == TCPSocket::Error::RetransmitTimeout)
  540. return set_so_error(ETIMEDOUT);
  541. else
  542. return set_so_error(ECONNREFUSED);
  543. }
  544. return {};
  545. }
  546. return set_so_error(EINPROGRESS);
  547. }
  548. bool TCPSocket::protocol_is_disconnected() const
  549. {
  550. switch (m_state) {
  551. case State::Closed:
  552. case State::CloseWait:
  553. case State::LastAck:
  554. case State::FinWait1:
  555. case State::FinWait2:
  556. case State::Closing:
  557. case State::TimeWait:
  558. return true;
  559. default:
  560. return false;
  561. }
  562. }
  563. void TCPSocket::shut_down_for_writing()
  564. {
  565. if (state() == State::Established) {
  566. dbgln_if(TCP_SOCKET_DEBUG, " Sending FIN from Established and moving into FinWait1");
  567. (void)send_tcp_packet(TCPFlags::FIN | TCPFlags::ACK);
  568. set_state(State::FinWait1);
  569. } else {
  570. dbgln(" Shutting down TCPSocket for writing but not moving to FinWait1 since state is {}", to_string(state()));
  571. }
  572. }
  573. ErrorOr<void> TCPSocket::close()
  574. {
  575. MutexLocker locker(mutex());
  576. auto result = IPv4Socket::close();
  577. if (state() == State::CloseWait) {
  578. dbgln_if(TCP_SOCKET_DEBUG, " Sending FIN from CloseWait and moving into LastAck");
  579. [[maybe_unused]] auto rc = send_tcp_packet(TCPFlags::FIN | TCPFlags::ACK);
  580. set_state(State::LastAck);
  581. }
  582. if (state() != State::Closed && state() != State::Listen)
  583. closing_sockets().with_exclusive([&](auto& table) {
  584. table.set(tuple(), *this);
  585. });
  586. return result;
  587. }
  588. static Singleton<MutexProtected<TCPSocket::RetransmitList>> s_sockets_for_retransmit;
  589. MutexProtected<TCPSocket::RetransmitList>& TCPSocket::sockets_for_retransmit()
  590. {
  591. return *s_sockets_for_retransmit;
  592. }
  593. void TCPSocket::enqueue_for_retransmit()
  594. {
  595. sockets_for_retransmit().with_exclusive([&](auto& list) {
  596. list.append(*this);
  597. });
  598. }
  599. void TCPSocket::dequeue_for_retransmit()
  600. {
  601. sockets_for_retransmit().with_exclusive([&](auto& list) {
  602. list.remove(*this);
  603. });
  604. }
  605. void TCPSocket::retransmit_packets()
  606. {
  607. auto now = TimeManagement::the().monotonic_time();
  608. // RFC6298 says we should have at least one second between retransmits. According to
  609. // RFC1122 we must do exponential backoff - even for SYN packets.
  610. i64 retransmit_interval = 1;
  611. for (decltype(m_retransmit_attempts) i = 0; i < m_retransmit_attempts; i++)
  612. retransmit_interval *= 2;
  613. if (m_last_retransmit_time > now - Duration::from_seconds(retransmit_interval))
  614. return;
  615. dbgln_if(TCP_SOCKET_DEBUG, "TCPSocket({}) handling retransmit", this);
  616. m_last_retransmit_time = now;
  617. ++m_retransmit_attempts;
  618. if (m_retransmit_attempts > maximum_retransmits) {
  619. set_state(TCPSocket::State::Closed);
  620. set_error(TCPSocket::Error::RetransmitTimeout);
  621. set_setup_state(Socket::SetupState::Completed);
  622. return;
  623. }
  624. auto adapter = bound_interface().with([](auto& bound_device) -> RefPtr<NetworkAdapter> { return bound_device; });
  625. auto routing_decision = route_to(peer_address(), local_address(), adapter);
  626. if (routing_decision.is_zero())
  627. return;
  628. m_unacked_packets.with_exclusive([&](auto& unacked_packets) {
  629. for (auto& packet : unacked_packets.packets) {
  630. packet.tx_counter++;
  631. if constexpr (TCP_SOCKET_DEBUG) {
  632. auto& tcp_packet = *(TCPPacket const*)(packet.buffer->buffer->data() + packet.ipv4_payload_offset);
  633. dbgln("Sending TCP packet from {}:{} to {}:{} with ({}{}{}{}) seq_no={}, ack_no={}, tx_counter={}",
  634. local_address(), local_port(),
  635. peer_address(), peer_port(),
  636. (tcp_packet.has_syn() ? "SYN " : ""),
  637. (tcp_packet.has_ack() ? "ACK " : ""),
  638. (tcp_packet.has_fin() ? "FIN " : ""),
  639. (tcp_packet.has_rst() ? "RST " : ""),
  640. tcp_packet.sequence_number(),
  641. tcp_packet.ack_number(),
  642. packet.tx_counter);
  643. }
  644. size_t ipv4_payload_offset = routing_decision.adapter->ipv4_payload_offset();
  645. if (ipv4_payload_offset != packet.ipv4_payload_offset) {
  646. // FIXME: Add support for this. This can happen if after a route change
  647. // we ended up on another adapter which doesn't have the same layer 2 type
  648. // like the previous adapter.
  649. VERIFY_NOT_REACHED();
  650. }
  651. auto packet_buffer = packet.buffer->bytes();
  652. routing_decision.adapter->fill_in_ipv4_header(*packet.buffer,
  653. local_address(), routing_decision.next_hop, peer_address(),
  654. IPv4Protocol::TCP, packet_buffer.size() - ipv4_payload_offset, type_of_service(), ttl());
  655. routing_decision.adapter->send_packet(packet_buffer);
  656. m_packets_out++;
  657. m_bytes_out += packet_buffer.size();
  658. }
  659. });
  660. }
  661. bool TCPSocket::can_write(OpenFileDescription const& file_description, u64 size) const
  662. {
  663. if (!IPv4Socket::can_write(file_description, size))
  664. return false;
  665. if (m_state == State::SynSent || m_state == State::SynReceived)
  666. return false;
  667. if (!file_description.is_blocking())
  668. return true;
  669. return m_unacked_packets.with_shared([&](auto& unacked_packets) {
  670. return unacked_packets.size + size <= m_send_window_size;
  671. });
  672. }
  673. }