Browse Source

[master] Merge branch 'trac2893'

Marcin Siodelski 10 years ago
parent
commit
9fba39d93b

+ 9 - 2
doc/guide/dhcp4-srv.xml

@@ -1756,8 +1756,15 @@ temporarily override a list of interface names and listen on all interfaces.
             </simpara>
           </listitem>
           <listitem>
-            <simpara>Raw sockets operation is working on Linux
-            only. See <xref linkend="iface-detect"/> for details.</simpara>
+            <simpara>On Linux and BSD system families the DHCP messages are sent
+            and received over raw sockets (using LPF and BPF, respectively) and all
+            packet headers (including data link layer, IP and UDP headers) are created
+            and parsed by Kea, rather than by the system kernel. Currently, Kea can
+            only parse data link layer headers with a format adhering to the
+            IEEE 802.3 standard and assumes this data link layer header format
+            for all interfaces. Hence, Kea will fail to work on interfaces
+            which use different data link layer header formats (e.g. InfiniBand).
+            </simpara>
           </listitem>
           <listitem>
             <simpara>The DHCPv4 server does not  verify that

+ 3 - 3
doc/guide/libdhcp.xml

@@ -41,9 +41,9 @@
 
       <para>DHCPv4 requires special raw socket processing to send and receive
       packets from hosts that do not have IPv4 address assigned yet. Support
-      for this operation is implemented on Linux only, so it is likely that
-      DHCPv4 component will not work in certain cases on systems other than
-      Linux.</para>
+      for this operation is implemented on Linux, FreeBSD, NetBSD and OpenBSD.
+      It is likely that the DHCPv4 component will not work in certain cases on
+      other systems.</para>
     </section>
 
 <!--

+ 1 - 1
src/bin/dhcp4/tests/dhcp4_test_utils.h

@@ -64,7 +64,7 @@ public:
     }
 
     /// Does nothing.
-    virtual SocketInfo openSocket(const Iface&,
+    virtual SocketInfo openSocket(Iface&,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port, const bool, const bool) {
         return (SocketInfo(addr, port, 0));

+ 6 - 0
src/lib/dhcp/Makefile.am

@@ -50,10 +50,16 @@ libkea_dhcp___la_SOURCES += pkt_filter6.h pkt_filter6.cc
 libkea_dhcp___la_SOURCES += pkt_filter_inet.cc pkt_filter_inet.h
 libkea_dhcp___la_SOURCES += pkt_filter_inet6.cc pkt_filter_inet6.h
 
+# Utilize Linux Packet Filtering on Linux.
 if OS_LINUX
 libkea_dhcp___la_SOURCES += pkt_filter_lpf.cc pkt_filter_lpf.h
 endif
 
+# Utilize Berkeley Packet Filtering on BSD.
+if OS_BSD
+libkea_dhcp___la_SOURCES += pkt_filter_bpf.cc pkt_filter_bpf.h
+endif
+
 libkea_dhcp___la_SOURCES += std_option_defs.h
 libkea_dhcp___la_SOURCES += docsis3_option_defs.h
 

+ 43 - 26
src/lib/dhcp/iface_mgr.cc

@@ -55,11 +55,18 @@ Iface::Iface(const std::string& name, int ifindex)
     :name_(name), ifindex_(ifindex), mac_len_(0), hardware_type_(0),
      flag_loopback_(false), flag_up_(false), flag_running_(false),
      flag_multicast_(false), flag_broadcast_(false), flags_(0),
-     inactive4_(false), inactive6_(false)
+     inactive4_(false), inactive6_(false), read_buffer_(NULL),
+     read_buffer_size_(0)
 {
     memset(mac_, 0, sizeof(mac_));
 }
 
+Iface::~Iface() {
+    // Passing a null pointer to free() is a no-op, so the read buffer can
+    // be released unconditionally.
+    // NOTE(review): no user-defined copy constructor or assignment operator
+    // for Iface is visible here; if an Iface holding a buffer is ever copied,
+    // both objects would free the same pointer - confirm how Iface objects
+    // are stored and copied.
+    free(read_buffer_);
+}
+
 void
 Iface::closeSockets() {
     // Close IPv4 sockets.
@@ -167,6 +174,21 @@ bool Iface::delSocket(const uint16_t sockfd) {
     return (false); // socket not found
 }
 
+/// Reallocates the socket read buffer so that it holds @c new_size bytes.
+///
+/// If the buffer can't be reallocated, the previously held buffer is
+/// released and the buffer size is reset to 0, so callers can detect the
+/// failure using @c getReadBufferSize.
+void
+Iface::resizeReadBuffer(const size_t new_size) {
+    // Do nothing if the new size is equal to the current size.
+    if (new_size == read_buffer_size_) {
+        return;
+    }
+
+    // A zero size simply releases the buffer. Calling realloc() with a
+    // zero size is avoided because its behavior is implementation-defined.
+    if (new_size == 0) {
+        free(read_buffer_);
+        read_buffer_ = NULL;
+        read_buffer_size_ = 0;
+        return;
+    }
+
+    // Keep the result in a temporary: when realloc() fails it returns NULL
+    // and leaves the original block allocated, so assigning the result
+    // directly to read_buffer_ would leak the original block.
+    uint8_t* new_buffer = static_cast<uint8_t*>(realloc(read_buffer_,
+                                                        new_size));
+    if (new_buffer == NULL) {
+        // The old buffer doesn't have the requested size, so release it
+        // and report "no buffer" rather than keeping a wrongly sized one.
+        free(read_buffer_);
+        read_buffer_ = NULL;
+        read_buffer_size_ = 0;
+        return;
+    }
+
+    read_buffer_ = new_buffer;
+    read_buffer_size_ = new_size;
+}
+
 IfaceMgr::IfaceMgr()
     :control_buf_len_(CMSG_SPACE(sizeof(struct in6_pktinfo))),
      control_buf_(new char[control_buf_len_]),
@@ -411,18 +433,6 @@ bool
 IfaceMgr::openSockets4(const uint16_t port, const bool use_bcast,
                        IfaceMgrErrorMsgCallback error_handler) {
     int count = 0;
-
-// This option is used to bind sockets to particular interfaces.
-// This is currently the only way to discover on which interface
-// the broadcast packet has been received. If this option is
-// not supported then only one interface should be confugured
-// to listen for broadcast traffic.
-#ifdef SO_BINDTODEVICE
-    const bool bind_to_device = true;
-#else
-    const bool bind_to_device = false;
-#endif
-
     int bcast_num = 0;
 
     for (IfaceCollection::iterator iface = ifaces_.begin();
@@ -450,17 +460,26 @@ IfaceMgr::openSockets4(const uint16_t port, const bool use_bcast,
             // options on the socket so as it can receive and send broadcast
             // messages.
             if (iface->flag_broadcast_ && use_bcast) {
-                // If our OS supports binding socket to a device we can listen
-                // for broadcast messages on multiple interfaces. Otherwise we
-                // bind to INADDR_ANY address but we can do it only once. Thus,
-                // if one socket has been bound we can't do it any further.
-                if (!bind_to_device && bcast_num > 0) {
+                // The DHCP server must have means to determine which interface
+                // the broadcast packets are coming from. This is achieved by
+                // binding a socket to the device (interface) and specialized
+                // packet filters (e.g. BPF and LPF) implement this mechanism.
+                // If the PktFilterInet (generic one) is used, the socket is
+                // bound to INADDR_ANY which effectively binds the socket to
+                // all addresses on all interfaces. So, only one of those can
+                // be opened. Currently, the direct response support is
+                // provided by the PktFilterLPF and PktFilterBPF, so by checking
+                // the support for direct response we actually determine that
+                // one of those objects is in use. For all other objects we
+                // assume that binding to the device is not supported and we
+                // cease opening sockets and display the appropriate message.
+                if (!isDirectResponseSupported() && bcast_num > 0) {
                     IFACEMGR_ERROR(SocketConfigError, error_handler,
-                                   "SO_BINDTODEVICE socket option is"
-                                   " not supported on this OS;"
-                                   " therefore, DHCP server can only"
-                                   " listen broadcast traffic on a"
-                                   " single interface");
+                                   "Binding socket to an interface is not"
+                                   " supported on this OS; therefore only"
+                                   " one socket listening to broadcast traffic"
+                                   " can be opened. Sockets will not be opened"
+                                   " on remaining interfaces");
                     continue;
 
                 } else {
@@ -479,9 +498,7 @@ IfaceMgr::openSockets4(const uint16_t port, const bool use_bcast,
                     // Binding socket to an interface is not supported so we
                     // can't open any more broadcast sockets. Increase the
                     // number of open broadcast sockets.
-                    if (!bind_to_device) {
-                        ++bcast_num;
-                    }
+                    ++bcast_num;
                 }
 
             } else {

+ 48 - 1
src/lib/dhcp/iface_mgr.h

@@ -120,12 +120,21 @@ struct SocketInfo {
 
 };
 
-
 /// @brief Represents a single network interface
 ///
 /// Iface structure represents network interface with all useful
 /// information, like name, interface index, MAC address and
 /// list of assigned addresses
+///
+/// This class also holds the pointer to the socket read buffer.
+/// Functions reading from the socket may utilize this buffer to store the
+/// data being read from the socket. The advantage of using the
+/// pre-allocated buffer is that the buffer is allocated only once, rather
+/// than on every read. In addition, some OS specific code (e.g. BPF)
+/// may require use of fixed-size buffers. The size of such a buffer is
+/// returned by the OS kernel when the socket is opened. Hence, it is
+/// convenient to allocate the buffer when the socket is being opened and
+/// utilize it throughout the lifetime of the socket.
 class Iface {
 public:
 
@@ -154,6 +163,11 @@ public:
     /// @param ifindex interface index (unique integer identifier)
     Iface(const std::string& name, int ifindex);
 
+    /// @brief Destructor.
+    ///
+    /// Deallocates the socket read buffer.
+    ~Iface();
+
     /// @brief Closes all open sockets on interface.
     void closeSockets();
 
@@ -330,6 +344,29 @@ public:
         return unicasts_;
     }
 
+    /// @brief Returns the pointer to the buffer used for data reading.
+    ///
+    /// The returned pointer is only valid during the lifetime of the
+    /// object which returns it or until the buffer is resized.
+    /// This function is meant to be used with socket API to gather
+    /// data from the socket.
+    ///
+    /// @return Pointer to the first element of the read buffer or
+    /// NULL if the buffer is empty.
+    uint8_t* getReadBuffer() const {
+        // The pointer is initialized to NULL by the constructor; storage is
+        // allocated by resizeReadBuffer().
+        return (read_buffer_);
+    }
+
+    /// @brief Returns the current size of the socket read buffer.
+    size_t getReadBufferSize() const {
+        // Zero in the initial state and after a failed reallocation in
+        // resizeReadBuffer().
+        return (read_buffer_size_);
+    }
+
+    /// @brief Reallocates the socket read buffer.
+    ///
+    /// @param new_size New size of the buffer.
+    void resizeReadBuffer(const size_t new_size);
+
 protected:
     /// Socket used to send data.
     SocketCollection sockets_;
@@ -388,6 +425,16 @@ public:
     /// Indicates that IPv6 sockets should (true) or should not (false)
     /// be opened on this interface.
     bool inactive6_;
+
+private:
+
+    /// @brief Pointer to the buffer holding the data read from the socket.
+    ///
+    /// See the @c Iface class description for details.
+    uint8_t* read_buffer_;
+
+    /// @brief Allocated size of the read buffer.
+    size_t read_buffer_size_;
 };
 
 /// @brief This type describes the callback function invoked when error occurs

+ 16 - 4
src/lib/dhcp/iface_mgr_bsd.cc

@@ -18,6 +18,7 @@
 
 #include <dhcp/iface_mgr.h>
 #include <dhcp/iface_mgr_error_handler.h>
+#include <dhcp/pkt_filter_bpf.h>
 #include <dhcp/pkt_filter_inet.h>
 #include <exceptions/exceptions.h>
 
@@ -144,10 +145,21 @@ bool IfaceMgr::os_receive4(struct msghdr& /*m*/, Pkt4Ptr& /*pkt*/) {
 }
 
 void
-IfaceMgr::setMatchingPacketFilter(const bool /* direct_response_desired */) {
-    // @todo Currently we ignore the preference to use direct traffic
-    // because it hasn't been implemented for BSD systems.
-    setPacketFilter(PktFilterPtr(new PktFilterInet()));
+IfaceMgr::setMatchingPacketFilter(const bool direct_response_desired) {
+    // If direct response is desired we have to use BPF. If direct
+    // response is not desired we use the datagram socket provided by the
+    // PktFilterInet class. Note, however, that on BSD systems binding a
+    // datagram socket to a device is not supported and the server would
+    // have no means to determine on which interface a packet has been
+    // received. Hence, using PktFilterInet for the server is
+    // discouraged.
+    if (direct_response_desired) {
+        setPacketFilter(PktFilterPtr(new PktFilterBPF()));
+
+    } else {
+        setPacketFilter(PktFilterPtr(new PktFilterInet()));
+
+    }
 }
 
 bool

+ 13 - 17
src/lib/dhcp/libdhcp++.dox

@@ -159,32 +159,28 @@ address.
 Kea supports the use of raw sockets to create a complete Data-link/IP/UDP/DHCPv4
 stack. By creating each layer of the outgoing packet, the Kea logic has full
 control over the frame contents and it may bypass the use of ARP to inject the
-link layer address into the frame. The raw socket is bound to a specific interface,
-not to the IP address/UDP port. Therefore, the system kernel doesn't have
-means to verify that Kea is listening to the DHCP traffic on the specific address
-and port. This has two major implications:
+link layer address into the frame.
+
+The low level operations on raw sockets are implemented within the "packet
+filtering" classes derived from @c isc::dhcp::PktFilter. The implementation
+of these classes is specific to the operating system. On Linux the
+@c isc::dhcp::PktFilterLPF is used. On BSD systems the
+@c isc::dhcp::PktFilterBPF is used.
+
+The raw sockets are bound to a specific interface, not to the IP address/UDP port.
+Therefore, the system kernel doesn't have means to verify that Kea is listening
+to the DHCP traffic on the specific address and port. This has two major implications:
 - It is possible to run another DHCPv4 sever instance which will bind socket to the
 same address and port.
 - An attempt to send a unicast message to the DHCPv4 server will result in ICMP
 "Port Unreachable" message being sent by the kernel (which is unaware that the
 DHCPv4 service is actually running).
-In order to overcome these issues, the isc::dhcp::PktFilterLPF opens a
+
+In order to overcome these issues, the packet filtering classes open a
 regular IP/UDP socket which coexists with the raw socket. The socket is referred
 to as "fallback socket" in the Kea code. All packets received through this socket
 are discarded.
 
-In general, the use of datagram sockets is preferred over raw sockets.
-For convenience, the switchable Packet Filter objects are used to manage
-sockets for different purposes. These objects implement the socket opening
-operation and sending/receiving messages over this socket. For example:
-the isc::dhcp::PktFilterLPF object opens a raw socket.
-The isc::dhcp::PktFilterLPF::send and isc::dhcp::PktFilterLPF::receive
-methods encode/decode full data-link/IP/UDP/DHCPv4 stack. The
-isc::dhcp::PktFilterInet supports sending and receiving messages over
-the regular IP/UDP socket. The isc::dhcp::PktFilterInet should be used in all
-cases when an application using the libdhcp++ doesn't require sending
-DHCP messages to a device which doesn't have an address yet.
-
 @section libdhcpPktFilter6 Switchable Packet Filters for DHCPv6
 
 The DHCPv6 implementation doesn't suffer from the problems described in \ref

+ 2 - 2
src/lib/dhcp/pkt_filter.h

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
@@ -86,7 +86,7 @@ public:
     /// @param send_bcast configure socket to send broadcast messages.
     ///
     /// @return A structure describing a primary and fallback socket.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const bool receive_bcast,

+ 522 - 0
src/lib/dhcp/pkt_filter_bpf.cc

@@ -0,0 +1,522 @@
+// Copyright (C) 2014 Internet Systems Consortium, Inc. ("ISC")
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
+// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+// AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
+// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+// PERFORMANCE OF THIS SOFTWARE.
+
+#include <config.h>
+#include <dhcp/dhcp4.h>
+#include <dhcp/iface_mgr.h>
+#include <dhcp/pkt4.h>
+#include <dhcp/pkt_filter_bpf.h>
+#include <dhcp/protocol_util.h>
+#include <exceptions/exceptions.h>
+#include <algorithm>
+#include <net/bpf.h>
+#include <netinet/if_ether.h>
+
+namespace {
+
+using namespace isc::dhcp;
+
+/// @brief Maximum number of attempts to open BPF device.
+const unsigned int MAX_BPF_OPEN_ATTEMPTS = 100;
+
+/// @brief Length of the header containing the address family for the packet
+/// received on local loopback interface.
+const unsigned int BPF_LOCAL_LOOPBACK_HEADER_LEN = 4;
+
+/// The following structure defines a Berkeley Packet Filter program to perform
+/// packet filtering. The program operates on Ethernet packets.  To help with
+/// interpretation of the program, for the types of Ethernet packets we are
+/// interested in, the header layout is:
+///
+///   6 bytes  Destination Ethernet Address
+///   6 bytes  Source Ethernet Address
+///   2 bytes  Ethernet packet type
+///
+///  20 bytes  Fixed part of IP header
+///  variable  Variable part of IP header
+///
+///   2 bytes  UDP Source port
+///   2 bytes  UDP destination port
+///   4 bytes  Rest of UDP header
+///
+/// @todo We may want to extend the filter to receive packets sent
+/// to the particular IP address assigned to the interface or
+/// broadcast address.
+struct bpf_insn ethernet_ip_udp_filter [] = {
+    // Make sure this is an IP packet: check the half-word (two bytes)
+    // at offset 12 in the packet (the Ethernet packet type).  If it
+    // is, advance to the next instruction.  If not, advance 8
+    // instructions (which takes execution to the last instruction in
+    // the sequence: "drop it").
+    BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ETHERNET_PACKET_TYPE_OFFSET),
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 0, 8),
+
+    // Make sure it's a UDP packet.  The IP protocol is at offset
+    // 9 in the IP header so, adding the Ethernet packet header size
+    // of 14 bytes gives an absolute byte offset in the packet of 23.
+    BPF_STMT(BPF_LD + BPF_B + BPF_ABS,
+             ETHERNET_HEADER_LEN + IP_PROTO_TYPE_OFFSET),
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 6),
+
+    // Make sure this isn't a fragment by checking that the fragment
+    // offset field in the IP header is zero.  This field is the
+    // least-significant 13 bits in the bytes at offsets 6 and 7 in
+    // the IP header, so the half-word at offset 20 (6 + size of
+    // Ethernet header) is loaded and an appropriate mask applied.
+    BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ETHERNET_HEADER_LEN + IP_FLAGS_OFFSET),
+    BPF_JUMP(BPF_JMP + BPF_JSET + BPF_K, 0x1fff, 4, 0),
+
+    // Get the IP header length.  This is achieved by the following
+    // (special) instruction that, given the offset of the start
+    // of the IP header (offset 14) loads the IP header length.
+    BPF_STMT(BPF_LDX + BPF_B + BPF_MSH, ETHERNET_HEADER_LEN),
+
+    // Make sure it's to the right port.  The following instruction
+    // adds the previously extracted IP header length to the given
+    // offset to locate the correct byte.  The given offset of 16
+    // comprises the length of the Ethernet header (14) plus the offset
+    // of the UDP destination port (2) within the UDP header.
+    BPF_STMT(BPF_LD + BPF_H + BPF_IND, ETHERNET_HEADER_LEN + UDP_DEST_PORT),
+    // The following instruction tests against the default DHCP server port,
+    // but the actual port is set in PktFilterBPF::openSocket().
+    // N.B. The code in that method assumes that this instruction is at
+    // offset 8 in the program.  If this is changed, openSocket() must be
+    // updated.
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DHCP4_SERVER_PORT, 0, 1),
+
+    // If we passed all the tests, ask for the whole packet.
+    BPF_STMT(BPF_RET + BPF_K, (u_int)-1),
+
+    // Otherwise, drop it.
+    BPF_STMT(BPF_RET + BPF_K, 0),
+};
+
+/// The following structure defines a BPF program to perform packet filtering
+/// on the local loopback interface. The packets received on this interface do
+/// not contain the regular link-layer header, but rather a 4-byte long pseudo
+/// header containing the address family. The remainder of the packet contains
+/// the IP header, the UDP header and a DHCP message.
+struct bpf_insn loopback_ip_udp_filter [] = {
+    // Make sure this is an IP packet. The pseudo header comprises a 4-byte
+    // long value identifying the address family, which should be set to
+    // AF_INET. The default value used here (0xFFFFFFFF) must be overridden
+    // with htonl(AF_INET) from within the openSocket function.
+    BPF_STMT(BPF_LD + BPF_W + BPF_ABS, 0),
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, 0xFFFFFFFF, 0, 8),
+
+    // Make sure it's a UDP packet.  The IP protocol is at offset
+    // 9 in the IP header so, adding the pseudo header size of 4 bytes
+    // gives an absolute byte offset in the packet of 13.
+    BPF_STMT(BPF_LD + BPF_B + BPF_ABS,
+             BPF_LOCAL_LOOPBACK_HEADER_LEN + IP_PROTO_TYPE_OFFSET),
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 6),
+
+    // Make sure this isn't a fragment by checking that the fragment
+    // offset field in the IP header is zero.  This field is the
+    // least-significant 13 bits in the bytes at offsets 6 and 7 in
+    // the IP header, so the half-word at offset 10 (6 + size of
+    // pseudo header) is loaded and an appropriate mask applied.
+    BPF_STMT(BPF_LD + BPF_H + BPF_ABS,
+             BPF_LOCAL_LOOPBACK_HEADER_LEN + IP_FLAGS_OFFSET),
+    BPF_JUMP(BPF_JMP + BPF_JSET + BPF_K, 0x1fff, 4, 0),
+
+    // Get the IP header length.  This is achieved by the following
+    // (special) instruction that, given the offset of the start
+    // of the IP header (offset 4) loads the IP header length.
+    BPF_STMT(BPF_LDX + BPF_B + BPF_MSH, BPF_LOCAL_LOOPBACK_HEADER_LEN),
+
+    // Make sure it's to the right port.  The following instruction
+    // adds the previously extracted IP header length to the given
+    // offset to locate the correct byte.  The given offset of 6
+    // comprises the length of the pseudo header (4) plus the offset
+    // of the UDP destination port (2) within the UDP header.
+    BPF_STMT(BPF_LD + BPF_H + BPF_IND,
+             BPF_LOCAL_LOOPBACK_HEADER_LEN + UDP_DEST_PORT),
+    // The following instruction tests against the default DHCP server port,
+    // but the actual port is set in PktFilterBPF::openSocket().
+    // N.B. The code in that method assumes that this instruction is at
+    // offset 8 in the program.  If this is changed, openSocket() must be
+    // updated.
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DHCP4_SERVER_PORT, 0, 1),
+
+    // If we passed all the tests, ask for the whole packet.
+    BPF_STMT(BPF_RET + BPF_K, (u_int)-1),
+
+    // Otherwise, drop it.
+    BPF_STMT(BPF_RET + BPF_K, 0),
+};
+
+
+}
+
+using namespace isc::util;
+
+namespace isc {
+namespace dhcp {
+
+/// Opens the primary (BPF device) and the fallback socket for an interface.
+///
+/// The fallback datagram socket is opened first. Then the first BPF device
+/// which is not busy is opened, associated with the interface and configured
+/// with the filter program matching the interface type. Finally, the read
+/// buffer of the interface is resized to the length reported by the BPF
+/// device. Both descriptors are closed before an exception is thrown from
+/// any of the configuration steps.
+///
+/// @param iface Interface to associate the BPF device with. Its read buffer
+/// is resized by this function.
+/// @param addr Address used to open the fallback socket and stored in the
+/// returned structure.
+/// @param port Port matched by the filter program and used to open the
+/// fallback socket.
+///
+/// @return A structure holding the BPF device descriptor (primary socket)
+/// and the fallback socket descriptor.
+///
+/// @throw SocketConfigError if the BPF device can't be opened or configured.
+SocketInfo
+PktFilterBPF::openSocket(Iface& iface,
+                         const isc::asiolink::IOAddress& addr,
+                         const uint16_t port, const bool,
+                         const bool) {
+
+    // Open fallback socket first. If it fails, it will give us an indication
+    // that there is another service (perhaps DHCP server) running.
+    // The function will throw an exception and effectively cease opening
+    // the BPF device below.
+    int fallback = openFallbackSocket(addr, port);
+
+    // Fallback has opened, so let's open the BPF device that we will be
+    // using for receiving and sending packets. The BPF device is opened
+    // by opening a file /dev/bpf%d where %d is a number. There may be
+    // devices already open so we will try them one by one and open the
+    // one that is not busy.
+    int sock = -1;
+    for (unsigned int bpf_dev = 0;
+         bpf_dev < MAX_BPF_OPEN_ATTEMPTS && (sock < 0);
+         ++bpf_dev) {
+        std::ostringstream s;
+        s << "/dev/bpf" << bpf_dev;
+        sock = open(s.str().c_str(), O_RDWR, 0);
+        if (sock < 0) {
+            // If device is busy, try another one.
+            if (errno == EBUSY) {
+                continue;
+            }
+            // All other errors are fatal, so close the fallback socket
+            // and throw. The device path (not the stream object) is
+            // included in the message.
+            close(fallback);
+            isc_throw(SocketConfigError, "Failed to open BPF device "
+                      << s.str());
+        }
+    }
+
+    // All probed devices were busy. Report it here, rather than letting
+    // the first ioctl below fail on an invalid descriptor.
+    if (sock < 0) {
+        close(fallback);
+        isc_throw(SocketConfigError, "Failed to open BPF device: all "
+                  << MAX_BPF_OPEN_ATTEMPTS << " probed devices are busy");
+    }
+
+    // The BPF device is now open. Now it needs to be configured.
+
+    // Associate the device with the interface name. The structure has
+    // been zeroed, so copying at most one character less than the size of
+    // the name field guarantees that the name is null-terminated.
+    struct ifreq iface_data;
+    memset(&iface_data, 0, sizeof(iface_data));
+    std::strncpy(iface_data.ifr_name, iface.getName().c_str(),
+                 sizeof(iface_data.ifr_name) - 1);
+    if (ioctl(sock, BIOCSETIF, &iface_data) < 0) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Failed to associate BPF device "
+                  " with interface " << iface.getName());
+    }
+
+    // Get the BPF version supported by the kernel. Every application
+    // must check this version against the current version in use.
+    struct bpf_version ver;
+    if (ioctl(sock, BIOCVERSION, &ver) < 0) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Failed to obtain the BPF version"
+                  " number from the kernel");
+    }
+    // Major BPF version must match and the minor version that the kernel
+    // runs must be at least the current version in use.
+    if ((ver.bv_major != BPF_MAJOR_VERSION) ||
+        (ver.bv_minor < BPF_MINOR_VERSION)) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Invalid BPF version: "
+                  << ver.bv_major << "." << ver.bv_minor
+                  << " Expected at least version:"
+                  << BPF_MAJOR_VERSION << "."
+                  << BPF_MINOR_VERSION);
+    }
+
+    // Get the size of the read buffer for this device. We will need to
+    // allocate the buffer of this size for packet reads.
+    unsigned int buf_len = 0;
+    if (ioctl(sock, BIOCGBLEN, &buf_len) < 0) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Unable to obtain the required"
+                  " buffer legth for reads from BPF device");
+    }
+
+    if (buf_len < sizeof(bpf_hdr)) {
+        // Release both descriptors, as on every other error path.
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "read buffer length returned by the"
+                  " kernel for the BPF device associated with the interface "
+                  << iface.getName() << " is lower than the BPF header"
+                  " length: this condition is impossible unless the"
+                  " operating system is really broken!");
+    }
+
+    // Set the filter program so as we only get packets we are interested in.
+    struct bpf_program prog;
+    memset(&prog, 0, sizeof(bpf_program));
+    if (iface.flag_loopback_) {
+        prog.bf_insns = loopback_ip_udp_filter;
+        prog.bf_len = sizeof(loopback_ip_udp_filter) / sizeof(struct bpf_insn);
+        // The address family is AF_INET. It can't be hardcoded in the BPF program
+        // because we need to make the host to network order conversion using htonl
+        // and conversion can't be done within the BPF program structure as it
+        // doesn't work on some systems.
+        prog.bf_insns[1].k = htonl(AF_INET);
+
+    } else {
+        prog.bf_insns = ethernet_ip_udp_filter;
+        prog.bf_len = sizeof(ethernet_ip_udp_filter) / sizeof(struct bpf_insn);
+    }
+
+    // Configure the BPF program to receive packets on the specified port.
+    // The port comparison is the instruction at offset 8 in both programs.
+    prog.bf_insns[8].k = port;
+
+    // Actually set the filter program for the device.
+    if (ioctl(sock, BIOCSETF, &prog) < 0) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Failed to install BPF filter"
+                  " program");
+    }
+
+    // Configure the BPF device to use the immediate mode. This ensures
+    // that the read function returns immediately, instead of waiting
+    // for the kernel to fill up the buffer, which would likely cause
+    // read hangs.
+    int flag = 1;
+    if (ioctl(sock, BIOCIMMEDIATE, &flag) < 0) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Failed to set immediate mode for"
+                  " BPF device");
+    }
+
+    // Everything is ok, allocate the read buffer. If the allocation fails
+    // the buffer size differs from the requested one, and the sockets
+    // would be unusable for reads, so give up now.
+    iface.resizeReadBuffer(buf_len);
+    if (iface.getReadBufferSize() != buf_len) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Failed to allocate the read buffer"
+                  " for the BPF device associated with the interface "
+                  << iface.getName());
+    }
+
+    // Return the socket (BPF device descriptor) to the caller.
+    return (SocketInfo(addr, port, sock, fallback));
+}
+
+Pkt4Ptr
+PktFilterBPF::receive(const Iface& iface, const SocketInfo& socket_info) {
+    // When using BPF, the read buffer must be allocated for the interface.
+    // If it is not allocated, it is a programmatic error.
+    if (iface.getReadBufferSize() == 0) {
+        isc_throw(SocketConfigError, "socket read buffer not allocated"
+                  " for the interface: " << iface.getName());
+    }
+
+    // First let's get some data from the fallback socket. The data will be
+    // discarded but we don't want the socket buffer to bloat. We get the
+    // packets from the socket in loop but most of the time the loop will
+    // end after receiving one packet. The call to recv returns immediately
+    // when there is no data left on the socket because the socket is
+    // non-blocking.
+    // @todo In the normal conditions, both the primary socket and the fallback
+    // socket are in sync as they are set to receive packets on the same
+    // address and port. The reception of packets on the fallback socket
+    // shouldn't cause significant lags in packet reception. If we find in the
+    // future that it does, the sort of threshold could be set for the maximum
+    // bytes received on the fallback socket in a single round. Further
+    // optimizations would include an asynchronous read from the fallback socket
+    // when the DHCP server is idle.
+    int datalen;
+    do {
+        datalen = recv(socket_info.fallbackfd_, iface.getReadBuffer(),
+                       iface.getReadBufferSize(), 0);
+    } while (datalen > 0);
+
+    datalen = read(socket_info.sockfd_, iface.getReadBuffer(),
+                   iface.getReadBufferSize());
+    // If negative value is returned by read(), it indicates that an
+    // error occured. If returned value is 0, no data was read from the
+    // socket. In both cases something has gone wrong, because we expect
+    // that a chunk of data is there. We signal the lack of data by
+    // returing an empty packet.
+    if (datalen <= 0) {
+        return Pkt4Ptr();
+    }
+    datalen = BPF_WORDALIGN(datalen);
+
+    // Holds BPF header.
+    struct bpf_hdr bpfh;
+
+    /// @todo BPF may occasionally append more than one packet in a
+    /// single read. Our current libdhcp++ API is oriented towards receiving
+    /// one packet at the time so we just pick first usable packet here
+    /// and drop other packets. In the future the additional packets should
+    /// be queued and processed. For now, we just iterate over the packets
+    /// in the buffer and pick the first usable one.
+    int offset = 0;
+    while (offset < datalen) {
+        // Check if the BPF header fits in the reminder of the buffer.
+        // If it doesn't something is really wrong.
+        if (datalen - offset < sizeof(bpf_hdr)) {
+            isc_throw(SocketReadError, "packet received over the BPF device on"
+                      " interface " << iface.getName() << " has a truncated "
+                      " BPF header");
+        }
+
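+        // Copy the BPF header.
+        // NOTE(review): the source pointer below is the start of the read
+        // buffer, not the start plus 'offset'. After a packet is skipped the
+        // header of the first packet would be copied again - confirm whether
+        // '+ offset' is missing here.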
+        memcpy(static_cast<void*>(&bpfh),
+               static_cast<void*>(iface.getReadBuffer()),
+               sizeof(bpfh));
+
+        // Check if the captured data fit into the remainder of the buffer.
+        // Again, something is really wrong here if they don't fit.
+        if (offset + bpfh.bh_hdrlen + bpfh.bh_caplen > datalen) {
+            isc_throw(SocketReadError, "packet received from the BPF device"
+                      << " attached to interface " << iface.getName()
+                      << " is truncated");
+        }
+
+        // Check if the whole packet has been captured.
+        if (bpfh.bh_caplen != bpfh.bh_datalen) {
+            // Not whole packet captured, proceed to next received packet.
+            offset = BPF_WORDALIGN(offset + bpfh.bh_hdrlen + bpfh.bh_caplen);
+            continue;
+        }
+
+        // All checks passed, let's use the packet at the offset found.
+        // Typically it will be at offset 0.
+        break;
+    };
+
+    // No parsable packet found, so return.
+    if (offset >= datalen) {
+        return (Pkt4Ptr());
+    }
+
+    // Skip the BPF header and create the buffer holding a frame.
+    InputBuffer buf(iface.getReadBuffer() + offset + bpfh.bh_hdrlen,
+                    datalen - bpfh.bh_hdrlen - offset);
+
+
+    // @todo: This is awkward way to solve the chicken and egg problem
+    // whereby we don't know the offset where DHCP data start in the
+    // received buffer when we create the packet object. In general case,
+    // the IP header has variable length. The information about its length
+    // is stored in one of its fields. Therefore, we have to decode the
+    // packet to get the offset of the DHCP data. The dummy object is
+    // created so as we can pass it to the functions which decode IP stack
+    // and find actual offset of the DHCP data.
+    // Once we find the offset we can create another Pkt4 object from
+    // the remainder of the input buffer and set the IP addresses and
+    // ports from the dummy packet. We should consider doing it
+    // in some more elegant way.
+    Pkt4Ptr dummy_pkt = Pkt4Ptr(new Pkt4(DHCPDISCOVER, 0));
+
+    // On local loopback interface the ethernet header is not present.
+    // Instead, there is a 4-byte long pseudo header containing the
+    // address family in the host byte order.
+    if (iface.flag_loopback_) {
+        if (buf.getLength() < BPF_LOCAL_LOOPBACK_HEADER_LEN) {
+            isc_throw(SocketReadError, "packet received on local loopback"
+                      " interface " << iface.getName() << " doesn't contain"
+                      " the pseudo header with the address family type");
+        }
+        // Advance to the position of the IP header. We don't check the
+        // contents of the pseudo header because the BPF filter should have
+        // filtered out the packets with address family other than AF_INET.
+        buf.setPosition(BPF_LOCAL_LOOPBACK_HEADER_LEN);
+
+        // Since we don't decode the real link-layer header we need to
+        // supply the hardware address ourselves.
+        dummy_pkt->setLocalHWAddr(HWAddrPtr(new HWAddr()));
+        dummy_pkt->setRemoteHWAddr(HWAddrPtr(new HWAddr()));
+
+    } else {
+        // If we are on the interface other than local loopback, assume
+        // the ethernet header. For now we don't support any other data
+        // link layer.
+        decodeEthernetHeader(buf, dummy_pkt);
+    }
+
+    // Decode IP/UDP headers.
+    decodeIpUdpHeader(buf, dummy_pkt);
+
+    // Read the DHCP data.
+    std::vector<uint8_t> dhcp_buf;
+    buf.readVector(dhcp_buf, buf.getLength() - buf.getPosition());
+
+    // Decode DHCP data into the Pkt4 object.
+    Pkt4Ptr pkt = Pkt4Ptr(new Pkt4(&dhcp_buf[0], dhcp_buf.size()));
+
+    // Set the appropriate packet members using data collected from
+    // the decoded headers.
+    pkt->setIndex(iface.getIndex());
+    pkt->setIface(iface.getName());
+    pkt->setLocalAddr(dummy_pkt->getLocalAddr());
+    pkt->setRemoteAddr(dummy_pkt->getRemoteAddr());
+    pkt->setLocalPort(dummy_pkt->getLocalPort());
+    pkt->setRemotePort(dummy_pkt->getRemotePort());
+    pkt->setLocalHWAddr(dummy_pkt->getLocalHWAddr());
+    pkt->setRemoteHWAddr(dummy_pkt->getRemoteHWAddr());
+
+    return (pkt);
+}
+
+int
+PktFilterBPF::send(const Iface& iface, uint16_t sockfd, const Pkt4Ptr& pkt) {
+
+    OutputBuffer buf(14); // 14 is presumably the Ethernet header length - TODO confirm
+
+    // Some interfaces may have no HW address - e.g. loopback interface.
+    // For these interfaces the HW address length is 0. If this is the case,
+    // then we will rely on the functions which construct the IP/UDP headers
+    // to provide a default HW address. Otherwise, create the HW address
+    // object using the HW address of the interface and store it in the
+    // packet as its local (source) HW address.
+    if (iface.getMacLen() > 0) {
+        HWAddrPtr hwaddr(new HWAddr(iface.getMac(), iface.getMacLen(),
+                                    iface.getHWType()));
+        pkt->setLocalHWAddr(hwaddr);
+    }
+
+    /// The loopback interface requires special treatment. It doesn't
+    /// use the ethernet header but rather a 4-byte long pseudo header
+    /// holding an address family type (see bpf.c in OS sources).
+    if (iface.flag_loopback_) {
+        writeAFPseudoHeader(AF_INET, buf);
+
+    } else {
+        // Ethernet frame header.
+        // Note that we don't validate whether HW addresses in 'pkt'
+        // are valid because they are validated by the function called.
+        writeEthernetHeader(pkt, buf);
+    }
+
+    // IP and UDP headers.
+    writeIpUdpHeader(pkt, buf);
+
+    // DHCPv4 message, copied from the packet's own buffer.
+    buf.writeData(pkt->getBuffer().getData(), pkt->getBuffer().getLength());
+
+    int result = write(sockfd, buf.getData(), buf.getLength());
+    if (result < 0) { // only a negative result is an error; a short write is not detected
+        isc_throw(SocketWriteError, "failed to send DHCPv4 packet: "
+                  << strerror(errno));
+    }
+
+    return (0); // always 0 here; a failed write is reported by the exception above
+}
+
+void
+PktFilterBPF::writeAFPseudoHeader(const uint32_t address_family,
+                                  util::OutputBuffer& out_buf) {
+    // Copy the raw bytes of the address family into a temporary 4-byte
+    // buffer, so the value keeps the byte order it has in memory.
+    uint8_t af_buf[4];
+    memcpy(static_cast<void*>(af_buf),
+           static_cast<const void*>(&address_family),
+           sizeof(af_buf));
+    // Append the 4 bytes to the output buffer unchanged.
+    out_buf.writeData(af_buf, sizeof(af_buf));
+}
+
+} // end of isc::dhcp namespace
+} // end of isc namespace

+ 150 - 0
src/lib/dhcp/pkt_filter_bpf.h

@@ -0,0 +1,150 @@
+// Copyright (C) 2014 Internet Systems Consortium, Inc. ("ISC")
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
+// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+// AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
+// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+// PERFORMANCE OF THIS SOFTWARE.
+
+#ifndef PKT_FILTER_BPF_H
+#define PKT_FILTER_BPF_H
+
+#include <dhcp/pkt_filter.h>
+
+#include <util/buffer.h>
+
+namespace isc {
+namespace dhcp {
+
+/// @brief Packet handling class using Berkeley Packet Filtering (BPF)
+///
+/// The BPF is supported on the BSD-like operating systems. It allows for access
+/// to low level layers of the inbound and outbound packets. This is specifically
+/// useful when the DHCP server is allocating a new address to the client.
+///
+/// The response being sent to the client must include the HW address in the
+/// datalink layer. When the regular datagram socket is used the kernel will
+/// determine the HW address of the destination using ARP. In the case when
+/// the DHCP server is allocating the new address for the client the ARP can't
+/// be used because it requires the destination to have the IP address.
+///
+/// The DHCP server utilizes HW address sent by the client in the DHCP message
+/// and stores it in the datalink layer of the outbound packet. The BPF provides
+/// the means for crafting the whole packet (including datalink and network
+/// layers) and injecting the hardware address of the client.
+///
+/// The DHCP server receiving the messages sent from the directly connected
+/// clients to the broadcast address must be able to determine the interface
+/// on which the message arrives. The Linux kernel provides the SO_BINDTODEVICE
+/// socket option which allows for binding the socket to the particular
+/// interface. This option is not implemented on the BSD-like operating
+/// systems. This implies that there may be only one datagram socket listening
+/// to broadcast messages and this socket would receive the traffic on all
+/// interfaces. This effectively precludes the server from identifying the
+/// interface on which the packet arrived. The BPF resolves this problem.
+/// The BPF device (socket) can be attached to the selected interface using
+/// the ioctl function.
+///
+/// In a nutshell, the BPF device is created by opening the file /dev/bpf%d
+/// where %d is a number. The BPF device is configured by issuing ioctl
+/// commands listed here: http://www.freebsd.org/cgi/man.cgi?bpf(4).
+/// The specific configuration used by Kea DHCP server is described in
+/// documentation of @c PktFilterBPF::openSocket.
+///
+/// Use of BPF requires Kea to encode and decode the datalink and network
+/// layer headers. Currently Kea supports encoding and decoding ethernet
+/// frames on physical interfaces and pseudo headers received on local
+/// loopback interface.
+class PktFilterBPF : public PktFilter {
+public:
+
+    /// @brief Check if packet can be sent to the host without address directly.
+    ///
+    /// This class supports direct responses to the host without address.
+    ///
+    /// @return true always.
+    virtual bool isDirectResponseSupported() const {
+        return (true);
+    }
+
+    /// @brief Open primary and fallback socket.
+    ///
+    /// This method opens the BPF device and applies the following
+    /// configuration to it:
+    /// - attach the device to the specified interface
+    /// - set filter program to receive DHCP messages encapsulated in UDP
+    /// packets
+    /// - set immediate mode which causes the read function to return
+    /// immediately and not wait for the whole read buffer to be filled
+    /// by the kernel (to avoid hangs)
+    ///
+    /// It also obtains the following configuration from the kernel:
+    /// - major and minor version of the BPF (and checks if it is valid)
+    /// - length of the buffer to be used to receive the data from the socket
+    ///
+    /// @param iface Interface descriptor. Note that the function (re)allocates
+    /// the socket read buffer according to the buffer size returned by the
+    /// kernel.
+    /// @param addr Address on the interface to be used to send packets.
+    /// @param port Port number.
+    /// @param receive_bcast Configure socket to receive broadcast messages.
+    /// @param send_bcast Configure socket to send broadcast messages.
+    ///
+    /// @return A structure describing a primary and fallback socket.
+    virtual SocketInfo openSocket(Iface& iface,
+                                  const isc::asiolink::IOAddress& addr,
+                                  const uint16_t port,
+                                  const bool receive_bcast,
+                                  const bool send_bcast);
+
+    /// @brief Receive packet over specified socket.
+    ///
+    /// @param iface interface
+    /// @param socket_info structure holding socket information
+    ///
+    /// @return Received packet
+    virtual Pkt4Ptr receive(const Iface& iface, const SocketInfo& socket_info);
+
+    /// @brief Send packet over specified socket.
+    ///
+    /// @param iface interface to be used to send packet
+    /// @param sockfd socket descriptor
+    /// @param pkt packet to be sent
+    ///
+    /// @return result of sending a packet. It is 0 if successful.
+    virtual int send(const Iface& iface, uint16_t sockfd,
+                     const Pkt4Ptr& pkt);
+
+private:
+
+    /// @brief Writes pseudo header containing an address family into a buffer.
+    ///
+    /// BPF utilizes the pseudo headers to pass the ancillary data between the
+    /// kernel and the application. For example, when the packet is to be sent
+    /// over the local loopback interface the pseudo header must be added before
+    /// the network layer header to indicate the address family. Other link
+    /// layer header (e.g. ethernet) is not used for local loopback interface.
+    ///
+    /// The header written by this method consists of 4 bytes and contains the
+    /// address family value in host byte order. See sys/socket.h for the
+    /// address family values. Typically it will be AF_INET.
+    ///
+    /// This function doesn't throw.
+    ///
+    /// @param address_family Address family (e.g. AF_INET).
+    /// @param [out] out_buf buffer where a header is written.
+    void writeAFPseudoHeader(const uint32_t address_family,
+                             util::OutputBuffer& out_buf);
+
+};
+
+} // namespace isc::dhcp
+} // namespace isc
+
+#endif // PKT_FILTER_BPF_H

+ 1 - 1
src/lib/dhcp/pkt_filter_inet.cc

@@ -31,7 +31,7 @@ PktFilterInet::PktFilterInet()
 }
 
 SocketInfo
-PktFilterInet::openSocket(const Iface& iface,
+PktFilterInet::openSocket(Iface& iface,
                           const isc::asiolink::IOAddress& addr,
                           const uint16_t port,
                           const bool receive_bcast,

+ 2 - 2
src/lib/dhcp/pkt_filter_inet.h

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
@@ -55,7 +55,7 @@ public:
     /// @return A structure describing a primary and fallback socket.
     /// @throw isc::dhcp::SocketConfigError if error occurs when opening,
     /// binding or configuring the socket.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const bool receive_bcast,

+ 2 - 2
src/lib/dhcp/pkt_filter_lpf.cc

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
@@ -103,7 +103,7 @@ namespace isc {
 namespace dhcp {
 
 SocketInfo
-PktFilterLPF::openSocket(const Iface& iface,
+PktFilterLPF::openSocket(Iface& iface,
                          const isc::asiolink::IOAddress& addr,
                          const uint16_t port, const bool,
                          const bool) {

+ 2 - 2
src/lib/dhcp/pkt_filter_lpf.h

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
@@ -49,7 +49,7 @@ public:
     /// @param send_bcast Configure socket to send broadcast messages.
     ///
     /// @return A structure describing a primary and fallback socket.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const bool receive_bcast,

+ 6 - 0
src/lib/dhcp/tests/Makefile.am

@@ -78,10 +78,16 @@ libdhcp___unittests_SOURCES += pkt_filter6_test_stub.cc pkt_filter_test_stub.h
 libdhcp___unittests_SOURCES += pkt_filter_test_utils.h pkt_filter_test_utils.cc
 libdhcp___unittests_SOURCES += pkt_filter6_test_utils.h pkt_filter6_test_utils.cc
 
+# Utilize Linux Packet Filtering on Linux.
 if OS_LINUX
 libdhcp___unittests_SOURCES += pkt_filter_lpf_unittest.cc
 endif
 
+# Utilize Berkeley Packet Filtering on BSD.
+if OS_BSD
+libdhcp___unittests_SOURCES += pkt_filter_bpf_unittest.cc
+endif
+
 libdhcp___unittests_SOURCES += protocol_util_unittest.cc
 libdhcp___unittests_SOURCES += duid_unittest.cc
 

+ 55 - 21
src/lib/dhcp/tests/iface_mgr_unittest.cc

@@ -17,6 +17,7 @@
 #include <asiolink/io_address.h>
 #include <dhcp/dhcp4.h>
 #include <dhcp/iface_mgr.h>
+#include <dhcp/option.h>
 #include <dhcp/pkt6.h>
 #include <dhcp/pkt_filter.h>
 #include <dhcp/tests/iface_mgr_test_config.h>
@@ -56,6 +57,39 @@ const uint16_t PORT2 = 10548;   // V4 socket
 // tolerance to 0.01s.
 const uint32_t TIMEOUT_TOLERANCE = 10000;
 
+/// This test verifies that the socket read buffer of the Iface object can
+/// be resized, written to, and that the written data can be read back.
+TEST(IfaceTest, readBuffer) {
+    // Create fake interface object.
+    Iface iface("em0", 0);
+    // The size of the read buffer should initially be 0 and the returned
+    // pointer should be NULL.
+    ASSERT_EQ(0, iface.getReadBufferSize());
+    EXPECT_EQ(NULL, iface.getReadBuffer());
+
+    // Let's resize the buffer.
+    iface.resizeReadBuffer(256);
+    // Check that the buffer has the expected size.
+    ASSERT_EQ(256, iface.getReadBufferSize());
+    // The returned pointer should now be non-NULL.
+    uint8_t* buf_ptr = iface.getReadBuffer();
+    ASSERT_FALSE(buf_ptr == NULL);
+
+    // Use the pointer to fill the buffer with the values 0..255.
+    for (int i = 0; i < iface.getReadBufferSize(); ++i) {
+        buf_ptr[i] = i;
+    }
+
+    // Get the pointer again and validate the data.
+    buf_ptr = iface.getReadBuffer();
+    ASSERT_EQ(256, iface.getReadBufferSize());
+    for (int i = 0; i < iface.getReadBufferSize(); ++i) {
+        // Use ASSERT so that the test stops at the first mismatch; there
+        // is no need to continue with further checks.
+        ASSERT_EQ(i, buf_ptr[i]);
+    }
+}
+
 /// Mock object implementing PktFilter class.  It is used by
 /// IfaceMgrTest::setPacketFilter to verify that IfaceMgr::setPacketFilter
 /// sets this object as a handler for opening sockets. This dummy
@@ -96,7 +130,7 @@ public:
     /// @param iface An interface on which the socket is to be opened.
     /// @param addr An address to which the socket is to be bound.
     /// @param port A port to which the socket is to be bound.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const bool join_multicast,
@@ -1256,12 +1290,12 @@ TEST_F(IfaceMgrTest, setPacketFilter6) {
 }
 
 
-#if defined OS_LINUX
+#if defined OS_LINUX || OS_BSD
 
-// This Linux specific test checks whether it is possible to use
-// IfaceMgr to figure out which Pakcket Filter object should be
-// used when direct responses to hosts, having no address assigned
-// are desired or not desired.
+// This test is only supported on Linux and BSD systems. It checks
+// if it is possible to use the IfaceMgr to select the packet filter
+// object which can be used to send direct responses to the host
+// which doesn't have an address yet.
 TEST_F(IfaceMgrTest, setMatchingPacketFilter) {
 
     // Create an instance of IfaceMgr.
@@ -1270,28 +1304,27 @@ TEST_F(IfaceMgrTest, setMatchingPacketFilter) {
 
     // Let IfaceMgr figure out which Packet Filter to use when
     // direct response capability is not desired. It should pick
-    // PktFilterInet.
+    // PktFilterInet on Linux.
     EXPECT_NO_THROW(iface_mgr->setMatchingPacketFilter(false));
     // The PktFilterInet is supposed to report lack of direct
     // response capability.
     EXPECT_FALSE(iface_mgr->isDirectResponseSupported());
 
     // There is working implementation of direct responses on Linux
-    // in PktFilterLPF. It uses Linux Packet Filtering as underlying
-    // mechanism. When direct responses are desired the object of
-    // this class should be set.
+    // and BSD (using PktFilterLPF and PktFilterBPF respectively). When
+    // direct responses are desired an object of one of these classes
+    // should be set.
     EXPECT_NO_THROW(iface_mgr->setMatchingPacketFilter(true));
     // This object should report that direct responses are supported.
     EXPECT_TRUE(iface_mgr->isDirectResponseSupported());
 }
 
 // This test checks that it is not possible to open two sockets: IP/UDP
-// and raw (LPF) socket and bind to the same address and port. The
+// and raw socket and bind to the same address and port. The
 // raw socket should be opened together with the fallback IP/UDP socket.
 // The fallback socket should fail to open when there is another IP/UDP
 // socket bound to the same address and port. Failing to open the fallback
 // socket should preclude the raw socket from being open.
-TEST_F(IfaceMgrTest, checkPacketFilterLPFSocket) {
+TEST_F(IfaceMgrTest, checkPacketFilterRawSocket) {
     IOAddress loAddr("127.0.0.1");
     int socket1 = -1, socket2 = -1;
     // Create two instances of IfaceMgr.
@@ -1335,15 +1368,16 @@ TEST_F(IfaceMgrTest, checkPacketFilterLPFSocket) {
 
 #else
 
-// This non-Linux specific test checks whether it is possible to use
-// IfaceMgr to figure out which Pakcket Filter object should be
-// used when direct responses to hosts, having no address assigned
-// are desired or not desired. Since direct responses aren't supported
-// on systems other than Linux the function under test should always
-// set object of PktFilterInet type as current Packet Filter. This
-// object does not support direct responses. Once implementation is
-// added on non-Linux systems the OS specific version of the test
-// will be removed.
+// Note: This test will only run on non-Linux and non-BSD systems.
+// This test checks whether it is possible to use IfaceMgr to figure
+// out which Packet Filter object should be used when direct responses
+// to hosts having no address assigned are desired or not desired.
+// Since direct responses aren't supported on systems other than Linux
+// and BSD, the function under test should always set an object of
+// PktFilterInet type as the current Packet Filter. This object does not
+// support direct responses. Once an implementation is added on systems
+// other than BSD and Linux, the OS specific version of the test will
+// be removed.
 TEST_F(IfaceMgrTest, setMatchingPacketFilter) {
 
     // Create an instance of IfaceMgr.

+ 205 - 0
src/lib/dhcp/tests/pkt_filter_bpf_unittest.cc

@@ -0,0 +1,205 @@
+// Copyright (C) 2014 Internet Systems Consortium, Inc. ("ISC")
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
+// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+// AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
+// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+// PERFORMANCE OF THIS SOFTWARE.
+
+#include <config.h>
+#include <asiolink/io_address.h>
+#include <dhcp/iface_mgr.h>
+#include <dhcp/pkt4.h>
+#include <dhcp/pkt_filter_bpf.h>
+#include <dhcp/protocol_util.h>
+#include <dhcp/tests/pkt_filter_test_utils.h>
+#include <util/buffer.h>
+
+#include <gtest/gtest.h>
+
+#include <net/bpf.h>
+#include <sys/socket.h>
+
+using namespace isc::asiolink;
+using namespace isc::dhcp;
+using namespace isc::util;
+
+namespace {
+
+/// Port number used by tests.
+const uint16_t PORT = 10067;
+/// Size of the buffer holding received packets.
+const size_t RECV_BUF_SIZE = 4096;
+
+// Test fixture class. It inherits from the class common for all packet
+// filter tests and passes PORT to the base class constructor.
+class PktFilterBPFTest : public isc::dhcp::test::PktFilterTest {
+public:
+    PktFilterBPFTest() : PktFilterTest(PORT) {
+    }
+};
+
+// This test verifies that the PktFilterBPF class reports that it is able
+// to send packets to a host which has no IP address assigned.
+TEST_F(PktFilterBPFTest, isDirectResponseSupported) {
+    // Create object under test.
+    PktFilterBPF pkt_filter;
+    // Must support direct responses.
+    EXPECT_TRUE(pkt_filter.isDirectResponseSupported());
+}
+
+// All tests below require root privileges to execute successfully. If
+// they are run as non-root user they will fail due to insufficient privileges
+// to open raw network sockets. Therefore, they should remain disabled by default
+// and "DISABLED_" tags should not be removed. If one is willing to run these
+// tests please run "make check" as root and enable execution of disabled tests
+// by setting GTEST_ALSO_RUN_DISABLED_TESTS to a value other than 0. In order
+// to run tests from this particular file, set the GTEST_FILTER environmental
+// variable to "PktFilterBPFTest.*" apart from GTEST_ALSO_RUN_DISABLED_TESTS
+// setting.
+
+// This test verifies that the BPF device can be opened for the
+// loopback interface and that the fallback socket is opened with it.
+TEST_F(PktFilterBPFTest, DISABLED_openSocket) {
+    // Create object representing loopback interface.
+    Iface iface(ifname_, ifindex_);
+    iface.flag_loopback_ = true;
+    // Set loopback address.
+    IOAddress addr("127.0.0.1");
+
+    // Try to open socket.
+    PktFilterBPF pkt_filter;
+    ASSERT_NO_THROW(
+        sock_info_ = pkt_filter.openSocket(iface, addr, PORT, false, false);
+    );
+
+    // Check that the primary socket has been opened.
+    ASSERT_GE(sock_info_.sockfd_, 0);
+    // Check that the fallback socket has been opened too.
+    ASSERT_GE(sock_info_.fallbackfd_, 0);
+}
+
+// This test verifies correctness of sending a DHCP packet through the BPF
+// device attached to the local loopback interface. Note that this is not
+// exactly the same as sending over a hardware interface (e.g. ethernet)
+// because the packet format is different on the local loopback interface
+// when using the BPF. The key difference is that the pseudo header containing
+// the address family is sent instead of the link-layer header. Ideally we
+// would run this test over a real interface but since we don't know what
+// interfaces are present in the particular system we have to stick to the
+// local loopback interface as this one is almost always present.
+TEST_F(PktFilterBPFTest, DISABLED_send) {
+    // Packet will be sent over loopback interface.
+    Iface iface(ifname_, ifindex_);
+    iface.flag_loopback_ = true;
+    IOAddress addr("127.0.0.1");
+
+    // Create an instance of the class which we are testing.
+    PktFilterBPF pkt_filter;
+
+    // Open BPF device.
+    sock_info_ = pkt_filter.openSocket(iface, addr, PORT, false, false);
+    // Returned descriptor must not be negative. 0 is valid.
+    ASSERT_GE(sock_info_.sockfd_, 0);
+
+    // Send the packet over the socket.
+    ASSERT_NO_THROW(pkt_filter.send(iface, sock_info_.sockfd_, test_message_));
+
+    // Wait (up to 5 seconds) for the data to be readable from the socket.
+    fd_set readfds;
+    FD_ZERO(&readfds);
+    FD_SET(sock_info_.sockfd_, &readfds);
+
+    struct timeval timeout;
+    timeout.tv_sec = 5;
+    timeout.tv_usec = 0;
+    int result = select(sock_info_.sockfd_ + 1, &readfds, NULL, NULL, &timeout);
+    // We should receive some data from loopback interface.
+    ASSERT_GT(result, 0);
+
+    /// Get the actual data.
+    uint8_t rcv_buf[RECV_BUF_SIZE];
+    result = read(sock_info_.sockfd_, rcv_buf, RECV_BUF_SIZE);
+    ASSERT_GT(result, 0);
+
+    // Each packet is prepended with the BPF header structure. We have to
+    // parse this structure to locate the position of the address family
+    // pseudo header.
+    struct bpf_hdr bpfh;
+    memcpy(static_cast<void*>(&bpfh), static_cast<void*>(rcv_buf),
+           sizeof(bpf_hdr));
+    // bh_hdrlen contains the total length of the BPF header, including
+    // alignment. We will use this value to skip over the BPF header and
+    // parse the contents of the packet that we are interested in.
+    uint32_t bpfh_len = bpfh.bh_hdrlen;
+    // Address Family pseudo header contains the address family of the
+    // packet (used for local loopback interface instead of the link-layer
+    // header such as ethernet frame header).
+    uint32_t af = 0;
+    memcpy(static_cast<void*>(&af),
+           static_cast<void*>(rcv_buf + bpfh_len), 4);
+    // Check the value in the pseudo header. If this is incorrect, something
+    // is really broken, so let's exit.
+    ASSERT_EQ(AF_INET, af);
+
+    Pkt4Ptr dummy_pkt = Pkt4Ptr(new Pkt4(DHCPDISCOVER, 0));
+    // Create the input buffer from the remainder of the packet. This should
+    // only contain the IP/UDP headers and the DHCP message.
+    InputBuffer buf(rcv_buf + bpfh_len + 4, result - bpfh_len - 4);
+    ASSERT_GE(buf.getLength(), test_message_->len());
+
+    decodeIpUdpHeader(buf, dummy_pkt);
+
+    // Create the DHCPv4 packet from the received data.
+    std::vector<uint8_t> dhcp_buf;
+    buf.readVector(dhcp_buf, buf.getLength() - buf.getPosition());
+    Pkt4Ptr rcvd_pkt(new Pkt4(&dhcp_buf[0], dhcp_buf.size()));
+    ASSERT_TRUE(rcvd_pkt);
+
+    // Parse the packet.
+    ASSERT_NO_THROW(rcvd_pkt->unpack());
+
+    // Check if the received message is correct.
+    testRcvdMessage(rcvd_pkt);
+}
+
+// This test verifies correctness of reception of the DHCP packet over
+// the BPF device, whereby all IP stack headers are decoded by hand.
+TEST_F(PktFilterBPFTest, DISABLED_receive) {
+
+    // Packet will be received over loopback interface.
+    Iface iface(ifname_, ifindex_);
+    iface.flag_loopback_ = true;
+    IOAddress addr("127.0.0.1");
+
+    // Create an instance of the class which we are testing.
+    PktFilterBPF pkt_filter;
+    // Open socket. We don't check the fallback socket here because
+    // that has been checked in the openSocket test already.
+    sock_info_ = pkt_filter.openSocket(iface, addr, PORT, false, false);
+    ASSERT_GE(sock_info_.sockfd_, 0);
+
+    // Send DHCPv4 message to the local loopback address and server's port.
+    sendMessage();
+
+    // Receive the packet using BPF packet filter.
+    Pkt4Ptr rcvd_pkt;
+    ASSERT_NO_THROW(rcvd_pkt = pkt_filter.receive(iface, sock_info_));
+    // Check that the packet has been correctly received.
+    ASSERT_TRUE(rcvd_pkt);
+
+    // Parse the packet.
+    ASSERT_NO_THROW(rcvd_pkt->unpack());
+
+    // Check if the received message is correct.
+    testRcvdMessage(rcvd_pkt);
+}
+
+} // anonymous namespace

+ 1 - 1
src/lib/dhcp/tests/pkt_filter_test_stub.cc

@@ -28,7 +28,7 @@ PktFilterTestStub::isDirectResponseSupported() const {
 }
 
 SocketInfo
-PktFilterTestStub::openSocket(const Iface&,
+PktFilterTestStub::openSocket(Iface&,
            const isc::asiolink::IOAddress& addr,
            const uint16_t port, const bool, const bool) {
     return (SocketInfo(addr, port, 0));

+ 1 - 1
src/lib/dhcp/tests/pkt_filter_test_stub.h

@@ -63,7 +63,7 @@ public:
     ///
     /// @return A SocketInfo structure with the socket descriptor set to 0. The
     /// fallback socket descriptor is set to a negative value.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const bool receive_bcast,

+ 2 - 2
src/lib/dhcp/tests/pkt_filter_test_utils.cc

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
@@ -172,7 +172,7 @@ PktFilterStub::isDirectResponseSupported() const {
 }
 
 SocketInfo
-PktFilterStub::openSocket(const Iface&,
+PktFilterStub::openSocket(Iface&,
            const isc::asiolink::IOAddress& addr,
            const uint16_t port, const bool, const bool) {
     return (SocketInfo(addr, port, 0));

+ 1 - 1
src/lib/dhcp/tests/pkt_filter_test_utils.h

@@ -127,7 +127,7 @@ public:
     ///
     /// @return A SocketInfo structure with the socket descriptor set to 0. The
     /// fallback socket descriptor is set to a negative value.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const bool receive_bcast,