Browse Source

[2893] Implemented support for BPF on ethernet and local loopback iface.

Marcin Siodelski 10 years ago
parent
commit
495a5731c9

+ 15 - 6
src/lib/dhcp/iface_mgr.cc

@@ -55,11 +55,18 @@ Iface::Iface(const std::string& name, int ifindex)
     :name_(name), ifindex_(ifindex), mac_len_(0), hardware_type_(0),
     :name_(name), ifindex_(ifindex), mac_len_(0), hardware_type_(0),
      flag_loopback_(false), flag_up_(false), flag_running_(false),
      flag_loopback_(false), flag_up_(false), flag_running_(false),
      flag_multicast_(false), flag_broadcast_(false), flags_(0),
      flag_multicast_(false), flag_broadcast_(false), flags_(0),
-     inactive4_(false), inactive6_(false), read_buffer_()
+     inactive4_(false), inactive6_(false), read_buffer_(NULL),
+     read_buffer_size_(0)
 {
 {
     memset(mac_, 0, sizeof(mac_));
     memset(mac_, 0, sizeof(mac_));
 }
 }
 
 
+Iface::~Iface() {
+    if (read_buffer_ != NULL) {
+        free(read_buffer_);
+    }
+}
+
 void
 void
 Iface::closeSockets() {
 Iface::closeSockets() {
     // Close IPv4 sockets.
     // Close IPv4 sockets.
@@ -167,12 +174,14 @@ bool Iface::delSocket(const uint16_t sockfd) {
     return (false); // socket not found
     return (false); // socket not found
 }
 }
 
 
-uint8_t*
-Iface::getReadBufferPtr() {
-    if (read_buffer_.empty()) {
-        return (NULL);
+void
+Iface::resizeReadBuffer(const size_t new_size) {
+    read_buffer_size_ = new_size;
+    read_buffer_ = static_cast<uint8_t*>(realloc(read_buffer_,
+                                                 read_buffer_size_));
+    if (read_buffer_ == NULL) {
+        read_buffer_size_ = 0;
     }
     }
-    return (static_cast<uint8_t*>(&read_buffer_[0]));
 }
 }
 
 
 IfaceMgr::IfaceMgr()
 IfaceMgr::IfaceMgr()

+ 13 - 17
src/lib/dhcp/iface_mgr.h

@@ -22,7 +22,6 @@
 #include <dhcp/pkt6.h>
 #include <dhcp/pkt6.h>
 #include <dhcp/pkt_filter.h>
 #include <dhcp/pkt_filter.h>
 #include <dhcp/pkt_filter6.h>
 #include <dhcp/pkt_filter6.h>
-#include <dhcp/option.h>
 
 
 #include <boost/function.hpp>
 #include <boost/function.hpp>
 #include <boost/noncopyable.hpp>
 #include <boost/noncopyable.hpp>
@@ -154,6 +153,11 @@ public:
     /// @param ifindex interface index (unique integer identifier)
     /// @param ifindex interface index (unique integer identifier)
     Iface(const std::string& name, int ifindex);
     Iface(const std::string& name, int ifindex);
 
 
+    /// @brief Destructor.
+    ///
+    /// Deallocates the socket read buffer.
+    ~Iface();
+
     /// @brief Closes all open sockets on interface.
     /// @brief Closes all open sockets on interface.
     void closeSockets();
     void closeSockets();
 
 
@@ -335,34 +339,23 @@ public:
     /// The returned pointer is only valid during the lifetime of the
     /// The returned pointer is only valid during the lifetime of the
     /// object which returns it or until the buffer is resized.
     /// object which returns it or until the buffer is resized.
     /// This function is meant to be used with socket API to gather
     /// This function is meant to be used with socket API to gather
-    /// data from the socket. In order to process the read data,
-    /// the @c getReadBuffer function should be used.
+    /// data from the socket.
     ///
     ///
     /// @return Pointer to the first element of the read buffer or
     /// @return Pointer to the first element of the read buffer or
     /// NULL if the buffer is empty.
     /// NULL if the buffer is empty.
-    uint8_t* getReadBufferPtr();
-
-    /// @brief Returns reference to the buffer used for data reading.
-    ///
-    /// The returned reference is only valid during the lifetime of the
-    /// object which returns it or until the buffer is resized.
-    ///
-    /// @return Reference to the read buffer.
-    const OptionBuffer& getReadBuffer() const {
+    uint8_t* getReadBufferPtr() const {
         return (read_buffer_);
         return (read_buffer_);
     }
     }
 
 
     /// @brief Returns the current size of the socket read buffer.
     /// @brief Returns the current size of the socket read buffer.
     size_t getReadBufferSize() const {
     size_t getReadBufferSize() const {
-        return (read_buffer_.size());
+        return (read_buffer_size_);
     }
     }
 
 
     /// @brief Resizes the socket read buffer.
     /// @brief Resizes the socket read buffer.
     ///
     ///
     /// @param new_size New size of the buffer.
     /// @param new_size New size of the buffer.
-    void resizeReadBuffer(const size_t new_size) {
-        read_buffer_.resize(new_size);
-    }
+    void resizeReadBuffer(const size_t new_size);
 
 
 protected:
 protected:
     /// Socket used to send data.
     /// Socket used to send data.
@@ -428,7 +421,10 @@ public:
     /// This buffer may be pre-allocated when the socket on the interface
     /// This buffer may be pre-allocated when the socket on the interface
     /// is being opened. The functions which read the data from the socket
     /// is being opened. The functions which read the data from the socket
     /// may use this buffer as a storage for the data being read.
     /// may use this buffer as a storage for the data being read.
-    OptionBuffer read_buffer_;
+    uint8_t* read_buffer_;
+
+    /// @brief Allocated size of the read buffer.
+    size_t read_buffer_size_;
 };
 };
 
 
 /// @brief This type describes the callback function invoked when error occurs
 /// @brief This type describes the callback function invoked when error occurs

+ 2 - 2
src/lib/dhcp/pkt_filter.h

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // purpose with or without fee is hereby granted, provided that the above
@@ -86,7 +86,7 @@ public:
     /// @param send_bcast configure socket to send broadcast messages.
     /// @param send_bcast configure socket to send broadcast messages.
     ///
     ///
     /// @return A structure describing a primary and fallback socket.
     /// @return A structure describing a primary and fallback socket.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const uint16_t port,
                                   const bool receive_bcast,
                                   const bool receive_bcast,

+ 181 - 38
src/lib/dhcp/pkt_filter_bpf.cc

@@ -30,6 +30,10 @@ using namespace isc::dhcp;
 /// @brief Maximum number of attempts to open BPF device.
 /// @brief Maximum number of attempts to open BPF device.
 const unsigned int MAX_BPF_OPEN_ATTEMPTS = 100;
 const unsigned int MAX_BPF_OPEN_ATTEMPTS = 100;
 
 
+/// @brief Length of the header containing the address family for the packet
+/// received on local loopback interface.
+const unsigned int BPF_LOCAL_LOOPBACK_HEADER_LEN = 4;
+
 /// The following structure defines a Berkeley Packet Filter program to perform
 /// The following structure defines a Berkeley Packet Filter program to perform
 /// packet filtering. The program operates on Ethernet packets.  To help with
 /// packet filtering. The program operates on Ethernet packets.  To help with
 /// interpretation of the program, for the types of Ethernet packets we are
 /// interpretation of the program, for the types of Ethernet packets we are
@@ -49,7 +53,7 @@ const unsigned int MAX_BPF_OPEN_ATTEMPTS = 100;
 /// @todo We may want to extend the filter to receive packets sent
 /// @todo We may want to extend the filter to receive packets sent
 /// to the particular IP address assigned to the interface or
 /// to the particular IP address assigned to the interface or
 /// broadcast address.
 /// broadcast address.
-struct bpf_insn dhcp_sock_filter [] = {
+struct bpf_insn ethernet_ip_udp_filter [] = {
     // Make sure this is an IP packet: check the half-word (two bytes)
     // Make sure this is an IP packet: check the half-word (two bytes)
     // at offset 12 in the packet (the Ethernet packet type).  If it
     // at offset 12 in the packet (the Ethernet packet type).  If it
     // is, advance to the next instruction.  If not, advance 8
     // is, advance to the next instruction.  If not, advance 8
@@ -61,7 +65,8 @@ struct bpf_insn dhcp_sock_filter [] = {
     // Make sure it's a UDP packet.  The IP protocol is at offset
     // Make sure it's a UDP packet.  The IP protocol is at offset
     // 9 in the IP header so, adding the Ethernet packet header size
     // 9 in the IP header so, adding the Ethernet packet header size
     // of 14 bytes gives an absolute byte offset in the packet of 23.
     // of 14 bytes gives an absolute byte offset in the packet of 23.
-    BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ETHERNET_HEADER_LEN + IP_PROTO_TYPE_OFFSET),
+    BPF_STMT(BPF_LD + BPF_B + BPF_ABS,
+             ETHERNET_HEADER_LEN + IP_PROTO_TYPE_OFFSET),
     BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 6),
     BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 6),
 
 
     // Make sure this isn't a fragment by checking that the fragment
     // Make sure this isn't a fragment by checking that the fragment
@@ -97,6 +102,61 @@ struct bpf_insn dhcp_sock_filter [] = {
     BPF_STMT(BPF_RET + BPF_K, 0),
     BPF_STMT(BPF_RET + BPF_K, 0),
 };
 };
 
 
+/// The following structure defines a BPF program to perform packet filtering
+/// on local loopback interface. The packets received on this interface do not
+/// contain the regular link-layer header, but rather a 4-byte long pseudo
+/// header containing the address family. The remainder of the packet contains
+/// IP header, UDP header and a DHCP message.
+struct bpf_insn loopback_ip_udp_filter [] = {
+    // Make sure this is an IP packet. The pseudo header comprises a 4-byte
+    // long value identifying the address family, which should be set to
+    // AF_INET.
+    BPF_STMT(BPF_LD + BPF_W + BPF_ABS, 0),
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, htonl(AF_INET), 0, 8),
+
+    // Make sure it's a UDP packet.  The IP protocol is at offset
+    // 9 in the IP header so, adding the pseudo header size 4 bytes
+    // gives an absolute byte offset in the packet of 13.
+    BPF_STMT(BPF_LD + BPF_B + BPF_ABS,
+             BPF_LOCAL_LOOPBACK_HEADER_LEN + IP_PROTO_TYPE_OFFSET),
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 6),
+
+    // Make sure this isn't a fragment by checking that the fragment
+    // offset field in the IP header is zero.  This field is the
+    // least-significant 13 bits in the bytes at offsets 6 and 7 in
+    // the IP header, so the half-word at offset 10 (6 + size of
+    // pseudo header) is loaded and an appropriate mask applied.
+    BPF_STMT(BPF_LD + BPF_H + BPF_ABS,
+             BPF_LOCAL_LOOPBACK_HEADER_LEN + IP_FLAGS_OFFSET),
+    BPF_JUMP(BPF_JMP + BPF_JSET + BPF_K, 0x1fff, 4, 0),
+
+    // Get the IP header length.  This is achieved by the following
+    // (special) instruction that, given the offset of the start
+    // of the IP header (offset 4) loads the IP header length.
+    BPF_STMT(BPF_LDX + BPF_B + BPF_MSH, BPF_LOCAL_LOOPBACK_HEADER_LEN),
+
+    // Make sure it's to the right port.  The following instruction
+    // adds the previously extracted IP header length to the given
+    // offset to locate the correct byte.  The given offset of 6
+    // comprises the length of the pseudo header (4) plus the offset
+    // of the UDP destination port (2) within the UDP header.
+    BPF_STMT(BPF_LD + BPF_H + BPF_IND,
+             BPF_LOCAL_LOOPBACK_HEADER_LEN + UDP_DEST_PORT),
+    // The following instruction tests against the default DHCP server port,
+    // but the actual port is set in PktFilterBPF::openSocket().
+    // N.B. The code in that method assumes that this instruction is at
+    // offset 8 in the program.  If this is changed, openSocket() must be
+    // updated.
+    BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DHCP4_SERVER_PORT, 0, 1),
+
+    // If we passed all the tests, ask for the whole packet.
+    BPF_STMT(BPF_RET + BPF_K, (u_int)-1),
+
+    // Otherwise, drop it.
+    BPF_STMT(BPF_RET + BPF_K, 0),
+};
+
+
 }
 }
 
 
 using namespace isc::util;
 using namespace isc::util;
@@ -105,7 +165,7 @@ namespace isc {
 namespace dhcp {
 namespace dhcp {
 
 
 SocketInfo
 SocketInfo
-PktFilterBPF::openSocket(const Iface& iface,
+PktFilterBPF::openSocket(Iface& iface,
                          const isc::asiolink::IOAddress& addr,
                          const isc::asiolink::IOAddress& addr,
                          const uint16_t port, const bool,
                          const uint16_t port, const bool,
                          const bool) {
                          const bool) {
@@ -183,13 +243,30 @@ PktFilterBPF::openSocket(const Iface& iface,
                   " buffer legth for reads from BPF device");
                   " buffer legth for reads from BPF device");
     }
     }
 
 
-    // Configure the BPF program to receive packets on the specified port.
-    dhcp_sock_filter[8].k = port;
+    if (buf_len < sizeof(bpf_hdr)) {
+        isc_throw(SocketConfigError, "read buffer length returned by the"
+                  " kernel for the BPF device associated with the interface"
+                  << iface.getName() << " is lower than the BPF header"
+                  " length: this condition is impossible unless the"
+                  " operating system is really broken!")
+    }
 
 
     // Set the filter program so as we only get packets we are interested in.
     // Set the filter program so as we only get packets we are interested in.
     struct bpf_program prog;
     struct bpf_program prog;
-    prog.bf_insns = dhcp_sock_filter;
-    prog.bf_len = sizeof(dhcp_sock_filter) / sizeof(struct bpf_insn);
+    memset(&prog, 0, sizeof(bpf_program));
+    if (iface.flag_loopback_) {
+        prog.bf_insns = loopback_ip_udp_filter;
+        prog.bf_len = sizeof(loopback_ip_udp_filter) / sizeof(struct bpf_insn);
+
+    } else {
+        prog.bf_insns = ethernet_ip_udp_filter;
+        prog.bf_len = sizeof(ethernet_ip_udp_filter) / sizeof(struct bpf_insn);
+    }
+
+    // Configure the BPF program to receive packets on the specified port.
+    prog.bf_insns[8].k = port;
+
+    // Actually set the filter program for the device.
     if (ioctl(sock, BIOCSETF, &prog) < 0) {
     if (ioctl(sock, BIOCSETF, &prog) < 0) {
         close(fallback);
         close(fallback);
         close(sock);
         close(sock);
@@ -197,15 +274,33 @@ PktFilterBPF::openSocket(const Iface& iface,
                   " program");
                   " program");
     }
     }
 
 
-    // Everything is ok, return the socket (BPF device descriptor) to
-    // the caller.
+    // Configure the BPF device to use the immediate mode. This ensures
+    // that the read function returns immediately, instead of waiting
+    // for the kernel to fill up the buffer, which would likely cause
+    // read hangs.
+    int flag = 1;
+    if (ioctl(sock, BIOCIMMEDIATE, &flag) < 0) {
+        close(fallback);
+        close(sock);
+        isc_throw(SocketConfigError, "Failed to set promiscious mode for"
+                  " BPF device");
+    }
+
+    // Everything is ok, allocate the read buffer and return the socket
+    // (BPF device descriptor) to the caller.
+    iface.resizeReadBuffer(buf_len);
     return (SocketInfo(addr, port, sock, fallback));
     return (SocketInfo(addr, port, sock, fallback));
 }
 }
 
 
 Pkt4Ptr
 Pkt4Ptr
-PktFilterBPF::receive(const Iface&/* iface */, const SocketInfo& /*socket_info*/) {
-  return (Pkt4Ptr());
-  /*    uint8_t raw_buf[IfaceMgr::RCVBUFSIZE];
+PktFilterBPF::receive(const Iface& iface, const SocketInfo& socket_info) {
+    // When using BPF, the read buffer must be allocated for the interface.
+    // If it is not allocated, it is a programmatic error.
+    if (iface.getReadBufferSize() == 0) {
+        isc_throw(SocketConfigError, "socket read buffer not allocated"
+                  " for the interface: " << iface.getName());
+    }
+
     // First let's get some data from the fallback socket. The data will be
     // First let's get some data from the fallback socket. The data will be
     // discarded but we don't want the socket buffer to bloat. We get the
     // discarded but we don't want the socket buffer to bloat. We get the
     // packets from the socket in loop but most of the time the loop will
     // packets from the socket in loop but most of the time the loop will
@@ -222,22 +317,36 @@ PktFilterBPF::receive(const Iface&/* iface */, const SocketInfo& /*socket_info*/
     // when the DHCP server is idle.
     // when the DHCP server is idle.
     int datalen;
     int datalen;
     do {
     do {
-        datalen = recv(socket_info.fallbackfd_, raw_buf, sizeof(raw_buf), 0);
+        datalen = recv(socket_info.fallbackfd_, iface.getReadBufferPtr(),
+                       iface.getReadBufferSize(), 0);
     } while (datalen > 0);
     } while (datalen > 0);
 
 
     // Now that we finished getting data from the fallback socket, we
     // Now that we finished getting data from the fallback socket, we
     // have to get the data from the raw socket too.
     // have to get the data from the raw socket too.
-    int data_len = read(socket_info.sockfd_, raw_buf, sizeof(raw_buf));
+    int data_len = read(socket_info.sockfd_, iface.getReadBufferPtr(),
+                        iface.getReadBufferSize());
     // If negative value is returned by read(), it indicates that an
     // If negative value is returned by read(), it indicates that an
     // error occurred. If returned value is 0, no data was read from the
     // error occurred. If returned value is 0, no data was read from the
-    // socket. In both cases something has gone wrong, because we expect
+    // socket. In both cases something has gone wrong, because we expect
     // that a chunk of data is there. We signal the lack of data by
     // that a chunk of data is there. We signal the lack of data by
     // returning an empty packet.
     // returning an empty packet.
     if (data_len <= 0) {
     if (data_len <= 0) {
         return Pkt4Ptr();
         return Pkt4Ptr();
     }
     }
 
 
-    InputBuffer buf(raw_buf, data_len);
+    struct bpf_hdr bpfh;
+    memcpy(static_cast<void*>(&bpfh),
+           static_cast<void*>(iface.getReadBufferPtr()),
+           sizeof(bpfh));
+    if (bpfh.bh_hdrlen >= data_len) {
+        isc_throw(SocketReadError, "packet received from the BPF device"
+                  << " attached to interface " << iface.getName()
+                  << " is truncated");
+    }
+
+    InputBuffer buf(iface.getReadBufferPtr() + bpfh.bh_hdrlen,
+                    data_len - bpfh.bh_hdrlen);
+
 
 
     // @todo: This is awkward way to solve the chicken and egg problem
     // @todo: This is awkward way to solve the chicken and egg problem
     // whereby we don't know the offset where DHCP data start in the
     // whereby we don't know the offset where DHCP data start in the
@@ -253,8 +362,33 @@ PktFilterBPF::receive(const Iface&/* iface */, const SocketInfo& /*socket_info*/
     // in some more elegant way.
     // in some more elegant way.
     Pkt4Ptr dummy_pkt = Pkt4Ptr(new Pkt4(DHCPDISCOVER, 0));
     Pkt4Ptr dummy_pkt = Pkt4Ptr(new Pkt4(DHCPDISCOVER, 0));
 
 
-    // Decode ethernet, ip and udp headers.
-    decodeEthernetHeader(buf, dummy_pkt);
+    // On local loopback interface the ethernet header is not present.
+    // Instead, there is a 4-byte long pseudo header containing the
+    // address family in the host byte order.
+    if (iface.flag_loopback_) {
+        if (buf.getLength() < BPF_LOCAL_LOOPBACK_HEADER_LEN) {
+            isc_throw(SocketReadError, "packet received on local loopback"
+                      " interface " << iface.getName() << " doesn't contain"
+                      " the pseudo header with the address family type");
+        }
+        // Advance to the position of the IP header. We don't check the
+        // contents of the pseudo header because the BPF filter should have
+        // filtered out the packets with address family other than AF_INET.
+        buf.setPosition(BPF_LOCAL_LOOPBACK_HEADER_LEN);
+
+        // Since we don't decode the real link-layer header we need to
+        // supply the hardware address ourselves.
+        dummy_pkt->setLocalHWAddr(HWAddrPtr(new HWAddr()));
+        dummy_pkt->setRemoteHWAddr(HWAddrPtr(new HWAddr()));
+
+    } else {
+        // If we are on the interface other than local loopback, assume
+        // the ethernet header. For now we don't support any other data
+        // link layer.
+        decodeEthernetHeader(buf, dummy_pkt);
+    }
+
+    // Decode IP/UDP headers.
     decodeIpUdpHeader(buf, dummy_pkt);
     decodeIpUdpHeader(buf, dummy_pkt);
 
 
     // Read the DHCP data.
     // Read the DHCP data.
@@ -275,15 +409,13 @@ PktFilterBPF::receive(const Iface&/* iface */, const SocketInfo& /*socket_info*/
     pkt->setLocalHWAddr(dummy_pkt->getLocalHWAddr());
     pkt->setLocalHWAddr(dummy_pkt->getLocalHWAddr());
     pkt->setRemoteHWAddr(dummy_pkt->getRemoteHWAddr());
     pkt->setRemoteHWAddr(dummy_pkt->getRemoteHWAddr());
 
 
-    return (pkt); */
+    return (pkt);
 }
 }
 
 
 int
 int
-PktFilterBPF::send(const Iface& /*iface*/, uint16_t /*sockfd*/, const Pkt4Ptr& /*pkt*/) {
-
-  return 0;
+PktFilterBPF::send(const Iface& iface, uint16_t sockfd, const Pkt4Ptr& pkt) {
 
 
-  /*    OutputBuffer buf(14);
+    OutputBuffer buf(14);
 
 
     // Some interfaces may have no HW address - e.g. loopback interface.
     // Some interfaces may have no HW address - e.g. loopback interface.
     // For these interfaces the HW address length is 0. If this is the case,
     // For these interfaces the HW address length is 0. If this is the case,
@@ -296,11 +428,18 @@ PktFilterBPF::send(const Iface& /*iface*/, uint16_t /*sockfd*/, const Pkt4Ptr& /
         pkt->setLocalHWAddr(hwaddr);
         pkt->setLocalHWAddr(hwaddr);
     }
     }
 
 
-
-    // Ethernet frame header.
-    // Note that we don't validate whether HW addresses in 'pkt'
-    // are valid because they are checked by the function called.
-    writeEthernetHeader(pkt, buf);
+    /// Local loopback interface requires special treatment. It doesn't
+    /// use the ethernet header but rather a 4-bytes long pseudo header
+    /// holding an address family type (see bpf.c in OS sources).
+    if (iface.flag_loopback_) {
+        writeAFPseudoHeader(AF_INET, buf);
+
+    } else {
+        // Ethernet frame header.
+        // Note that we don't validate whether HW addresses in 'pkt'
+        // are valid because they are validated by the function called.
+        writeEthernetHeader(pkt, buf);
+    }
 
 
     // IP and UDP header
     // IP and UDP header
     writeIpUdpHeader(pkt, buf);
     writeIpUdpHeader(pkt, buf);
@@ -308,24 +447,28 @@ PktFilterBPF::send(const Iface& /*iface*/, uint16_t /*sockfd*/, const Pkt4Ptr& /
     // DHCPv4 message
     // DHCPv4 message
     buf.writeData(pkt->getBuffer().getData(), pkt->getBuffer().getLength());
     buf.writeData(pkt->getBuffer().getData(), pkt->getBuffer().getLength());
 
 
-    sockaddr_ll sa;
-    sa.sll_family = AF_PACKET;
-    sa.sll_ifindex = iface.getIndex();
-    sa.sll_protocol = htons(ETH_P_IP);
-    sa.sll_halen = 6;
-
-    int result = sendto(sockfd, buf.getData(), buf.getLength(), 0,
-                        reinterpret_cast<const struct sockaddr*>(&sa),
-                        sizeof(sockaddr_ll));
+    int result = write(sockfd, buf.getData(), buf.getLength());
     if (result < 0) {
     if (result < 0) {
+        std::cout << strerror(errno) << std::endl;
         isc_throw(SocketWriteError, "failed to send DHCPv4 packet, errno="
         isc_throw(SocketWriteError, "failed to send DHCPv4 packet, errno="
                   << errno << " (check errno.h)");
                   << errno << " (check errno.h)");
     }
     }
 
 
     return (0);
     return (0);
-  */
 }
 }
 
 
+void
+PktFilterBPF::writeAFPseudoHeader(const uint32_t address_family,
+                                  util::OutputBuffer& out_buf) {
+    // Copy address family to the temporary buffer and preserve the
+    // byte order.
+    uint8_t af_buf[4];
+    memcpy(static_cast<void*>(af_buf),
+           static_cast<const void*>(&address_family),
+           sizeof(af_buf));
+    // Write the data into the buffer.
+    out_buf.writeData(af_buf, sizeof(af_buf));
+}
 
 
 } // end of isc::dhcp namespace
 } // end of isc::dhcp namespace
 } // end of isc namespace
 } // end of isc namespace

+ 22 - 1
src/lib/dhcp/pkt_filter_bpf.h

@@ -49,7 +49,7 @@ public:
     /// @param send_bcast Configure socket to send broadcast messages.
     /// @param send_bcast Configure socket to send broadcast messages.
     ///
     ///
     /// @return A structure describing a primary and fallback socket.
     /// @return A structure describing a primary and fallback socket.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const uint16_t port,
                                   const bool receive_bcast,
                                   const bool receive_bcast,
@@ -73,6 +73,27 @@ public:
     virtual int send(const Iface& iface, uint16_t sockfd,
     virtual int send(const Iface& iface, uint16_t sockfd,
                      const Pkt4Ptr& pkt);
                      const Pkt4Ptr& pkt);
 
 
+private:
+
+    /// @brief Writes pseudo header containing an address family into a buffer.
+    ///
+    /// BPF utilizes the pseudo headers to pass the ancillary data between the
+    /// kernel and the application. For example, when the packet is to be sent
+    /// over the local loopback interface the pseudo header must be added before
+    /// the network layer header to indicate the address family. Other link
+    /// layer header (e.g. ethernet) is not used for local loopback interface.
+    ///
+    /// The header written by this method consists of 4 bytes and contains the
+    /// address family value in host byte order. See sys/socket.h for the
+    /// address family values. Typically it will be AF_INET.
+    ///
+    /// This function doesn't throw.
+    ///
+    /// @param address_family Address family (e.g. AF_INET).
+    /// @param [out] out_buf buffer where a header is written.
+    void writeAFPseudoHeader(const uint32_t address_family,
+                             util::OutputBuffer& out_buf);
+
 };
 };
 
 
 } // namespace isc::dhcp
 } // namespace isc::dhcp

+ 1 - 1
src/lib/dhcp/pkt_filter_inet.cc

@@ -31,7 +31,7 @@ PktFilterInet::PktFilterInet()
 }
 }
 
 
 SocketInfo
 SocketInfo
-PktFilterInet::openSocket(const Iface& iface,
+PktFilterInet::openSocket(Iface& iface,
                           const isc::asiolink::IOAddress& addr,
                           const isc::asiolink::IOAddress& addr,
                           const uint16_t port,
                           const uint16_t port,
                           const bool receive_bcast,
                           const bool receive_bcast,

+ 2 - 2
src/lib/dhcp/pkt_filter_inet.h

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // purpose with or without fee is hereby granted, provided that the above
@@ -55,7 +55,7 @@ public:
     /// @return A structure describing a primary and fallback socket.
     /// @return A structure describing a primary and fallback socket.
     /// @throw isc::dhcp::SocketConfigError if error occurs when opening,
     /// @throw isc::dhcp::SocketConfigError if error occurs when opening,
     /// binding or configuring the socket.
     /// binding or configuring the socket.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const uint16_t port,
                                   const bool receive_bcast,
                                   const bool receive_bcast,

+ 2 - 2
src/lib/dhcp/pkt_filter_lpf.cc

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // purpose with or without fee is hereby granted, provided that the above
@@ -103,7 +103,7 @@ namespace isc {
 namespace dhcp {
 namespace dhcp {
 
 
 SocketInfo
 SocketInfo
-PktFilterLPF::openSocket(const Iface& iface,
+PktFilterLPF::openSocket(Iface& iface,
                          const isc::asiolink::IOAddress& addr,
                          const isc::asiolink::IOAddress& addr,
                          const uint16_t port, const bool,
                          const uint16_t port, const bool,
                          const bool) {
                          const bool) {

+ 2 - 2
src/lib/dhcp/pkt_filter_lpf.h

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // purpose with or without fee is hereby granted, provided that the above
@@ -49,7 +49,7 @@ public:
     /// @param send_bcast Configure socket to send broadcast messages.
     /// @param send_bcast Configure socket to send broadcast messages.
     ///
     ///
     /// @return A structure describing a primary and fallback socket.
     /// @return A structure describing a primary and fallback socket.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const uint16_t port,
                                   const bool receive_bcast,
                                   const bool receive_bcast,

+ 6 - 6
src/lib/dhcp/tests/iface_mgr_unittest.cc

@@ -80,13 +80,13 @@ TEST(IfaceTest, readBuffer) {
         buf_ptr[i] = i;
         buf_ptr[i] = i;
     }
     }
 
 
-    // Validate the data.
-    const OptionBuffer& buf = iface.getReadBuffer();
-    ASSERT_EQ(256, buf.size());
-    for (int i = 0; i < buf.size(); ++i) {
+    // Get the pointer again and validate the data.
+    buf_ptr = iface.getReadBufferPtr();
+    ASSERT_EQ(256, iface.getReadBufferSize());
+    for (int i = 0; i < iface.getReadBufferSize(); ++i) {
         // Use assert so as it fails on the first failure, no need
         // Use assert so as it fails on the first failure, no need
         // to continue further checks.
         // to continue further checks.
-        ASSERT_EQ(i, buf[i]);
+        ASSERT_EQ(i, buf_ptr[i]);
     }
     }
 }
 }
 
 
@@ -130,7 +130,7 @@ public:
     /// @param iface An interface on which the socket is to be opened.
     /// @param iface An interface on which the socket is to be opened.
     /// @param addr An address to which the socket is to be bound.
     /// @param addr An address to which the socket is to be bound.
     /// @param port A port to which the socket is to be bound.
     /// @param port A port to which the socket is to be bound.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const uint16_t port,
                                   const bool join_multicast,
                                   const bool join_multicast,

+ 46 - 34
src/lib/dhcp/tests/pkt_filter_bpf_unittest.cc

@@ -23,6 +23,7 @@
 
 
 #include <gtest/gtest.h>
 #include <gtest/gtest.h>
 
 
+#include <net/bpf.h>
 #include <sys/socket.h>
 #include <sys/socket.h>
 
 
 using namespace isc::asiolink;
 using namespace isc::asiolink;
@@ -34,7 +35,7 @@ namespace {
 /// Port number used by tests.
 /// Port number used by tests.
 const uint16_t PORT = 10067;
 const uint16_t PORT = 10067;
 /// Size of the buffer holding received packets.
 /// Size of the buffer holding received packets.
-const size_t RECV_BUF_SIZE = 2048;
+const size_t RECV_BUF_SIZE = 4096;
 
 
 // Test fixture class inherits from the class common for all packet
 // Test fixture class inherits from the class common for all packet
 // filter tests.
 // filter tests.
@@ -68,6 +69,7 @@ TEST_F(PktFilterBPFTest, isDirectResponseSupported) {
 TEST_F(PktFilterBPFTest, DISABLED_openSocket) {
 TEST_F(PktFilterBPFTest, DISABLED_openSocket) {
     // Create object representing loopback interface.
     // Create object representing loopback interface.
     Iface iface(ifname_, ifindex_);
     Iface iface(ifname_, ifindex_);
+    iface.flag_loopback_ = true;
     // Set loopback address.
     // Set loopback address.
     IOAddress addr("127.0.0.1");
     IOAddress addr("127.0.0.1");
 
 
@@ -81,41 +83,29 @@ TEST_F(PktFilterBPFTest, DISABLED_openSocket) {
     ASSERT_GE(sock_info_.sockfd_, 0);
     ASSERT_GE(sock_info_.sockfd_, 0);
     // Check that the fallback socket has been opened too.
     // Check that the fallback socket has been opened too.
     ASSERT_GE(sock_info_.fallbackfd_, 0);
     ASSERT_GE(sock_info_.fallbackfd_, 0);
-
-    /*    // Verify that the socket belongs to AF_PACKET family.
-    sockaddr_ll sock_address;
-    socklen_t sock_address_len = sizeof(sock_address);
-    ASSERT_EQ(0, getsockname(sock_info_.sockfd_,
-                             reinterpret_cast<sockaddr*>(&sock_address),
-                             &sock_address_len));
-    EXPECT_EQ(AF_PACKET, sock_address.sll_family);
-
-    // Verify that the socket is bound to appropriate interface.
-    EXPECT_EQ(ifindex_, sock_address.sll_ifindex);
-
-    // Verify that the socket has SOCK_RAW type.
-    int sock_type;
-    socklen_t sock_type_len = sizeof(sock_type);
-    ASSERT_EQ(0, getsockopt(sock_info_.sockfd_, SOL_SOCKET, SO_TYPE,
-                            &sock_type, &sock_type_len));
-			    EXPECT_EQ(SOCK_RAW, sock_type); */
 }
 }
 
 
-// This test verifies correctness of sending DHCP packet through the raw
-// socket, whereby all IP stack headers are hand-crafted.
+// This test verifies correctness of sending DHCP packet through the BPF
+// device attached to local loopback interface. Note that this is not exactly
+// the same as sending over the hardware interface (e.g. ethernet) because the
+// packet format is different on local loopback interface when using the
+// BPF. The key difference is that the pseudo header containing address
+// family is sent instead of link-layer header. Ideally we would run this
+// test over the real interface but since we don't know what interfaces
+// are present in the particular system we have to stick to local loopback
+// interface as this one is almost always present.
 TEST_F(PktFilterBPFTest, DISABLED_send) {
 TEST_F(PktFilterBPFTest, DISABLED_send) {
-  /*    // Packet will be sent over loopback interface.
+    // Packet will be sent over loopback interface.
     Iface iface(ifname_, ifindex_);
     Iface iface(ifname_, ifindex_);
+    iface.flag_loopback_ = true;
     IOAddress addr("127.0.0.1");
     IOAddress addr("127.0.0.1");
 
 
     // Create an instance of the class which we are testing.
     // Create an instance of the class which we are testing.
     PktFilterBPF pkt_filter;
     PktFilterBPF pkt_filter;
-    // Open socket. We don't check that the socket has appropriate
-    // options and family set because we have checked that in the
-    // openSocket test already.
 
 
+    // Open BPF device.
     sock_info_ = pkt_filter.openSocket(iface, addr, PORT, false, false);
     sock_info_ = pkt_filter.openSocket(iface, addr, PORT, false, false);
-
+    // Returned descriptor must not be negative. 0 is valid.
     ASSERT_GE(sock_info_.sockfd_, 0);
     ASSERT_GE(sock_info_.sockfd_, 0);
 
 
     // Send the packet over the socket.
     // Send the packet over the socket.
@@ -133,17 +123,37 @@ TEST_F(PktFilterBPFTest, DISABLED_send) {
     // We should receive some data from loopback interface.
     // We should receive some data from loopback interface.
     ASSERT_GT(result, 0);
     ASSERT_GT(result, 0);
 
 
-    // Get the actual data.
+    /// Get the actual data.
     uint8_t rcv_buf[RECV_BUF_SIZE];
     uint8_t rcv_buf[RECV_BUF_SIZE];
-    result = recv(sock_info_.sockfd_, rcv_buf, RECV_BUF_SIZE, 0);
+    result = read(sock_info_.sockfd_, rcv_buf, RECV_BUF_SIZE);
     ASSERT_GT(result, 0);
     ASSERT_GT(result, 0);
 
 
-    Pkt4Ptr dummy_pkt = Pkt4Ptr(new Pkt4(DHCPDISCOVER, 0));
+    // Each packet is prepended with the BPF header structure. We have to
+    // parse this structure to locate the position of the address family
+    // pseudo header.
+    struct bpf_hdr bpfh;
+    memcpy(static_cast<void*>(&bpfh), static_cast<void*>(rcv_buf),
+           sizeof(bpf_hdr));
+    // bh_hdrlen contains the total length of the BPF header, including
+    // alignment. We will use this value to skip over the BPF header and
+    // parse the contents of the packet that we are interested in.
+    uint32_t bpfh_len = bpfh.bh_hdrlen;
+    // Address Family pseudo header contains the address family of the
+    // packet (used for local loopback interface instead of the link-layer
+    // header such as ethernet frame header).
+    uint32_t af = 0;
+    memcpy(static_cast<void*>(&af),
+           static_cast<void*>(rcv_buf + bpfh_len), 4);
+    // Check the value in the pseudo header. If this is incorrect, something
+    // is really broken, so let's exit.
+    ASSERT_EQ(AF_INET, af);
 
 
-    InputBuffer buf(rcv_buf, result);
+    Pkt4Ptr dummy_pkt = Pkt4Ptr(new Pkt4(DHCPDISCOVER, 0));
+    // Create the input buffer from the remainder of the packet. This should
+    // only contain the IP/UDP headers and the DHCP message.
+    InputBuffer buf(rcv_buf + bpfh_len + 4, result - bpfh_len - 4);
+    ASSERT_GE(buf.getLength(), test_message_->len());
 
 
-    // Decode ethernet, ip and udp headers.
-    decodeEthernetHeader(buf, dummy_pkt);
     decodeIpUdpHeader(buf, dummy_pkt);
     decodeIpUdpHeader(buf, dummy_pkt);
 
 
     // Create the DHCPv4 packet from the received data.
     // Create the DHCPv4 packet from the received data.
@@ -156,7 +166,7 @@ TEST_F(PktFilterBPFTest, DISABLED_send) {
     ASSERT_NO_THROW(rcvd_pkt->unpack());
     ASSERT_NO_THROW(rcvd_pkt->unpack());
 
 
     // Check if the received message is correct.
     // Check if the received message is correct.
-    testRcvdMessage(rcvd_pkt); */
+    testRcvdMessage(rcvd_pkt);
 }
 }
 
 
 // This test verifies correctness of reception of the DHCP packet over
 // This test verifies correctness of reception of the DHCP packet over
@@ -165,6 +175,7 @@ TEST_F(PktFilterBPFTest, DISABLED_receive) {
 
 
     // Packet will be received over loopback interface.
     // Packet will be received over loopback interface.
     Iface iface(ifname_, ifindex_);
     Iface iface(ifname_, ifindex_);
+    iface.flag_loopback_ = true;
     IOAddress addr("127.0.0.1");
     IOAddress addr("127.0.0.1");
 
 
     // Create an instance of the class which we are testing.
     // Create an instance of the class which we are testing.
@@ -179,7 +190,8 @@ TEST_F(PktFilterBPFTest, DISABLED_receive) {
     sendMessage();
     sendMessage();
 
 
     // Receive the packet using BPF packet filter.
     // Receive the packet using BPF packet filter.
-    Pkt4Ptr rcvd_pkt = pkt_filter.receive(iface, sock_info_);
+    Pkt4Ptr rcvd_pkt;
+    ASSERT_NO_THROW(rcvd_pkt = pkt_filter.receive(iface, sock_info_));
     // Check that the packet has been correctly received.
     // Check that the packet has been correctly received.
     ASSERT_TRUE(rcvd_pkt);
     ASSERT_TRUE(rcvd_pkt);
 
 

+ 1 - 1
src/lib/dhcp/tests/pkt_filter_test_stub.cc

@@ -28,7 +28,7 @@ PktFilterTestStub::isDirectResponseSupported() const {
 }
 }
 
 
 SocketInfo
 SocketInfo
-PktFilterTestStub::openSocket(const Iface&,
+PktFilterTestStub::openSocket(Iface&,
            const isc::asiolink::IOAddress& addr,
            const isc::asiolink::IOAddress& addr,
            const uint16_t port, const bool, const bool) {
            const uint16_t port, const bool, const bool) {
     return (SocketInfo(addr, port, 0));
     return (SocketInfo(addr, port, 0));

+ 1 - 1
src/lib/dhcp/tests/pkt_filter_test_stub.h

@@ -63,7 +63,7 @@ public:
     ///
     ///
     /// @return A SocketInfo structure with the socket descriptor set to 0. The
     /// @return A SocketInfo structure with the socket descriptor set to 0. The
     /// fallback socket descriptor is set to a negative value.
     /// fallback socket descriptor is set to a negative value.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const uint16_t port,
                                   const bool receive_bcast,
                                   const bool receive_bcast,

+ 2 - 2
src/lib/dhcp/tests/pkt_filter_test_utils.cc

@@ -1,4 +1,4 @@
-// Copyright (C) 2013 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2013-2014 Internet Systems Consortium, Inc. ("ISC")
 //
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // purpose with or without fee is hereby granted, provided that the above
@@ -172,7 +172,7 @@ PktFilterStub::isDirectResponseSupported() const {
 }
 }
 
 
 SocketInfo
 SocketInfo
-PktFilterStub::openSocket(const Iface&,
+PktFilterStub::openSocket(Iface&,
            const isc::asiolink::IOAddress& addr,
            const isc::asiolink::IOAddress& addr,
            const uint16_t port, const bool, const bool) {
            const uint16_t port, const bool, const bool) {
     return (SocketInfo(addr, port, 0));
     return (SocketInfo(addr, port, 0));

+ 1 - 1
src/lib/dhcp/tests/pkt_filter_test_utils.h

@@ -127,7 +127,7 @@ public:
     ///
     ///
     /// @return A SocketInfo structure with the socket descriptor set to 0. The
     /// @return A SocketInfo structure with the socket descriptor set to 0. The
     /// fallback socket descriptor is set to a negative value.
     /// fallback socket descriptor is set to a negative value.
-    virtual SocketInfo openSocket(const Iface& iface,
+    virtual SocketInfo openSocket(Iface& iface,
                                   const isc::asiolink::IOAddress& addr,
                                   const isc::asiolink::IOAddress& addr,
                                   const uint16_t port,
                                   const uint16_t port,
                                   const bool receive_bcast,
                                   const bool receive_bcast,