Commit f70ea49a authored by Davis King's avatar Davis King

Added code to allow different processing nodes to terminate at different

times.
parent c3f09874
......@@ -123,6 +123,7 @@ namespace dlib
const static char ALL_NODES_WAITING = 3;
const static char SENT_MESSAGE = 4;
const static char GOT_MESSAGE = 5;
const static char NODE_TERMINATE = 6;
// ------------------------------------------------------------------------------------
......@@ -202,6 +203,31 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// IMPLEMENTATION OF bsp OBJECT MEMBERS
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void bsp::
close_all_connections_gracefully(
)
{
if (_node_id == 0)
{
// Wait for all the other nodes to terminate before we do anything since
// we are the controller node.
receive();
}
_cons.reset();
while (_cons.move_next())
{
// tell the other end that we are intentionally dropping the connection
serialize(impl::NODE_TERMINATE,_cons.element().value()->stream);
_cons.element().value()->stream.flush();
_cons.element().value()->con->shutdown();
}
check_for_errors();
}
// ----------------------------------------------------------------------------------------
bsp::
......@@ -225,7 +251,7 @@ namespace dlib
unsigned long node_id_,
impl::map_id_to_con& cons_
) :
read_thread_terminated(false),
read_thread_terminated_improperly(false),
outstanding_messages(0),
num_waiting_nodes(0),
buf_not_empty(class_mutex),
......@@ -264,11 +290,11 @@ namespace dlib
if (msg_buffer.size() == 0)
{
send_to_master_node(WAITING_ON_RECEIVE);
while (msg_buffer.size() == 0 && !read_thread_terminated)
while (msg_buffer.size() == 0 && !read_thread_terminated_improperly)
{
buf_not_empty.wait();
}
if (read_thread_terminated)
if (read_thread_terminated_improperly)
{
throw dlib::socket_error("A connection between processing nodes has been lost.");
}
......@@ -355,10 +381,15 @@ namespace dlib
{
try
{
serialize(ALL_NODES_WAITING, _cons.element().value()->stream);
_cons.element().value()->stream.flush();
if (!_cons.element().value()->stream)
throw dlib::error("Error writing data to TCP connection");
// Skip connections to nodes that have already terminated their
// execution.
if (_cons.element().value()->terminated == false)
{
serialize(ALL_NODES_WAITING, _cons.element().value()->stream);
_cons.element().value()->stream.flush();
if (!_cons.element().value()->stream)
throw dlib::error("Error writing data to TCP connection");
}
}
catch (std::exception& e)
{
......@@ -438,6 +469,18 @@ namespace dlib
auto_mutex lock(class_mutex);
--outstanding_messages;
} break;
case NODE_TERMINATE: {
auto_mutex lock(class_mutex);
if (_node_id == 0)
{
// a terminating node is basically the same as a node that waits forever.
_cons[sender_id]->terminated = true;
++num_waiting_nodes;
notify_everyone_if_all_blocked();
}
return;
} break;
}
}
}
......@@ -453,7 +496,7 @@ namespace dlib
}
auto_mutex lock(class_mutex);
read_thread_terminated = true;
read_thread_terminated_improperly = true;
buf_not_empty.signal();
}
......@@ -476,6 +519,9 @@ namespace dlib
)
{
using namespace impl;
if (_cons[target_node_id]->terminated)
throw socket_error("Attempt to send a message to a node that has terminated.");
serialize(MESSAGE_HEADER, _cons[target_node_id]->stream);
serialize(item, _cons[target_node_id]->stream);
_cons[target_node_id]->stream.flush();
......
......@@ -27,14 +27,16 @@ namespace dlib
) :
con(connect(dest.first,dest.second)),
buf(con),
stream(&buf)
stream(&buf),
terminated(false)
{}
bsp_con(
scoped_ptr<connection>& conptr
) :
buf(conptr),
stream(&buf)
stream(&buf),
terminated(false)
{
// make sure we own the connection
conptr.swap(con);
......@@ -43,6 +45,7 @@ namespace dlib
scoped_ptr<connection> con;
sockstreambuf::kernel_2a buf;
std::iostream stream;
bool terminated;
};
typedef dlib::map<unsigned long, scoped_ptr<bsp_con> >::kernel_1a_c map_id_to_con;
......@@ -135,6 +138,20 @@ namespace dlib
BSP computation.
!*/
void receive (
)
/*!
ensures
- simply waits for all other nodes to become blocked
on calls to receive() or to terminate (i.e. waits for
other nodes to be in a state that can't send messages).
!*/
{
int junk;
if (receive(junk))
throw dlib::socket_error("call to receive got an unexpected message");
}
template <typename T>
bool receive (
T& item
......@@ -194,6 +211,14 @@ namespace dlib
impl::map_id_to_con& cons_
);
void close_all_connections_gracefully();
/*!
ensures
- closes all the connections to other nodes and lets them know that
we are terminating normally rather than as the result of some kind
of error.
!*/
bool receive_data (
shared_ptr<std::string>& item,
unsigned long& sending_node_id
......@@ -237,7 +262,7 @@ namespace dlib
rmutex class_mutex; // used to lock any class members touched from more than one thread.
std::string error_message;
bool read_thread_terminated; // true if any of our connections goes down.
bool read_thread_terminated_improperly; // true if any of our connections goes down.
unsigned long outstanding_messages;
unsigned long num_waiting_nodes;
rsignaler buf_not_empty; // used to signal when msg_buffer isn't empty
......@@ -362,7 +387,7 @@ namespace dlib
send_out_connection_orders(cons, hosts);
bsp obj(node_id, cons);
funct(obj);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......@@ -383,7 +408,7 @@ namespace dlib
send_out_connection_orders(cons, hosts);
bsp obj(node_id, cons);
funct(obj,arg1);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......@@ -406,7 +431,7 @@ namespace dlib
send_out_connection_orders(cons, hosts);
bsp obj(node_id, cons);
funct(obj,arg1,arg2);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......@@ -431,7 +456,7 @@ namespace dlib
send_out_connection_orders(cons, hosts);
bsp obj(node_id, cons);
funct(obj,arg1,arg2,arg3);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......@@ -451,7 +476,7 @@ namespace dlib
listen_and_connect_all(node_id, cons, listening_port);
bsp obj(node_id, cons);
funct(obj);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......@@ -471,7 +496,7 @@ namespace dlib
listen_and_connect_all(node_id, cons, listening_port);
bsp obj(node_id, cons);
funct(obj,arg1);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......@@ -493,7 +518,7 @@ namespace dlib
listen_and_connect_all(node_id, cons, listening_port);
bsp obj(node_id, cons);
funct(obj,arg1,arg2);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......@@ -517,7 +542,7 @@ namespace dlib
listen_and_connect_all(node_id, cons, listening_port);
bsp obj(node_id, cons);
funct(obj,arg1,arg2,arg3);
obj.check_for_errors();
obj.close_all_connections_gracefully();
}
// ----------------------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment