#!/bin/env python

"""Testlet to run the internode LatencyBandwidth test."""

import os
import subprocess
import time

import test_utils as util


name = 'Running internode latency/bandwidth test'


def get_node_connections(nodes):
    """Given list of nodes, return list of all possible pairs.

    Each unordered pair appears exactly once as a tuple ``(a, b)`` where
    ``a`` occurs before ``b`` in *nodes*.  Pairs are ordered by the position
    of ``a``, then by the position of ``b``.  Fewer than two nodes yields an
    empty list.
    """

    # Iterative on purpose: a recursive formulation needs one stack frame per
    # node and repeatedly concatenates the partial result lists.
    return [(head, other)
            for (index, head) in enumerate(nodes)
            for other in nodes[index + 1:]]


def _run_command(cmd):
    """Run shell command *cmd*; return ``(status, output)``.

    *output* is the combined stdout/stderr text.  *status* is the process
    exit status (0 on success), or 100 if the process could not be started,
    in which case *output* holds the error message.
    """

    # shell=True keeps the command-string semantics of the os.popen4() call
    # this replaces.  The commands are assembled from cluster configuration,
    # not from external input.
    try:
        proc = subprocess.Popen(cmd, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
        (output, _) = proc.communicate()
    except OSError as err:
        return (100, str(err))
    return (proc.returncode, output)


def _parse_estimate(line):
    """Return the first whitespace-delimited token after ': ' in *line*.

    Returns None if *line* has no ': ' separator or nothing follows it.
    """

    parts = line.split(': ', 1)
    if len(parts) < 2:
        return None
    tokens = parts[1].split()
    if not tokens:
        return None
    return tokens[0]


def _run_pair_tests(logfile, node_pairs, node_stem, obj_file):
    """Run the timing program on every node pair; return the results dict.

    Keys are the pairs from *node_pairs*.  Values are
    ``(bandwidth, latency)`` as ``(float, int)`` on success,
    ``(None, 'fail')`` if mpirun failed, or ``(None, 'bad')`` if its output
    did not contain usable bandwidth and latency figures.
    """

    results = {}
    for pair in node_pairs:
        (n1, n2) = pair
        cmd = ('mpirun -nolocal -np 2 -host %s,%s %s'
               % (node_stem % n1, node_stem % n2, obj_file))
        util.log_print(logfile, cmd + ' ...')

        (status, output) = _run_command(cmd)
        if status:
            util.log_print_nl(logfile, 'ERROR')
            util.log_print(logfile, output)
            results[pair] = (None, 'fail')
            continue

        estimated_bandwidth = None
        estimated_latency = None
        for line in output.splitlines():
            # 'bandwith' (sic): this must match the program's output text.
            # Presumably that is how ctiming.c spells it -- confirm before
            # "correcting" the prefix.
            if line.startswith('Estimated bandwith'):
                estimated_bandwidth = _parse_estimate(line)
            if line.startswith('Estimated latency'):
                estimated_latency = _parse_estimate(line)

        measurement = None
        if estimated_bandwidth is not None and estimated_latency is not None:
            try:
                measurement = (float(estimated_bandwidth),
                               int(estimated_latency))
            except ValueError:
                # Non-numeric figures are reported as 'bad', like missing
                # ones, instead of aborting the whole run.
                measurement = None

        if measurement is None:
            util.log_print_nl(logfile, 'ERROR')
            util.log_print(logfile, output)
            results[pair] = (None, 'bad')
            continue

        results[pair] = measurement
        util.log_print_nl(logfile, 'done')

    return results


def _log_table(logfile, title, node_numbers, results, value_index,
               value_format):
    """Log one lower-triangular table of per-pair results.

    Columns are all of *node_numbers*; rows are *node_numbers* minus the
    first entry.  The cell at (column x, row y) shows ``results[(x, y)]``:
    empty if there is no such entry, the status string if the measurement
    failed, otherwise element *value_index* of the entry rendered with
    *value_format*.
    """

    def log_node_row():
        util.log_print(logfile, 'node#\t')
        for x in node_numbers:
            util.log_print(logfile, '%d\t' % x)
        util.log_print_nl(logfile)

    def log_rule():
        util.log_print(logfile, '\t')
        for x in node_numbers:
            util.log_print(logfile, '-'*7)
        util.log_print_nl(logfile)

    util.log_print_nl(logfile, title)
    log_node_row()
    log_rule()

    for y in node_numbers[1:]:
        util.log_print(logfile, '%d\t|' % y)
        for x in node_numbers:
            val = results.get((x, y))
            if val is None:
                util.log_print(logfile, '\t')
            elif val[0] is None:
                # Failed measurement: second element is 'fail' or 'bad'.
                util.log_print(logfile, '%s\t' % str(val[1]))
            else:
                util.log_print(logfile, value_format % val[value_index])
        util.log_print_nl(logfile)

    log_rule()
    log_node_row()


def test(logfile):
    """Compile ctiming.c, run it between every node pair, log the results.

    Progress, errors and the final bandwidth and latency tables are written
    through the ``util.log_print*`` helpers using *logfile*.

    Returns False if ctiming.c fails to compile, True otherwise.  Failures
    of individual node pairs are shown in the tables ('fail' / 'bad') but do
    not change the return value.
    """

    (cluster, _) = util.get_hostname()

    # compile C code
    obj_file = 'ctiming_%s' % cluster
    cmd = 'mpicc ctiming.c -lm -o %s' % obj_file
    util.log_print_nl(logfile, 'Compiling ctiming.c:')
    util.log_print_nl(logfile, cmd)
    (status, output) = _run_command(cmd)
    if status:
        util.log_print_nl(logfile, 'ERRORS COMPILING')
        util.log_print(logfile, output)
        return False
    util.log_print_nl(logfile)

    try:
        # get all unique pairs of node *numbers*, bad nodes excluded
        nodes = util.get_node_numbers(cluster, strip_bad_nodes=True)
        nodes.sort()
        node_pairs = get_node_connections(nodes)
        node_stem = util.get_cluster_info(cluster)['node_stem']

        # run the tests
        results = _run_pair_tests(logfile, node_pairs, node_stem, obj_file)

        # report; the tables list every node, including the bad ones
        util.log_print_nl(logfile,
                          '\nResults of internode bandwidth/latency test on %s'
                          % cluster)
        node_numbers = util.get_node_numbers(cluster, strip_bad_nodes=False)
        node_numbers.sort()

        _log_table(logfile, 'Bandwidth:', node_numbers, results, 0, '%.1f\t')
        util.log_print_nl(logfile)
        util.log_print_nl(logfile)
        _log_table(logfile, 'Latency:', node_numbers, results, 1, '%d\t')
    finally:
        # Best-effort cleanup of the compiled binary, also when the run is
        # interrupted by an exception.
        try:
            os.remove(obj_file)
        except OSError:
            pass

    return True


if __name__ == '__main__':
    import sys

    logfile = 'test.log'
    if len(sys.argv) > 1:
        logfile = sys.argv[1]

    # Start from a fresh log; it is fine if there is no old one to remove.
    try:
        os.remove(logfile)
    except OSError:
        pass

    if not test(logfile):
        sys.exit(10)

    sys.exit(0)