[7025] | 1 | #!/bin/env python |
---|
| 2 | |
---|
| 3 | '''Testlet to run the internode LatencyBandwidth test.''' |
---|
| 4 | |
---|
| 5 | import os |
---|
| 6 | import time |
---|
| 7 | import test_utils as util |
---|
| 8 | |
---|
| 9 | name = 'Running internode latency/bandwidth test' |
---|
| 10 | |
---|
| 11 | |
---|
def get_node_connections(nodes):
    '''Given list of nodes, return list of all possible pairs.

    Every unordered pair of entries appears exactly once, as a tuple
    (a, b) in which a comes before b in `nodes`.  Pairs are ordered by
    the position of a, then by the position of b.  An input with fewer
    than two entries yields an empty list.
    '''

    # Local import keeps this function self-contained.
    from itertools import combinations

    # combinations() produces the pairs in the same order as a head/tail
    # recursion would, but iteratively, so a large node list cannot hit
    # the interpreter's recursion limit.
    return list(combinations(nodes, 2))
---|
| 26 | |
---|
def _run_command(cmd):
    '''Run `cmd` through the shell and return (exit_status, output).

    stdout and stderr are merged into a single text string.  If the
    command cannot be started, a non-zero status (100) and the error
    text are returned instead of raising.
    '''

    # Local import keeps this block self-contained.
    import subprocess

    try:
        proc = subprocess.Popen(cmd, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
        # communicate() closes the child's stdin, drains its output and
        # waits for it, so the exit status is reliable afterwards.
        (output, _) = proc.communicate()
    except OSError as e:
        return (100, str(e))

    return (proc.returncode, output)


def _parse_estimate(line):
    '''Return the first token following ': ' in `line`, or None.'''

    (_, separator, value) = line.partition(': ')
    tokens = value.split()
    if not separator or not tokens:
        return None
    return tokens[0]


def _parse_timing_output(output):
    '''Return (bandwidth, latency) parsed from `output`, or None.

    Bandwidth is returned as a float and latency as an int.  None is
    returned when either figure is missing or is not a number.
    '''

    bandwidth = None
    latency = None
    for line in output.splitlines():
        # NOTE: 'bandwith' (sic) is the text matched against the timing
        # program's output; presumably ctiming.c prints it spelled this
        # way, so it must not be "corrected" here.
        if line.startswith('Estimated bandwith'):
            bandwidth = _parse_estimate(line)
        if line.startswith('Estimated latency'):
            latency = _parse_estimate(line)

    if bandwidth is None or latency is None:
        return None

    try:
        return (float(bandwidth), int(latency))
    except ValueError:
        return None


def _log_node_header(logfile, node_numbers):
    '''Log the 'node#' header row listing every node number.'''

    util.log_print(logfile, 'node#\t')
    for number in node_numbers:
        util.log_print(logfile, '%d\t' % number)
    util.log_print_nl(logfile)


def _log_rule(logfile, node_numbers):
    '''Log a horizontal rule with one dash segment per node.'''

    util.log_print(logfile, '\t')
    for _ in node_numbers:
        util.log_print(logfile, '-'*7)
    util.log_print_nl(logfile)


def _log_table(logfile, title, node_numbers, results, index, cell_format):
    '''Log one triangular table of per-pair figures.

    `results` maps (x, y) node-number pairs to either a
    (bandwidth, latency) tuple or (None, <status string>) for a pair
    that failed.  `index` selects which tuple element to print and
    `cell_format` is the %-format applied to it.  Pairs with no entry
    are shown as empty cells; failed pairs show their status string.
    '''

    util.log_print_nl(logfile, title)
    _log_node_header(logfile, node_numbers)
    _log_rule(logfile, node_numbers)

    for row in node_numbers[1:]:
        util.log_print(logfile, '%d\t|' % row)
        for column in node_numbers:
            entry = results.get((column, row))
            if entry is None:
                util.log_print(logfile, '\t')
            elif entry[0] is None:
                # failed pair: second element holds the status string
                util.log_print(logfile, '%s\t' % str(entry[1]))
            else:
                util.log_print(logfile, cell_format % entry[index])
        util.log_print_nl(logfile)

    _log_rule(logfile, node_numbers)
    _log_node_header(logfile, node_numbers)


def _measure_pairs(logfile, cluster, obj_file):
    '''Run the timing binary between every pair of nodes.

    Returns a dictionary mapping (n1, n2) node-number pairs to
    (bandwidth, latency), or to (None, 'fail') when mpirun reported an
    error, or to (None, 'bad') when its output could not be parsed.
    '''

    # strip_bad_nodes=True: presumably leaves out nodes flagged as bad;
    # no additional liveness probe is done here.
    nodes = util.get_node_numbers(cluster, strip_bad_nodes=True)
    nodes.sort()

    # get all unique pairs of node *numbers*
    node_pairs = get_node_connections(nodes)
    # node_stem is a %-format that turns a node number into a host name
    node_stem = util.get_cluster_info(cluster)['node_stem']

    results = {}
    for pair in node_pairs:
        (n1, n2) = pair
        node1 = node_stem % n1
        node2 = node_stem % n2
        cmd = ('mpirun -nolocal -np 2 -host %s,%s %s'
               % (node1, node2, obj_file))
        util.log_print(logfile, cmd + ' ...')

        (status, output) = _run_command(cmd)
        if status:
            util.log_print_nl(logfile, 'ERROR')
            util.log_print(logfile, output)
            results[pair] = (None, 'fail')
            continue

        figures = _parse_timing_output(output)
        if figures is None:
            util.log_print_nl(logfile, 'ERROR')
            util.log_print(logfile, output)
            results[pair] = (None, 'bad')
            continue

        results[pair] = figures
        util.log_print_nl(logfile, 'done')

    return results


def test(logfile):
    '''Compile ctiming.c and run it between every pair of cluster nodes.

    Progress, errors and the final bandwidth and latency tables are
    written through the util logging helpers to `logfile`.

    Returns False if ctiming.c fails to compile, otherwise True.
    Failures of individual node pairs are shown in the tables as
    'fail' or 'bad' but do not change the return value.
    '''

    (cluster, _domain) = util.get_hostname()

    # compile C code
    obj_file = 'ctiming_%s' % cluster
    cmd = 'mpicc ctiming.c -lm -o %s' % obj_file
    util.log_print_nl(logfile, 'Compiling ctiming.c:')
    util.log_print_nl(logfile, cmd)
    (status, output) = _run_command(cmd)
    if status:
        util.log_print_nl(logfile, 'ERRORS COMPILING')
        util.log_print(logfile, output)
        return False

    util.log_print_nl(logfile)

    try:
        # run the tests
        results = _measure_pairs(logfile, cluster, obj_file)

        # report; the tables list every node (strip_bad_nodes=False), so
        # nodes that were not measured show up as empty cells
        util.log_print_nl(logfile,
                          '\nResults of internode bandwidth/latency test on %s'
                          % cluster)
        node_numbers = util.get_node_numbers(cluster, strip_bad_nodes=False)
        node_numbers.sort()

        # bandwidth figures
        _log_table(logfile, 'Bandwidth:', node_numbers, results, 0, '%.1f\t')
        util.log_print_nl(logfile)
        util.log_print_nl(logfile)

        # latency figures
        _log_table(logfile, 'Latency:', node_numbers, results, 1, '%d\t')
    finally:
        # always clean up the compiled binary, even if a run blew up
        if os.path.exists(obj_file):
            os.remove(obj_file)

    return True
---|
| 202 | |
---|
| 203 | |
---|
if __name__ == '__main__':
    import sys

    # Log file defaults to 'test.log'; the first command-line argument,
    # if given, overrides it.
    logfile = 'test.log'
    if len(sys.argv) > 1:
        logfile = sys.argv[1]

    # Start with a fresh log.  Failing to remove the old one (e.g. it
    # does not exist) is not an error, but only filesystem errors are
    # ignored so that Ctrl-C and real bugs still surface.
    try:
        os.remove(logfile)
    except OSError:
        pass

    # Exit status 10 signals a failed test, 0 a successful one.
    if not test(logfile):
        sys.exit(10)

    sys.exit(0)
---|