1 | #!/bin/env python |
---|
2 | |
---|
3 | '''Testlet to run the internode LatencyBandwidth test.''' |
---|
4 | |
---|
5 | import os |
---|
6 | import time |
---|
7 | import test_utils as util |
---|
8 | |
---|
# Human-readable description of this testlet.  It is not used anywhere in
# this file; presumably the harness that loads testlets displays it --
# TODO confirm against the caller.
name = 'Running internode latency/bandwidth test'
---|
10 | |
---|
11 | |
---|
def get_node_connections(nodes):
    '''Given list of nodes, return list of all possible pairs.

    Each pair is a 2-tuple (a, b) where a appears before b in nodes; a
    node is never paired with itself and each unordered pair occurs
    exactly once.  Pairs are ordered by first element, then by second,
    following the order of nodes.  An empty or single-element input
    yields an empty list.
    '''

    # function-scope import: keeps this block independent of the
    # file-level import list
    from itertools import combinations

    # combinations() is iterative, so large node lists do not hit the
    # interpreter recursion limit
    return list(combinations(nodes, 2))
---|
26 | |
---|
def _run_command(cmd):
    '''Run cmd (a list of argument strings) and capture what it prints.

    Returns a (status, output) tuple.  status is the exit status of the
    child (non-zero means failure) and output is its combined
    stdout/stderr as one text string.  If the program cannot be started
    at all, status is 127 and output holds the error message.
    '''

    # function-scope import: keeps this block independent of the
    # file-level import list
    import subprocess

    # stdin comes from the null device so the child can never block
    # waiting for input
    with open(os.devnull) as devnull:
        try:
            proc = subprocess.Popen(cmd, stdin=devnull,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except OSError as err:
            return (127, '%s: %s\n' % (cmd[0], err))
        (output, _) = proc.communicate()
    return (proc.returncode, output)


def _leading_value(line):
    '''Return the first whitespace-delimited field after ': ' in line.

    Returns None if line has no ': ' separator or nothing follows it.
    '''

    (_, separator, rest) = line.partition(': ')
    fields = rest.split()
    if not separator or not fields:
        return None
    return fields[0]


def _parse_estimates(lines):
    '''Pull the bandwidth and latency estimates out of the program output.

    Returns (bandwidth, latency) as (float, int), or None if either
    figure is missing or is not a number.  If a figure is reported more
    than once the last occurrence wins.
    '''

    bandwidth = None
    latency = None
    for line in lines:
        # NOTE(review): 'bandwith' (sic) is matched exactly as before;
        # presumably it mirrors the spelling printed by ctiming.c.
        if line.startswith('Estimated bandwith'):
            bandwidth = _leading_value(line)
        if line.startswith('Estimated latency'):
            latency = _leading_value(line)

    if bandwidth is None or latency is None:
        return None
    try:
        return (float(bandwidth), int(latency))
    except ValueError:
        return None


def _log_header_row(logfile, node_numbers):
    '''Log the row of node numbers that labels the table columns.'''

    util.log_print(logfile, 'node#\t')
    for number in node_numbers:
        util.log_print(logfile, '%d\t' % number)
    util.log_print_nl(logfile)


def _log_rule_row(logfile, node_numbers):
    '''Log the dashed rule drawn under/over the column labels.'''

    util.log_print(logfile, '\t')
    for _ in node_numbers:
        util.log_print(logfile, '-' * 7)
    util.log_print_nl(logfile)


def _log_table(logfile, title, node_numbers, results, cell_format, index):
    '''Log one triangular table of per-pair figures.

    results maps (column_node, row_node) to either (bandwidth, latency)
    or (None, status_word) for a pair that could not be measured.
    index selects which of the two figures to show and cell_format is
    the '%' format applied to it.  Pairs with no entry give a blank cell.
    '''

    util.log_print_nl(logfile, title)
    _log_header_row(logfile, node_numbers)
    _log_rule_row(logfile, node_numbers)

    for row in node_numbers[1:]:
        util.log_print(logfile, '%d\t|' % row)
        for column in node_numbers:
            entry = results.get((column, row), None)
            if entry is None:
                util.log_print(logfile, '\t')
            elif entry[0] is None:
                # unmeasured pair: second slot holds the status word
                util.log_print(logfile, '%s\t' % str(entry[1]))
            else:
                util.log_print(logfile, cell_format % entry[index])
        util.log_print_nl(logfile)

    _log_rule_row(logfile, node_numbers)
    _log_header_row(logfile, node_numbers)


def test(logfile):
    '''Run the latency/bandwidth program between every pair of nodes.

    Compiles ctiming.c with mpicc, runs the resulting binary with mpirun
    on each unique pair of the cluster's usable nodes, and logs one
    bandwidth table and one latency table to logfile.  The binary is
    removed afterwards.

    Returns False if compilation fails, True otherwise.  A pair whose
    run fails ('fail') or whose output cannot be parsed ('bad') is
    reported in the tables but does not change the return value.
    '''

    (cluster, _) = util.get_hostname()

    # compile C code
    obj_file = 'ctiming_%s' % cluster
    compile_cmd = ['mpicc', 'ctiming.c', '-lm', '-o', obj_file]
    util.log_print_nl(logfile, 'Compiling ctiming.c:')
    util.log_print_nl(logfile, ' '.join(compile_cmd))
    (status, output) = _run_command(compile_cmd)
    if status:
        util.log_print_nl(logfile, 'ERRORS COMPILING')
        util.log_print(logfile, output)
        return False

    util.log_print_nl(logfile)

    try:
        # get all unique pairs of usable node *numbers*
        nodes = util.get_node_numbers(cluster, strip_bad_nodes=True)
        nodes.sort()
        node_pairs = get_node_connections(nodes)
        # node_stem is a '%' format that turns a node number into a name
        node_stem = util.get_cluster_info(cluster)['node_stem']

        # run the tests
        results = {}
        for pair in node_pairs:
            (n1, n2) = pair
            hosts = '%s,%s' % (node_stem % n1, node_stem % n2)
            run_cmd = ['mpirun', '-nolocal', '-np', '2',
                       '-host', hosts, obj_file]
            util.log_print(logfile, ' '.join(run_cmd) + ' ...')

            (status, output) = _run_command(run_cmd)
            if status:
                util.log_print_nl(logfile, 'ERROR')
                util.log_print(logfile, output)
                results[pair] = (None, 'fail')
                continue

            estimates = _parse_estimates(output.splitlines())
            if estimates is None:
                util.log_print_nl(logfile, 'ERROR')
                util.log_print(logfile, output)
                results[pair] = (None, 'bad')
                continue

            results[pair] = estimates
            util.log_print_nl(logfile, 'done')

        # report; the tables list every node, including ones that were
        # stripped from the runs above
        util.log_print_nl(logfile, '\nResults of internode bandwidth/latency test on %s' % cluster)
        node_numbers = util.get_node_numbers(cluster, strip_bad_nodes=False)
        node_numbers.sort()

        _log_table(logfile, 'Bandwidth:', node_numbers, results, '%.1f\t', 0)
        util.log_print_nl(logfile)
        util.log_print_nl(logfile)
        _log_table(logfile, 'Latency:', node_numbers, results, '%d\t', 1)
    finally:
        # never leave the compiled binary behind, even on an error
        os.remove(obj_file)

    return True
---|
202 | |
---|
203 | |
---|
if __name__ == '__main__':
    # Script entry point: run the testlet, logging to the file named by
    # the first command-line argument (default 'test.log').
    # Exit status is 0 if test() reports success and 10 otherwise.
    import sys

    logfile = 'test.log'
    if len(sys.argv) > 1:
        logfile = sys.argv[1]

    # start from a fresh log; it is fine if there is no old one to remove
    try:
        os.remove(logfile)
    except OSError:
        pass

    if not test(logfile):
        sys.exit(10)

    sys.exit(0)
---|