1 | #!/usr/bin/env python |
---|
2 | |
---|
def get_kernel_function_info(a, W1=0, W2=1, W3=1):
    """Print resource usage and occupancy information for a kernel.

    The report covers:
        1. max #threads per block,
        2. active warps per MP,
        3. thread blocks per MP,
        4. usage of shared memory,
        5. const memory,
        6. local memory,
        7. registers,
        8. hardware occupancy,
        9. limitation of the hardware occupancy.

    Parameters:
        a  -- kernel object exposing ``max_threads_per_block``,
              ``shared_size_bytes``, ``const_size_bytes``,
              ``local_size_bytes`` and ``num_regs``
              (presumably a PyCUDA function -- confirm against callers).
        W1 -- first block dimension; a falsy value (the default 0) is
              replaced by ``a.max_threads_per_block``.
        W2 -- second block dimension.
        W3 -- third block dimension.

    Nothing is returned; the report is written to standard output.
    Device 0 is always the device that is queried.
    """

    import pycuda.tools as tl
    import pycuda.driver as dri

    dev = dri.Device(0)
    td = tl.DeviceData()
    if not W1:
        W1 = a.max_threads_per_block
    # Occupancy is computed for the total number of threads in the block.
    to = tl.OccupancyRecord(td, W1*W2*W3, a.shared_size_bytes, a.num_regs)

    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print("***************************************")
    print(" Function Info ")
    print(" -> max threads per block: %d / %d / %d" % (
        a.max_threads_per_block,
        dev.max_threads_per_block,
        dev.max_threads_per_multiprocessor))
    print(" -> shared mem : %d / %d" % (a.shared_size_bytes,
                                        td.shared_memory))
    print(" -> const mem : %d" % a.const_size_bytes)
    print(" -> local mem : %d" % a.local_size_bytes)
    print(" -> register : %d / %d" % (a.num_regs, td.registers))
    print(" -> thread block per MP %d / %d" % (
        to.tb_per_mp, td.thread_blocks_per_mp))
    print(" -> warps per MP %d / %d" % (to.warps_per_mp, td.warps_per_mp))
    print(" -> occupancy %f" % to.occupancy)
    print(" -> limitation %s" % to.limited_by)
    print(" Block size : %dx%dx%d" % (W1, W2, W3))
    print("***************************************")
---|
44 | |
---|
45 | |
---|
46 | |
---|
47 | |
---|
def get_sourceModule(k_dir, k_name, rearranged_domain=False):
    """Compile a kernel source file and return the PyCUDA SourceModule.

    Parameters:
        k_dir  -- directory holding the kernel source; it is concatenated
                  directly with ``k_name`` (so it is expected to end with a
                  path separator) and is also passed as an include directory.
        k_name -- file name of the kernel source inside ``k_dir``.
        rearranged_domain -- when true, ``#define REARRANGED_DOMAIN`` is
                  prepended to the source before compilation.

    Returns the compiled ``pycuda.compiler.SourceModule``.
    """

    from pycuda.compiler import SourceModule
    # NOTE(review): archM and codeM are imported but not used below; the
    # arch/code values are hard-coded instead. Kept so importing behaviour
    # is unchanged -- confirm whether they were meant to replace the literals.
    from anuga_cuda import archM, codeM

    if rearranged_domain:
        define_macro = "#define REARRANGED_DOMAIN\n"
    else:
        define_macro = ""

    # Read through a context manager so the file handle is always closed.
    with open(k_dir + k_name, "r") as kernel_file:
        kernel_source = kernel_file.read()

    return SourceModule(
        define_macro + kernel_source,
        arch='compute_20',
        code='sm_20',
        options=['-use_fast_math', '--compiler-options', '-O3'],
        include_dirs=[k_dir]
        )
---|
65 | |
---|
66 | |
---|
67 | |
---|
def get_page_locked_array(a):
    """Return a page-locked copy of the array ``a``.

    A new page-locked array with the shape and dtype of ``a`` is allocated
    with the ``DEVICEMAP`` host allocation flag, filled with the contents of
    ``a`` and returned. ``a`` itself is left untouched.

    Parameters:
        a -- source array (anything accepted by
             ``pycuda.driver.pagelocked_zeros_like``).

    Raises AssertionError if the copy does not compare equal to ``a``
    (the check is skipped when Python runs with -O).
    """

    # numpy is imported locally, like every other dependency in this file;
    # without it the sanity check below raised NameError.
    import numpy
    import pycuda.driver as drv

    page_locked = drv.pagelocked_zeros_like(
        a, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    # Ellipsis assignment copies in place for any number of dimensions.
    page_locked[...] = a
    assert numpy.allclose(a, page_locked)
    return page_locked
---|
81 | |
---|
82 | |
---|
def get_device_array(a):
    """Reserve device memory large enough to hold ``a``.

    Only ``a.nbytes`` is consulted; the contents of ``a`` are not copied.
    Returns whatever ``pycuda.driver.mem_alloc`` returns for that size.
    """

    import pycuda.driver as drv

    size_in_bytes = a.nbytes
    return drv.mem_alloc(size_in_bytes)
---|
89 | |
---|
90 | |
---|
91 | |
---|
def asy_cpy(a, a_gpu, auto_init_context=True):
    """Copy the host array ``a`` to the device allocation ``a_gpu``.

    With ``auto_init_context`` true, a new stream is created, the copy is
    queued on it with ``memcpy_htod_async`` and the stream is returned.
    Otherwise a plain ``memcpy_htod`` is issued and None is returned.
    """

    import pycuda.driver as drv

    if not auto_init_context:
        drv.memcpy_htod(a_gpu, a)
        return None

    stream = drv.Stream()
    drv.memcpy_htod_async(a_gpu, a, stream)
    return stream
---|
113 | |
---|
114 | |
---|
115 | |
---|
def cpy_back(a, a_gpu, auto_init_context=True):
    """Copy the device allocation ``a_gpu`` back into the host array ``a``.

    With ``auto_init_context`` true, a new stream is created, the copy is
    queued on it with ``memcpy_dtoh_async`` and the stream is returned.
    Otherwise a plain ``memcpy_dtoh`` is issued and None is returned.
    """

    import pycuda.driver as drv

    if not auto_init_context:
        drv.memcpy_dtoh(a, a_gpu)
        return None

    stream = drv.Stream()
    drv.memcpy_dtoh_async(a, a_gpu, stream)
    return stream
---|
131 | |
---|
132 | |
---|
133 | |
---|
def cpy_back_and_cmp(a, b, value_type, gpu = True, rg = False):
    """Download one mesh array (optionally) and compare it between a and b.

    Parameters:
        a  -- object holding the host array named by ``value_type`` and,
              when ``gpu`` is true, the matching ``<name>_gpu`` device
              buffer.
        b  -- reference object holding the same host array.
        value_type -- name of the quantity to check. One of
              "centroid_values", "vertex_values", "boundary_values",
              "edge_values", "x_gradient_values", "y_gradient_values",
              "explicit_update", "semi_implicit_update", "areas",
              "surrogate_neighbours", "number_of_boundaries",
              "centroid_coordinates", "vertex_coordinates",
              "edge_coordinates".
        gpu -- when true, the device buffer is first copied back into the
              host array of ``a`` with ``cpy_back``.
        rg  -- when true (and ``gpu`` is true), quantities that have a
              rearranged layout are compared with
              ``check_rearranged_array`` instead of ``numpy.allclose``.

    Returns the result of the comparison.

    Raises Exception for an unknown ``value_type``.
    """

    import numpy

    # value_type -> (attribute name on a/b, third argument forwarded to
    # check_rearranged_array, or None when the quantity has no rearranged
    # comparison). The numbers are taken unchanged from the original code.
    fields = {
        "centroid_values": ("centroid_values", None),
        "vertex_values": ("vertex_values", 3),
        "boundary_values": ("boundary_values", None),
        "edge_values": ("edge_values", 3),
        "x_gradient_values": ("x_gradient", None),
        "y_gradient_values": ("y_gradient", None),
        "explicit_update": ("explicit_update", None),
        "semi_implicit_update": ("semi_implicit_update", None),
        "areas": ("areas", None),
        "surrogate_neighbours": ("surrogate_neighbours", 3),
        "number_of_boundaries": ("number_of_boundaries", None),
        "centroid_coordinates": ("centroid_coordinates", 2),
        "vertex_coordinates": ("vertex_coordinates", 32),
        "edge_coordinates": ("edge_coordinates", 32),
    }

    # Look the name up by equality; the previous `is` comparisons only
    # matched when the caller's string happened to be the same object as
    # the literal.
    try:
        entry = fields[value_type]
    except (KeyError, TypeError):
        entry = None
    if entry is None:
        raise Exception('Unknown value_type %s' % value_type)

    attr, rearranged_arg = entry
    host_array = getattr(a, attr)

    if gpu:
        stream = cpy_back(host_array, getattr(a, attr + "_gpu"))
        # cpy_back queues an asynchronous copy by default and hands back
        # its stream; wait for it so the comparison sees the final data.
        if stream is not None:
            stream.synchronize()
        if rg and rearranged_arg is not None:
            return check_rearranged_array(
                getattr(b, attr), host_array, rearranged_arg)

    return numpy.allclose(host_array, getattr(b, attr))
---|
244 | |
---|
245 | |
---|
246 | |
---|
def number_domain_method(domain):
    """Convert the method names stored as text on ``domain`` to integers.

    This is used by the HMPP Python glue code.

    Parameters:
        domain -- object with the string attributes ``timestepping_method``,
                  ``flow_algorithm`` and ``compute_fluxes_method``.

    Each chosen method and its integer code is also printed to standard
    output.

    Return value:
        (compute_fluxes_method, flow_algorithm, timestepping_method)

    Unrecognised names map to the fall-back codes 5, 3 and 4 respectively.
    """

    timestepping_codes = {'euler': 1, 'rk2': 2, 'rk3': 3}
    flow_algorithm_codes = {'tsunami': 1, 'yusuke': 2}
    compute_fluxes_codes = {
        'original': 0,
        'wb_1': 1,
        'wb_2': 2,
        'wb_3': 3,
        'tsunami': 4,
    }

    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    timestepping_method = timestepping_codes.get(
        domain.timestepping_method, 4)
    print(" The timestepping_method is '%s' %d" % (
        domain.timestepping_method, timestepping_method))

    flow_algorithm = flow_algorithm_codes.get(domain.flow_algorithm, 3)
    print(" The flow_algorithm us '%s' %d" % (
        domain.flow_algorithm, flow_algorithm))

    compute_fluxes_method = compute_fluxes_codes.get(
        domain.compute_fluxes_method, 5)
    print(" The compute_fluxes_method is '%s' %d" % (
        domain.compute_fluxes_method, compute_fluxes_method))

    return (compute_fluxes_method, flow_algorithm, timestepping_method)
---|
292 | |
---|
293 | |
---|
294 | |
---|
295 | |
---|