Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

gpu_python_glue.c @ 8932

Last change on this file since 8932 was 8176, checked in by steve, 14 years ago
Moved and then moved back anuga_1d. Will have to get Paul Bryan to first update the pipe flow directory
File size: 21.1 KB

Line
1
2	// Python - C extension module for gpu_domain.py
3	//
4	// To compile (Python2.3):
5	// gcc -c %.c -I/usr/include/python2.3 -o %.o -Wall -O
6	// gcc -shared %.o -o %.so cudafun.o -L$(CUDA_INSTALL_PATH)/lib64 -lcudart -lm
7	//
8	// or use python compile.py
9	//
10	// See the module shallow_water_domain.py for more documentation on
11	// how to use this module
12	//
13	// Includes Cuda GPU usage:
14	// memory allocation in first iteration, copying, and deletion in final
15	// iteration;
16	// requires linkage of cudafun.o, CUDA library
17	//
18
19
20	#include "Python.h"
21	#include "numpy/arrayobject.h"
22	#include "math.h"
23	#include <stdio.h>
24	#include <stdlib.h>
25	#include "numpy_shim.h"
26	#include "util_ext.h"
27
28	#include "cudafun.h"
29
30	//=========================================================================
31	// Python Glue
32	//=========================================================================
33
34	PyObject compute_fluxes_ext_central_new_gpu(PyObject self, PyObject *args) {
35	/*Compute all fluxes and the timestep suitable for all volumes
36	in domain. GPU version.
37
38	Compute total flux for each conserved quantity using "flux_function_central"
39
40	Fluxes across each edge are scaled by edgelengths and summed up
41	Resulting flux is then scaled by area and stored in
42	explicit_update for each of the three conserved quantities
43	stage, xmomentum and ymomentum
44
45	The maximal allowable speed computed by the flux_function for each volume
46	is converted to a timestep that must not be exceeded. The minimum of
47	those is computed as the next overall timestep.
48
49	First set up data structures on GPU, then copy data, call computational kernel,
50	copy back.
51
52	Python call:
53	timestep = compute_fluxes(timestep, domain, stage, xmom, ymom, bed)
54
55
56	Post conditions:
57	domain.explicit_update is reset to computed flux values
58	returns timestep which is the largest step satisfying all volumes.
59
60
61	*/
62	PyObject
63	*domain,
64	*stage,
65	*xmom,
66	*ymom,
67	*bed;
68
69	PyArrayObject
70	*neighbours,
71	*neighbour_edges,
72	*normals,
73	*edgelengths,
74	*radii,
75	*areas,
76	*tri_full_flag,
77	*stage_edge_values,
78	*xmom_edge_values,
79	*ymom_edge_values,
80	*bed_edge_values,
81	*stage_boundary_values,
82	*xmom_boundary_values,
83	*ymom_boundary_values,
84	*stage_explicit_update,
85	*xmom_explicit_update,
86	*ymom_explicit_update,
87	*already_computed_flux, //Tracks whether the flux across an edge has already been computed
88	*max_speed_array; //Keeps track of max speeds for each triangle
89
90	static int iteration = 0;
91	// TODO: get final_iteration
92	int final_iter = 1;
93	double timestep, epsilon, H0, g;
94	int i,optimise_dry_cells = 0;
95
96	// Convert Python arguments to C
97	if (!PyArg_ParseTuple(args, "dOOOOO", &timestep, &domain, &stage, &xmom, &ymom, &bed )) {
98	report_python_error(AT, "could not parse input arguments");
99	return NULL;
100	}
101
102	epsilon = get_python_double(domain,"epsilon");
103	H0 = get_python_double(domain,"H0");
104	g = get_python_double(domain,"g");
105	//optimise_dry_cells = get_python_integer(domain,"optimse_dry_cells");
106
107	neighbours = get_consecutive_array(domain, "neighbours");
108	neighbour_edges = get_consecutive_array(domain, "neighbour_edges");
109	normals = get_consecutive_array(domain, "normals");
110	edgelengths = get_consecutive_array(domain, "edgelengths");
111	radii = get_consecutive_array(domain, "radii");
112	areas = get_consecutive_array(domain, "areas");
113	tri_full_flag = get_consecutive_array(domain, "tri_full_flag");
114	already_computed_flux = get_consecutive_array(domain, "already_computed_flux");
115	max_speed_array = get_consecutive_array(domain, "max_speed");
116
117	stage_edge_values = get_consecutive_array(stage, "edge_values");
118	xmom_edge_values = get_consecutive_array(xmom, "edge_values");
119	ymom_edge_values = get_consecutive_array(ymom, "edge_values");
120	bed_edge_values = get_consecutive_array(bed, "edge_values");
121
122	stage_boundary_values = get_consecutive_array(stage, "boundary_values");
123	xmom_boundary_values = get_consecutive_array(xmom, "boundary_values");
124	ymom_boundary_values = get_consecutive_array(ymom, "boundary_values");
125
126	stage_explicit_update = get_consecutive_array(stage, "explicit_update");
127	xmom_explicit_update = get_consecutive_array(xmom, "explicit_update");
128	ymom_explicit_update = get_consecutive_array(ymom, "explicit_update");
129
130	int number_of_elements = stage_edge_values -> dimensions[0];
131
132	printf("glue: dt: %f, noe: %i, eps: %f, H0: %f, g: %f\n",
133	timestep, number_of_elements, epsilon, H0, g); fflush(stdout);
134
135	// get array dimensions
136	size_t number_of_neighbours = neighbours -> dimensions[0];
137	size_t number_of_neighbour_edges = neighbour_edges -> dimensions[0];
138	size_t number_of_normals = normals -> dimensions[0];
139	size_t number_of_edgelengths = edgelengths -> dimensions[0];
140	size_t number_of_radii = radii -> dimensions[0];
141	size_t number_of_areas = areas -> dimensions[0];
142	size_t number_of_tri_full_flag = tri_full_flag -> dimensions[0];
143	//size_t number_of_already_computed = already_computed_flux -> dimensions[0];
144	size_t number_of_max_speed_array = max_speed_array -> dimensions[0];
145
146	size_t number_of_stage_edge_values = stage_edge_values -> dimensions[0];
147	size_t number_of_xmom_edge_values = xmom_edge_values -> dimensions[0];
148	size_t number_of_ymom_edge_values = ymom_edge_values -> dimensions[0];
149	size_t number_of_bed_edge_values = bed_edge_values -> dimensions[0];
150
151	size_t number_of_stage_boundary_values = stage_boundary_values -> dimensions[0];
152	size_t number_of_xmom_boundary_values = xmom_boundary_values -> dimensions[0];
153	size_t number_of_ymom_boundary_values = ymom_boundary_values -> dimensions[0];
154
155	size_t number_of_stage_explicit_update = stage_explicit_update -> dimensions[0];
156	size_t number_of_xmom_explicit_update = xmom_explicit_update -> dimensions[0];
157	size_t number_of_ymom_explicit_update = ymom_explicit_update -> dimensions[0];
158
159
160	// extract C arrays from python wrapper
161	long* c_neighbours = (long*) neighbours -> data;
162	long* c_neighbour_edges = (long*) neighbour_edges -> data;
163	double* c_normals = (double*) normals -> data;
164	double* c_edgelengths = (double*) edgelengths -> data;
165	double* c_radii = (double*) radii -> data;
166	double* c_areas = (double*) areas -> data;
167	long* c_tri_full_flag = (long*) tri_full_flag -> data;
168	//long* c_already_computed_flux = (long*) already_computed_flux -> data;
169	double* c_max_speed_array = (double*) max_speed_array -> data;
170
171	double* c_stage_edge_values = (double*) stage_edge_values -> data;
172	double* c_xmom_edge_values = (double*) xmom_edge_values -> data;
173	double* c_ymom_edge_values = (double*) ymom_edge_values -> data;
174	double* c_bed_edge_values = (double*) bed_edge_values -> data;
175
176	double* c_stage_boundary_values = (double*) stage_boundary_values -> data;
177	double* c_xmom_boundary_values = (double*) xmom_boundary_values -> data;
178	double* c_ymom_boundary_values = (double*) ymom_boundary_values -> data;
179
180	double* c_stage_explicit_update = (double*) stage_explicit_update -> data;
181	double* c_xmom_explicit_update = (double*) xmom_explicit_update -> data;
182	double* c_ymom_explicit_update = (double*) ymom_explicit_update -> data;
183
184	setKernelDims( 32, 96 );
185	printKernelDims();
186
187	static long* gpu_neighbours = NULL;
188	static long* gpu_neighbour_edges = NULL;
189	static double* gpu_normals = NULL;
190	static double* gpu_edgelengths = NULL;
191	static double* gpu_radii = NULL;
192	static double* gpu_areas = NULL;
193	static long* gpu_tri_full_flag = NULL;
194	//static long* gpu_already_computed_flux = NULL;
195	static double* gpu_max_speed_array = NULL;
196
197	static double* gpu_stage_edge_values = NULL;
198	static double* gpu_xmom_edge_values = NULL;
199	static double* gpu_ymom_edge_values = NULL;
200	static double* gpu_bed_edge_values = NULL;
201
202	static double* gpu_stage_boundary_values = NULL;
203	static double* gpu_xmom_boundary_values = NULL;
204	static double* gpu_ymom_boundary_values = NULL;
205
206	static double* gpu_stage_explicit_update = NULL;
207	static double* gpu_xmom_explicit_update = NULL;
208	static double* gpu_ymom_explicit_update = NULL;
209
210
211	if( 0 == iteration ) {
212	selectDevice(0,1,"");
213	// allocate GPU arrays
214	gpu_neighbours = (long) allocDeviceMemory( number_of_neighbours sizeof(long) );
215	gpu_neighbour_edges = (long) allocDeviceMemory( number_of_neighbour_edges sizeof(long) );
216	gpu_normals = (double) allocDeviceMemory( number_of_normals sizeof(double) );
217	gpu_edgelengths = (double) allocDeviceMemory( number_of_edgelengths sizeof(double) );
218	gpu_radii = (double) allocDeviceMemory( number_of_radii sizeof(double) );
219	gpu_areas = (double) allocDeviceMemory( number_of_areas sizeof(double) );
220	gpu_tri_full_flag = (long) allocDeviceMemory( number_of_tri_full_flag sizeof(long) );
221	//gpu_already_computed_flux = (long) allocDeviceMemory( number_of_already_computed sizeof(long) );
222	gpu_max_speed_array = (double) allocDeviceMemory( number_of_max_speed_array sizeof(double) );
223
224	gpu_stage_edge_values = (double) allocDeviceMemory( number_of_stage_edge_values sizeof(double) );
225	gpu_xmom_edge_values = (double) allocDeviceMemory( number_of_xmom_edge_values sizeof(double) );
226	gpu_ymom_edge_values = (double) allocDeviceMemory( number_of_ymom_edge_values sizeof(double) );
227	gpu_bed_edge_values = (double) allocDeviceMemory( number_of_bed_edge_values sizeof(double) );
228
229	gpu_stage_boundary_values = (double) allocDeviceMemory( number_of_stage_boundary_values sizeof(double) );
230	gpu_xmom_boundary_values = (double) allocDeviceMemory( number_of_xmom_boundary_values sizeof(double) );
231	gpu_ymom_boundary_values = (double) allocDeviceMemory( number_of_ymom_boundary_values sizeof(double) );
232
233	gpu_stage_explicit_update = (double) allocDeviceMemory( number_of_stage_explicit_update sizeof(double) );
234	gpu_xmom_explicit_update = (double) allocDeviceMemory( number_of_xmom_explicit_update sizeof(double) );
235	gpu_ymom_explicit_update = (double) allocDeviceMemory( number_of_ymom_explicit_update sizeof(double) );
236
237	// Constant quantities copied only in first iteration
238	copyHostToDevice( gpu_neighbours , c_neighbours , number_of_neighbours * sizeof(long) );
239	copyHostToDevice( gpu_neighbour_edges , c_neighbour_edges , number_of_neighbour_edges * sizeof(long) );
240	copyHostToDevice( gpu_normals , c_normals , number_of_normals * sizeof(double) );
241	copyHostToDevice( gpu_edgelengths , c_edgelengths , number_of_edgelengths * sizeof(double) );
242	copyHostToDevice( gpu_radii , c_radii , number_of_radii * sizeof(double) );
243	copyHostToDevice( gpu_areas , c_areas , number_of_areas * sizeof(double) );
244	}
245
246	copyHostToDevice( gpu_tri_full_flag , c_tri_full_flag , number_of_tri_full_flag * sizeof(long) );
247	//copyHostToDevice( gpu_already_computed_flux, c_already_computed_flux, number_of_already_computed * sizeof(long) );
248	//copyHostToDevice( gpu_max_speed_array , c_max_speed_array , number_of_max_speed_array * sizeof(double) );
249
250	copyHostToDevice( gpu_stage_edge_values , c_stage_edge_values , number_of_stage_edge_values * sizeof(double) );
251	copyHostToDevice( gpu_xmom_edge_values , c_xmom_edge_values , number_of_xmom_edge_values * sizeof(double) );
252	copyHostToDevice( gpu_ymom_edge_values , c_ymom_edge_values , number_of_ymom_edge_values * sizeof(double) );
253	copyHostToDevice( gpu_bed_edge_values , c_bed_edge_values , number_of_bed_edge_values * sizeof(double) );
254
255	copyHostToDevice( gpu_stage_boundary_values ,c_stage_boundary_values ,number_of_stage_boundary_values * sizeof(double) );
256	copyHostToDevice( gpu_xmom_boundary_values ,c_xmom_boundary_values ,number_of_xmom_boundary_values * sizeof(double) );
257	copyHostToDevice( gpu_ymom_boundary_values ,c_ymom_boundary_values ,number_of_ymom_boundary_values * sizeof(double) );
258
259	/copyHostToDevice( gpu_stage_explicit_update ,c_stage_explicit_update ,number_of_stage_explicit_update sizeof(double) );
260	copyHostToDevice( gpu_xmom_explicit_update ,c_xmom_explicit_update ,number_of_xmom_explicit_update * sizeof(double) );
261	copyHostToDevice( gpu_ymom_explicit_update ,c_ymom_explicit_update ,number_of_ymom_explicit_update * sizeof(double) );*/
262
263	// initialize explicit updates to zero (possibly superfluous)
264	_set_to_default( gpu_stage_explicit_update, gpu_xmom_explicit_update, gpu_ymom_explicit_update, number_of_stage_explicit_update, 0.0 );
265
266
267	// Call underlying flux computation routine and update
268	// the explicit update arrays
269	timestep = _compute_fluxes_central(number_of_elements,
270	timestep,
271	epsilon,
272	H0,
273	g,
274	gpu_neighbours ,
275	gpu_neighbour_edges ,
276	gpu_normals ,
277	gpu_edgelengths ,
278	gpu_radii ,
279	gpu_areas ,
280	gpu_tri_full_flag ,
281	gpu_stage_edge_values ,
282	gpu_xmom_edge_values ,
283	gpu_ymom_edge_values ,
284	gpu_bed_edge_values ,
285	gpu_stage_boundary_values ,
286	gpu_xmom_boundary_values ,
287	gpu_ymom_boundary_values ,
288	gpu_stage_explicit_update ,
289	gpu_xmom_explicit_update ,
290	gpu_ymom_explicit_update ,
291	//gpu_already_computed_flux ,
292	gpu_max_speed_array ,
293	optimise_dry_cells);
294
295	/* (long*) neighbours -> data,
296	(long*) neighbour_edges -> data,
297	(double*) normals -> data,
298	(double*) edgelengths -> data,
299	(double*) radii -> data,
300	(double*) areas -> data,
301	(long*) tri_full_flag -> data,
302	(double*) stage_edge_values -> data,
303	(double*) xmom_edge_values -> data,
304	(double*) ymom_edge_values -> data,
305	(double*) bed_edge_values -> data,
306	(double*) stage_boundary_values -> data,
307	(double*) xmom_boundary_values -> data,
308	(double*) ymom_boundary_values -> data,
309	(double*) stage_explicit_update -> data,
310	(double*) xmom_explicit_update -> data,
311	(double*) ymom_explicit_update -> data,
312	(long*) already_computed_flux -> data,
313	(double*) max_speed_array -> data,
314	optimise_dry_cells);
315	*/
316
317	// copy GPU to Host memory
318	/copyDeviceToHost( c_neighbours , gpu_neighbours , number_of_neighbours sizeof(long) );
319	copyDeviceToHost( c_neighbour_edges , gpu_neighbour_edges , number_of_neighbour_edges * sizeof(long) );
320	copyDeviceToHost( c_normals , gpu_normals , number_of_normals * sizeof(double) );
321	copyDeviceToHost( c_edgelengths , gpu_edgelengths , number_of_edgelengths * sizeof(double) );
322	copyDeviceToHost( c_radii , gpu_radii , number_of_radii * sizeof(double) );
323	copyDeviceToHost( c_areas , gpu_areas , number_of_areas * sizeof(double) );
324	copyDeviceToHost( c_tri_full_flag , gpu_tri_full_flag , number_of_tri_full_flag * sizeof(long) );
325	//copyDeviceToHost( c_already_computed_flux, gpu_already_computed_flux, number_of_already_computed * sizeof(long) );
326	copyDeviceToHost( c_max_speed_array , gpu_max_speed_array , number_of_max_speed_array * sizeof(double) );
327
328	copyDeviceToHost( c_stage_edge_values , gpu_stage_edge_values , number_of_stage_edge_values * sizeof(double) );
329	copyDeviceToHost( c_xmom_edge_values , gpu_xmom_edge_values , number_of_xmom_edge_values * sizeof(double) );
330	copyDeviceToHost( c_ymom_edge_values , gpu_ymom_edge_values , number_of_ymom_edge_values * sizeof(double) );
331	copyDeviceToHost( c_bed_edge_values , gpu_bed_edge_values , number_of_bed_edge_values * sizeof(double) );
332
333	copyDeviceToHost( c_stage_boundary_values ,gpu_stage_boundary_values , number_of_stage_boundary_values * sizeof(double) );
334	copyDeviceToHost( c_xmom_boundary_values ,gpu_xmom_boundary_values , number_of_xmom_boundary_values * sizeof(double) );
335	copyDeviceToHost( c_ymom_boundary_values ,gpu_ymom_boundary_values , number_of_ymom_boundary_values * sizeof(double) );
336	*/
337	copyDeviceToHost( c_stage_explicit_update ,gpu_stage_explicit_update , number_of_stage_explicit_update* sizeof(double) );
338	copyDeviceToHost( c_xmom_explicit_update ,gpu_xmom_explicit_update , number_of_xmom_explicit_update * sizeof(double) );
339	copyDeviceToHost( c_ymom_explicit_update ,gpu_ymom_explicit_update , number_of_ymom_explicit_update * sizeof(double) );
340
341	if( iteration == final_iter ) {
342	// Free gpu memory
343	freeDeviceMemory( gpu_neighbours );
344	freeDeviceMemory( gpu_neighbour_edges );
345	freeDeviceMemory( gpu_normals );
346	freeDeviceMemory( gpu_edgelengths);
347	freeDeviceMemory( gpu_radii );
348	freeDeviceMemory( gpu_areas );
349	freeDeviceMemory( gpu_tri_full_flag );
350	//freeDeviceMemory( gpu_already_computed_flux);
351	freeDeviceMemory( gpu_max_speed_array );
352
353	freeDeviceMemory( gpu_stage_edge_values );
354	freeDeviceMemory( gpu_xmom_edge_values );
355	freeDeviceMemory( gpu_ymom_edge_values );
356	freeDeviceMemory( gpu_bed_edge_values );
357
358	freeDeviceMemory( gpu_stage_boundary_values );
359	freeDeviceMemory( gpu_xmom_boundary_values );
360	freeDeviceMemory( gpu_ymom_boundary_values );
361
362	freeDeviceMemory( gpu_stage_explicit_update);
363	freeDeviceMemory( gpu_xmom_explicit_update );
364	freeDeviceMemory( gpu_ymom_explicit_update );
365	}
366
367	iteration++;
368
369	Py_DECREF(neighbours);
370	Py_DECREF(neighbour_edges);
371	Py_DECREF(normals);
372	Py_DECREF(edgelengths);
373	Py_DECREF(radii);
374	Py_DECREF(areas);
375	Py_DECREF(tri_full_flag);
376	Py_DECREF(already_computed_flux);
377	Py_DECREF(max_speed_array);
378	Py_DECREF(stage_edge_values);
379	Py_DECREF(xmom_edge_values);
380	Py_DECREF(ymom_edge_values);
381	Py_DECREF(bed_edge_values);
382	Py_DECREF(stage_boundary_values);
383	Py_DECREF(xmom_boundary_values);
384	Py_DECREF(ymom_boundary_values);
385	Py_DECREF(stage_explicit_update);
386	Py_DECREF(xmom_explicit_update);
387	Py_DECREF(ymom_explicit_update);
388
389
390	// Return updated flux timestep
391	return Py_BuildValue("d", timestep);
392	}
393
394
395	//-------------------------------
396	// Method table for python module
397	//-------------------------------
398	static struct PyMethodDef MethodTable[] = {
399	/* The cast of the function is necessary since PyCFunction values
400	* only take two PyObject* parameters, and rotate() takes
401	* three.
402	*/
403
404	{"compute_fluxes_ext_central_new_gpu", compute_fluxes_ext_central_new_gpu, METH_VARARGS, "Print out"},
405	{NULL, NULL}
406	};
407
408	// Module initialisation
409	void initgpu_python_glue(void){
410	Py_InitModule("gpu_python_glue", MethodTable);
411
412	import_array(); // Necessary for handling of NumPY structures
413	}
414

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/anuga_work/anuga_gpu/gpu_python_glue.c @ 8932

Download in other formats: