Context Navigation

source: branches/numpy/anuga/utilities/csv_tools.py @ 7101

Last change on this file since 7101 was 7101, checked in by rwilson, 15 years ago
Changed name of files to better *_tools.py form.
File size: 3.9 KB

Line
1	#!/usr/bin/env python
2
3	'''
4	Some CSV file utility routines.
5	'''
6
7
8	import csv
9
10
def merge_csv_key_values(file_title_list, output_file,
                         key_col='hours', data_col='stage'):
    '''Select key and value columns from 'N' CSV files, write one CSV file.

    file_title_list: a list of (filename, new_value_column_title) values, one
                     for each file
    output_file:     the output CSV file path
    key_col:         column header string of key column
    data_col:        column header string of value column

    The output file will look like:
        <key_col>, <new_value_column_title1>, <new_value_column_title2>, ...
        <key_value>, <data1>, <data2>, ...
        <key_value>, <data1>, <data2>, ...
        <key_value>, <data1>, <data2>, ...
        <key_value>, <data1>, <data2>, ...

    Header names and cell values have surrounding whitespace stripped.
    Completely blank lines in an input file are ignored.

    There is an assumption that the <key_value> values are the same across
    all files for the same row.  This is tested in the code below.

    Raises Exception if:
        - 'file_title_list' is empty
        - an input file has no header row, or lacks 'key_col' or 'data_col'
        - a data row is too short to hold both requested columns
        - the input files differ in row count or in key values
    Nothing is written to 'output_file' unless all inputs pass these checks.
    '''

    # local import: the csv module wants differently-opened files on
    # Python 2 and Python 3, so we need to know which one is running
    import sys

    def open_csv(path, mode):
        '''Open 'path' in the way the csv module requires.

        'mode' is 'r' or 'w'.  Python 3 needs text mode with newline=''
        so the csv module controls line endings itself; Python 2 needs
        binary mode for the same reason.
        '''

        if sys.version_info[0] >= 3:
            return open(path, mode, newline='')
        return open(path, mode + 'b')

    def read_csv_file(filename, key_col, data_col):
        '''Read data from a CSV file, get 'key_col' and 'data_col' columns.

        Returns [(key[0], data[0]), ...], all values stripped strings.
        '''

        # 'with' guarantees the file is closed even if we raise below
        with open_csv(filename, 'r') as fd:
            csv_reader = csv.reader(fd)

            # get header row, calculate required column indices
            try:
                header = [x.strip() for x in next(csv_reader)]
            except StopIteration:
                msg = 'File %s is empty, no header row' % filename
                raise Exception(msg)

            for col in (key_col, data_col):
                if col not in header:
                    msg = ("Column '%s' not in file %s"
                           % (col, filename))
                    raise Exception(msg)

            key_index = header.index(key_col)
            data_index = header.index(data_col)
            min_fields = max(key_index, data_index) + 1

            # read data, extract columns, save
            result = []
            for line in csv_reader:
                if not line:
                    # csv.reader yields [] for a blank line, skip it
                    continue
                if len(line) < min_fields:
                    msg = ('Line %d of file %s has %d fields, '
                           'need at least %d'
                           % (csv_reader.line_num, filename,
                              len(line), min_fields))
                    raise Exception(msg)
                result.append((line[key_index].strip(),
                               line[data_index].strip()))

        return result

    # without any input there is no key column to write
    if not file_title_list:
        raise Exception('No input files given, nothing to merge')

    # read data from all files
    # file_data -> [(filename, title, [(k,v), (k,v), ...]), ...]
    file_data = [(filename, title,
                  read_csv_file(filename, key_col, data_col))
                 for (filename, title) in file_title_list]

    # the first file is the reference for all sanity checks
    (first_filename, _, first_kv) = file_data[0]
    num_rows = len(first_kv)
    first_key_values = [kv[0] for kv in first_kv]

    # sanity check, check num rows same in all files
    for (fn, _, d) in file_data[1:]:
        if len(d) != num_rows:
            msg = ('File %s has different number of rows from %s, '
                   'expected %d rows, got %d'
                   % (fn, first_filename, num_rows, len(d)))
            raise Exception(msg)

    # sanity check, check key values same in same rows
    for (fn, _, d) in file_data[1:]:
        if [kv[0] for kv in d] != first_key_values:
            msg = ('Key values differ between files %s and %s!?'
                   % (fn, first_filename))
            raise Exception(msg)

    # one list of data values per input file, in input order
    value_columns = [[kv[1] for kv in d] for (_, _, d) in file_data]

    # inputs are consistent, now it is safe to create the output file
    with open_csv(output_file, 'w') as out_fd:
        out_csv = csv.writer(out_fd)

        # write header row: key column title, then one title per file
        out_csv.writerow([key_col] + [title for (_, title, _) in file_data])

        # write data rows: key value, then that row's value from each file
        for data_row in zip(first_key_values, *value_columns):
            out_csv.writerow(data_row)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: