Context Navigation

csv_tools.py @ 7244

Last change on this file since 7244 was 7125, checked in by rwilson, 16 years ago
Add another sanity check: is list of input files empty?
File size: 4.0 KB

Rev	Line
[7100]	1	#!/usr/bin/env python
	2
	3	'''
	4	Some CSV file utility routines.
	5	'''
	6
	7
	8	import csv
	9
	10
	11	def merge_csv_key_values(file_title_list, output_file,
	12	key_col='hours', data_col='stage'):
	13	'''Select key and value columns from 'N' CSV files, write one CSV file.
	14
[7109]	15	file_title_list: a list of (filename, new_data_column_title) values, one
	16	for each input file
[7100]	17	output_file: the output CSV file path
	18	key_col: column header string of key column
[7109]	19	data_col: column header string of data column
[7100]	20
	21	The output file will look like:
[7109]	22	<key_col>, <new_data_column_title1>, <new_data_column_title2>, ...
	23	<key_value>, <data1>, <data2>, ...
	24	<key_value>, <data1>, <data2>, ...
	25	<key_value>, <data1>, <data2>, ...
	26	<key_value>, <data1>, <data2>, ...
[7100]	27
	28	There is an assumption that the <key_value> values are the same across
	29	all files for the same row. This is tested in the code below.
	30	'''
	31
	32	def read_csv_file(filename, key_col, data_col):
	33	'''Read data from a CSV file, get 'key_col' and 'data_col' columns.
	34
	35	Returns ((key[0], data[0]), ...).
	36	'''
	37
	38	# start reading the CSV file
	39	data = []
	40	fd = open(filename, 'rb')
	41	csv_reader = csv.reader(fd)
	42
	43	# open file, get header row, calculate required column indices
	44	h = csv_reader.next()
	45	header = [x.strip() for x in h]
	46	if key_col not in header:
	47	msg = ("Column '%s' not in file %s"
	48	% (key_col, filename))
	49	raise Exception, msg
	50	if data_col not in header:
	51	msg = ("Column '%s' not in file %s"
	52	% (data_col, filename))
	53	raise Exception, msg
	54
	55	key_index = header.index(key_col)
	56	data_index = header.index(data_col)
	57
	58	# read data, extract columns, save
	59	result = []
	60	for line in csv_reader:
	61	key_data = line[key_index].strip()
	62	data_data = line[data_index].strip()
	63	result.append((key_data, data_data))
	64
	65	fd.close()
	66
	67	return result
	68
[7125]	69	# get number of input files, check we have 1 or more
[7100]	70	num_files = len(file_title_list)
[7125]	71	if num_files == 0:
	72	msg = "List 'file_title_list' is empty!?"
	73	raise Exception, msg
[7100]	74
	75	# read data from all files
	76	file_data = []
	77	for (filename, title) in file_title_list:
	78	data = read_csv_file(filename, key_col, data_col)
	79	file_data.append((filename, title, data))
	80
[7109]	81	# now, file_data -> [(filename, title, [(k,v), (k,v), ...], ...]
[7100]	82	# sanity check, check num rows same in all files
	83	num_rows = None
	84	for (fn, t, d) in file_data:
	85	if num_rows is None:
	86	num_rows = len(d)
	87	else:
	88	if num_rows != len(d):
	89	msg = ('File %s has different number of rows from %s, '
	90	'expected %d columns, got %d'
	91	% (fn, file_data[0][0], num_rows, len(d)))
	92	raise Exception, msg
	93
	94	# sanity check, check key values same in same rows
	95	first_key_values = [v[0] for v in file_data[0][2]]
	96	for (fn, t, d) in file_data:
	97	key_values = [v[0] for v in d]
	98	if key_values != first_key_values:
	99	msg = ('Key values differ between files %s and %s!?'
	100	% (fn, file_data[0][0]))
	101	raise Exception, msg
	102
	103	# open output file
	104	out_fd = open(output_file, 'w')
	105	out_csv = csv.writer(out_fd)
	106
	107	# write column rows to output file
	108	header = [key_col]
	109	for (fn, col, d) in file_data:
	110	header.append(col)
	111	out_csv.writerow(header)
	112
	113	# write data rows to output file
	114	file_kv_list = [x[2] for x in file_data]
	115	for i in xrange(num_rows):
	116	data_row = [file_kv_list[0][i][0]]
	117	for file_data in file_kv_list:
	118	data_row.append(file_data[i][1])
	119	out_csv.writerow(data_row)
	120
	121	out_fd.close()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/numpy/anuga/utilities/csv_tools.py @ 7244

Download in other formats: