1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | ''' |
---|
4 | Some CSV file utility routines. |
---|
5 | ''' |
---|
6 | |
---|
7 | |
---|
8 | import csv |
---|
9 | |
---|
10 | |
---|
11 | def merge_csv_key_values(file_title_list, output_file, |
---|
12 | key_col='hours', data_col='stage'): |
---|
13 | '''Select key and value columns from 'N' CSV files, write one CSV file. |
---|
14 | |
---|
15 | file_title_list: a list of (filename, new_data_column_title) values, one |
---|
16 | for each input file |
---|
17 | output_file: the output CSV file path |
---|
18 | key_col: column header string of key column |
---|
19 | data_col: column header string of data column |
---|
20 | |
---|
21 | The output file will look like: |
---|
22 | <key_col>, <new_data_column_title1>, <new_data_column_title2>, ... |
---|
23 | <key_value>, <data1>, <data2>, ... |
---|
24 | <key_value>, <data1>, <data2>, ... |
---|
25 | <key_value>, <data1>, <data2>, ... |
---|
26 | <key_value>, <data1>, <data2>, ... |
---|
27 | |
---|
28 | There is an assumption that the <key_value> values are the same across |
---|
29 | all files for the same row. This is tested in the code below. |
---|
30 | ''' |
---|
31 | |
---|
32 | def read_csv_file(filename, key_col, data_col): |
---|
33 | '''Read data from a CSV file, get 'key_col' and 'data_col' columns. |
---|
34 | |
---|
35 | Returns ((key[0], data[0]), ...). |
---|
36 | ''' |
---|
37 | |
---|
38 | # start reading the CSV file |
---|
39 | data = [] |
---|
40 | fd = open(filename, 'rb') |
---|
41 | csv_reader = csv.reader(fd) |
---|
42 | |
---|
43 | # open file, get header row, calculate required column indices |
---|
44 | h = csv_reader.next() |
---|
45 | header = [x.strip() for x in h] |
---|
46 | if key_col not in header: |
---|
47 | msg = ("Column '%s' not in file %s" |
---|
48 | % (key_col, filename)) |
---|
49 | raise Exception, msg |
---|
50 | if data_col not in header: |
---|
51 | msg = ("Column '%s' not in file %s" |
---|
52 | % (data_col, filename)) |
---|
53 | raise Exception, msg |
---|
54 | |
---|
55 | key_index = header.index(key_col) |
---|
56 | data_index = header.index(data_col) |
---|
57 | |
---|
58 | # read data, extract columns, save |
---|
59 | result = [] |
---|
60 | for line in csv_reader: |
---|
61 | key_data = line[key_index].strip() |
---|
62 | data_data = line[data_index].strip() |
---|
63 | result.append((key_data, data_data)) |
---|
64 | |
---|
65 | fd.close() |
---|
66 | |
---|
67 | return result |
---|
68 | |
---|
69 | # get number of input files, check we have 1 or more |
---|
70 | num_files = len(file_title_list) |
---|
71 | if num_files == 0: |
---|
72 | msg = "List 'file_title_list' is empty!?" |
---|
73 | raise Exception, msg |
---|
74 | |
---|
75 | # read data from all files |
---|
76 | file_data = [] |
---|
77 | for (filename, title) in file_title_list: |
---|
78 | data = read_csv_file(filename, key_col, data_col) |
---|
79 | file_data.append((filename, title, data)) |
---|
80 | |
---|
81 | # now, file_data -> [(filename, title, [(k,v), (k,v), ...], ...] |
---|
82 | # sanity check, check num rows same in all files |
---|
83 | num_rows = None |
---|
84 | for (fn, t, d) in file_data: |
---|
85 | if num_rows is None: |
---|
86 | num_rows = len(d) |
---|
87 | else: |
---|
88 | if num_rows != len(d): |
---|
89 | msg = ('File %s has different number of rows from %s, ' |
---|
90 | 'expected %d columns, got %d' |
---|
91 | % (fn, file_data[0][0], num_rows, len(d))) |
---|
92 | raise Exception, msg |
---|
93 | |
---|
94 | # sanity check, check key values same in same rows |
---|
95 | first_key_values = [v[0] for v in file_data[0][2]] |
---|
96 | for (fn, t, d) in file_data: |
---|
97 | key_values = [v[0] for v in d] |
---|
98 | if key_values != first_key_values: |
---|
99 | msg = ('Key values differ between files %s and %s!?' |
---|
100 | % (fn, file_data[0][0])) |
---|
101 | raise Exception, msg |
---|
102 | |
---|
103 | # open output file |
---|
104 | out_fd = open(output_file, 'w') |
---|
105 | out_csv = csv.writer(out_fd) |
---|
106 | |
---|
107 | # write column rows to output file |
---|
108 | header = [key_col] |
---|
109 | for (fn, col, d) in file_data: |
---|
110 | header.append(col) |
---|
111 | out_csv.writerow(header) |
---|
112 | |
---|
113 | # write data rows to output file |
---|
114 | file_kv_list = [x[2] for x in file_data] |
---|
115 | for i in xrange(num_rows): |
---|
116 | data_row = [file_kv_list[0][i][0]] |
---|
117 | for file_data in file_kv_list: |
---|
118 | data_row.append(file_data[i][1]) |
---|
119 | out_csv.writerow(data_row) |
---|
120 | |
---|
121 | out_fd.close() |
---|