[7100] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | ''' |
---|
| 4 | Some CSV file utility routines. |
---|
| 5 | ''' |
---|
| 6 | |
---|
| 7 | |
---|
| 8 | import csv |
---|
| 9 | |
---|
| 10 | |
---|
| 11 | def merge_csv_key_values(file_title_list, output_file, |
---|
| 12 | key_col='hours', data_col='stage'): |
---|
| 13 | '''Select key and value columns from 'N' CSV files, write one CSV file. |
---|
| 14 | |
---|
[7109] | 15 | file_title_list: a list of (filename, new_data_column_title) values, one |
---|
| 16 | for each input file |
---|
[7100] | 17 | output_file: the output CSV file path |
---|
| 18 | key_col: column header string of key column |
---|
[7109] | 19 | data_col: column header string of data column |
---|
[7100] | 20 | |
---|
| 21 | The output file will look like: |
---|
[7109] | 22 | <key_col>, <new_data_column_title1>, <new_data_column_title2>, ... |
---|
| 23 | <key_value>, <data1>, <data2>, ... |
---|
| 24 | <key_value>, <data1>, <data2>, ... |
---|
| 25 | <key_value>, <data1>, <data2>, ... |
---|
| 26 | <key_value>, <data1>, <data2>, ... |
---|
[7100] | 27 | |
---|
| 28 | There is an assumption that the <key_value> values are the same across |
---|
| 29 | all files for the same row. This is tested in the code below. |
---|
| 30 | ''' |
---|
| 31 | |
---|
| 32 | def read_csv_file(filename, key_col, data_col): |
---|
| 33 | '''Read data from a CSV file, get 'key_col' and 'data_col' columns. |
---|
| 34 | |
---|
| 35 | Returns ((key[0], data[0]), ...). |
---|
| 36 | ''' |
---|
| 37 | |
---|
| 38 | # start reading the CSV file |
---|
| 39 | data = [] |
---|
| 40 | fd = open(filename, 'rb') |
---|
| 41 | csv_reader = csv.reader(fd) |
---|
| 42 | |
---|
| 43 | # open file, get header row, calculate required column indices |
---|
| 44 | h = csv_reader.next() |
---|
| 45 | header = [x.strip() for x in h] |
---|
| 46 | if key_col not in header: |
---|
| 47 | msg = ("Column '%s' not in file %s" |
---|
| 48 | % (key_col, filename)) |
---|
| 49 | raise Exception, msg |
---|
| 50 | if data_col not in header: |
---|
| 51 | msg = ("Column '%s' not in file %s" |
---|
| 52 | % (data_col, filename)) |
---|
| 53 | raise Exception, msg |
---|
| 54 | |
---|
| 55 | key_index = header.index(key_col) |
---|
| 56 | data_index = header.index(data_col) |
---|
| 57 | |
---|
| 58 | # read data, extract columns, save |
---|
| 59 | result = [] |
---|
| 60 | for line in csv_reader: |
---|
| 61 | key_data = line[key_index].strip() |
---|
| 62 | data_data = line[data_index].strip() |
---|
| 63 | result.append((key_data, data_data)) |
---|
| 64 | |
---|
| 65 | fd.close() |
---|
| 66 | |
---|
| 67 | return result |
---|
| 68 | |
---|
[7125] | 69 | # get number of input files, check we have 1 or more |
---|
[7100] | 70 | num_files = len(file_title_list) |
---|
[7125] | 71 | if num_files == 0: |
---|
| 72 | msg = "List 'file_title_list' is empty!?" |
---|
| 73 | raise Exception, msg |
---|
[7100] | 74 | |
---|
| 75 | # read data from all files |
---|
| 76 | file_data = [] |
---|
| 77 | for (filename, title) in file_title_list: |
---|
| 78 | data = read_csv_file(filename, key_col, data_col) |
---|
| 79 | file_data.append((filename, title, data)) |
---|
| 80 | |
---|
[7109] | 81 | # now, file_data -> [(filename, title, [(k,v), (k,v), ...], ...] |
---|
[7100] | 82 | # sanity check, check num rows same in all files |
---|
| 83 | num_rows = None |
---|
| 84 | for (fn, t, d) in file_data: |
---|
| 85 | if num_rows is None: |
---|
| 86 | num_rows = len(d) |
---|
| 87 | else: |
---|
| 88 | if num_rows != len(d): |
---|
| 89 | msg = ('File %s has different number of rows from %s, ' |
---|
| 90 | 'expected %d columns, got %d' |
---|
| 91 | % (fn, file_data[0][0], num_rows, len(d))) |
---|
| 92 | raise Exception, msg |
---|
| 93 | |
---|
| 94 | # sanity check, check key values same in same rows |
---|
| 95 | first_key_values = [v[0] for v in file_data[0][2]] |
---|
| 96 | for (fn, t, d) in file_data: |
---|
| 97 | key_values = [v[0] for v in d] |
---|
| 98 | if key_values != first_key_values: |
---|
| 99 | msg = ('Key values differ between files %s and %s!?' |
---|
| 100 | % (fn, file_data[0][0])) |
---|
| 101 | raise Exception, msg |
---|
| 102 | |
---|
| 103 | # open output file |
---|
| 104 | out_fd = open(output_file, 'w') |
---|
| 105 | out_csv = csv.writer(out_fd) |
---|
| 106 | |
---|
| 107 | # write column rows to output file |
---|
| 108 | header = [key_col] |
---|
| 109 | for (fn, col, d) in file_data: |
---|
| 110 | header.append(col) |
---|
| 111 | out_csv.writerow(header) |
---|
| 112 | |
---|
| 113 | # write data rows to output file |
---|
| 114 | file_kv_list = [x[2] for x in file_data] |
---|
| 115 | for i in xrange(num_rows): |
---|
| 116 | data_row = [file_kv_list[0][i][0]] |
---|
| 117 | for file_data in file_kv_list: |
---|
| 118 | data_row.append(file_data[i][1]) |
---|
| 119 | out_csv.writerow(data_row) |
---|
| 120 | |
---|
| 121 | out_fd.close() |
---|