#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# python 2.7
#
# After an AIP site export/import, the internal collection/community/item/bitstream
# IDs change. This script rewrites the old IDs in Solr statistics events to the new
# ones; matching is done by handle (communities, collections, items) or by bitstream
# checksum (bitstreams).
#
# input:  handle-old.csv     (from old DB, table 'handle')
# input:  handle-new.csv     (from new DB, table 'handle')
# input:  bitstream-old.csv  (from old DB, table 'bitstream')
# input:  bitstream-new.csv  (from new DB, table 'bitstream')
# input:  solr-in.csv        (solr statistics exported to csv using [dspace]/bin/dspace solr-export-statistics)
# output: solr-out.csv       (solr statistics to be imported using [dspace]/bin/dspace solr-import-statistics)

import sys
import csv

# DSpace resource type constants (values of the 'type' field in statistics events)
BITSTREAM = '0'
ITEM = '2'
COLLECTION = '3'
COMMUNITY = '4'

# old ID -> handle, per resource type
old = {}
old[ITEM] = {}
old[COLLECTION] = {}
old[COMMUNITY] = {}

# handle -> new ID, per resource type
new = {}
new[ITEM] = {}
new[COLLECTION] = {}
new[COMMUNITY] = {}

old_bitstreams = {}  # old bitstream_id -> checksum
new_bitstreams = {}  # checksum -> new bitstream_id


def load_old_ids(old_handles_csv):
    """load mapping old ID -> handle (for collections, communities, items)"""
    for row in old_handles_csv:
        res_type = row['resource_type_id']
        res_id = row['resource_id']
        old[res_type][res_id] = row['handle']


def load_new_ids(new_handles_csv):
    """load mapping handle -> new ID (for collections, communities, items)"""
    for row in new_handles_csv:
        res_type = row['resource_type_id']
        res_handle = row['handle']
        new[res_type][res_handle] = row['resource_id']


def load_old_bitstreams(old_bitstreams_csv):
    """load mapping old bitstream_id -> checksum"""
    for row in old_bitstreams_csv:
        bitstream_id = row['bitstream_id']
        old_bitstreams[bitstream_id] = row['checksum']


def load_new_bitstreams(new_bitstreams_csv):
    """load mapping checksum -> new bitstream_id"""
    for row in new_bitstreams_csv:
        checksum = row['checksum']
        new_bitstreams[checksum] = row['bitstream_id']


def old2new_bitstream(old_id):
    """lookup new bitstream ID from old bitstream ID via the checksum"""
    checksum = old_bitstreams[old_id]
    new_id = new_bitstreams[checksum]
    return new_id


def old2new(old_id, resource_type_id):
    """lookup new ID from old ID (handles comma-separated multi-value fields)"""
    # print "type: %s; id: %s" % (resource_type_id, old_id)
    if old_id == '':
        return ''
    if resource_type_id == BITSTREAM:
        return old2new_bitstream(old_id)
    parts = old_id.split(',')
    if len(parts) != 1:
        # multi-value field, e.g. an item owned by several communities
        new_ids = []
        for part in parts:
            new_ids.append(old2new(part, resource_type_id))
        return ','.join(new_ids)
    else:
        new_id = old_id  # fallback
        try:
            handle = old[resource_type_id][old_id]
        except KeyError:
            print "KeyError during old lookup: old_id not found; type: %s; old ID: %s" % (resource_type_id, old_id)
            raise
        try:
            new_id = new[resource_type_id][handle]
        except KeyError:
            print "KeyError during new lookup: handle not found in new; type: %s; old ID: %s; handle: %s" % (resource_type_id, old_id, handle)
            raise
        return new_id
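# ---------------------------------------------------------------------------
# Example workflow (illustrative sketch only: the database names, the script
# file name and the psql export commands below are assumptions, not part of
# the original script; the DSpace commands are the ones named in the header):
#
#   # export the lookup tables from the old and new databases, with a header row
#   psql -d dspace_old -c "\copy handle TO 'handle-old.csv' WITH CSV HEADER"
#   psql -d dspace_new -c "\copy handle TO 'handle-new.csv' WITH CSV HEADER"
#   psql -d dspace_old -c "\copy bitstream TO 'bitstream-old.csv' WITH CSV HEADER"
#   psql -d dspace_new -c "\copy bitstream TO 'bitstream-new.csv' WITH CSV HEADER"
#
#   # export the statistics core, rewrite the IDs, then re-import
#   [dspace]/bin/dspace solr-export-statistics
#   python fix-statistics-ids.py handle-old.csv handle-new.csv \
#       bitstream-old.csv bitstream-new.csv solr-in.csv solr-out.csv
#   [dspace]/bin/dspace solr-import-statistics
# ---------------------------------------------------------------------------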
if __name__ == "__main__":
    if len(sys.argv) != 7:
        print "Usage: %s handle-old.csv handle-new.csv bitstream-old.csv bitstream-new.csv solr-in.csv solr-out.csv" % sys.argv[0]
        sys.exit(1)

    old_handles_file = open(sys.argv[1], 'r')
    new_handles_file = open(sys.argv[2], 'r')
    old_bitstreams_file = open(sys.argv[3], 'r')
    new_bitstreams_file = open(sys.argv[4], 'r')
    solr_in_file = open(sys.argv[5], 'r')
    solr_out_file = open(sys.argv[6], 'w')

    csv.register_dialect('db', delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv.register_dialect('solr', delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    old_handles_csv = csv.DictReader(old_handles_file, dialect='db')
    new_handles_csv = csv.DictReader(new_handles_file, dialect='db')
    old_bitstreams_csv = csv.DictReader(old_bitstreams_file, dialect='db')
    new_bitstreams_csv = csv.DictReader(new_bitstreams_file, dialect='db')
    solr_in = csv.DictReader(solr_in_file, dialect='solr')

    load_old_ids(old_handles_csv)
    load_new_ids(new_handles_csv)
    load_old_bitstreams(old_bitstreams_csv)
    load_new_bitstreams(new_bitstreams_csv)

    headers = ['uid', 'rpp', 'userAgent', 'submitter', 'query', 'isBot', 'actor', 'type', 'owningComm',
               'city', 'id', 'previousWorkflowStep', 'time', 'scopeType', 'page', 'longitude', 'scopeId',
               'epersonid', 'workflowItemId', '_version_', 'sortOrder', 'countryCode', 'dns', 'owningColl',
               'statistics_type', 'ip', 'referrer', 'sortBy', 'continent', 'owningItem', 'latitude',
               'bundleName', 'workflowStep']

    solr_out = csv.DictWriter(solr_out_file, fieldnames=headers, dialect='solr')
    solr_out.writeheader()

    for i, row in enumerate(solr_in):
        if row['type'] != '':
            # print "row ", i
            # print "-type: %s com: %s col: %s item: %s id: %s" % (row['type'], row['owningComm'], row['owningColl'], row['owningItem'], row['id'])
            try:
                row['owningComm'] = old2new(row['owningComm'], COMMUNITY)
                row['owningColl'] = old2new(row['owningColl'], COLLECTION)
                row['owningItem'] = old2new(row['owningItem'], ITEM)
                if row['type'] == BITSTREAM:
                    row['id'] = old2new_bitstream(row['id'])
                else:
                    row['id'] = old2new(row['id'], row['type'])
            except KeyError:
                # old ID not found in the new repository; skip this event
                # (CAVEAT: also skips an item event when its old owning coll/comm doesn't exist)
                continue
            # print "+type: %s com: %s col: %s item: %s id: %s" % (row['type'], row['owningComm'], row['owningColl'], row['owningItem'], row['id'])
            solr_out.writerow(row)
        else:
            # no resource type (e.g. statistics_type:search events); write unchanged
            solr_out.writerow(row)