#!/usr/bin/env python
"""
Script to provide a command line interface to a Refine server.
"""

# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.

# NOTE(review): this listing is the post-patch view of the hunks visible in
# the diff.  Regions the diff does not show are marked "[not shown]" and are
# deliberately not reconstructed.

# [not shown: remainder of the license header and the other stdlib imports;
#  the code below relies on optparse, os and sys being imported there]
import time

from google.refine import refine
import urllib2_file
import urllib
import urllib2
import urlparse

import json


class myParser(optparse.OptionParser):
    """OptionParser that prints its epilog verbatim.

    The stock implementation re-wraps the epilog text, which would destroy
    the line-per-example layout used below.
    """

    def format_epilog(self, formatter):
        return self.epilog


PARSER = myParser(
    description='Script to provide a command line interface to an OpenRefine server.',
    usage='usage: %prog [--help | OPTIONS]',
    epilog="""
Examples:
    ./refine.py --list  # show list of Refine projects, ID: name
    ./refine.py --list -H 127.0.0.1 -P 80  # specify hostname and port
    ./refine.py --create example.csv  # create new project from file example.csv
    ./refine.py --create example.tsv --format=tsv --encoding=UTF-8
    ./refine.py --create example.xml --format=xml --recordPath=collection --recordPath=record
    ./refine.py --create example.json --format=json --recordPath=_ --recordPath=_
    ./refine.py --create example.xlsx --format=xlsx --sheets=0
    ./refine.py --create example.ods --format=ods --sheets=0
    ./refine.py --export 1234... > project.tsv  # export project 1234... in tsv format
    ./refine.py --export --output=project.xls 1234...  # export project in xls format
    ./refine.py --apply trim.json 1234...  # apply rules in trim.json to project 1234...

""")

group1 = optparse.OptionGroup(PARSER, 'Connection options')
group1.add_option('-H', '--host', dest='host', metavar='127.0.0.1',
                  help='OpenRefine hostname (default: localhost)')
group1.add_option('-P', '--port', dest='port', metavar='3333',
                  help='OpenRefine port (default: 3333)')
PARSER.add_option_group(group1)

group2 = optparse.OptionGroup(PARSER, 'Commands')
group2.add_option('-l', '--list', dest='list', action='store_true',
                  help='List projects: refine.py -l')
group2.add_option('-c', '--create', dest='create', action='store_true',
                  help='Create project from file: refine.py -c [FILE]')
group2.add_option('-E', '--export', dest='export', action='store_true',
                  help='Export project: refine.py -E [PROJECTID]')
group2.add_option('-f', '--apply', dest='apply', metavar='file.json',
                  help='Apply JSON rules: refine.py -f [FILE] [PROJECTID]')
PARSER.add_option_group(group2)

group3 = optparse.OptionGroup(PARSER, 'Export options (optional)')
group3.add_option('-o', '--output', dest='output', metavar='file.csv',
                  help='Specify output filename and filetype. '
                       'The filename ending (e.g. .csv) defines the output '
                       'format (csv,tsv,xls,html)')
PARSER.add_option_group(group3)

group4 = optparse.OptionGroup(
    PARSER,
    'Create format (mandatory for xml, json, fixed-width, xlsx, ods)')
group4.add_option('--format', dest='inputformat',
                  help='Specify input format (csv, tsv, xml, json, '
                       'line-based, fixed-width, xlsx, ods)')
PARSER.add_option_group(group4)

# The group title used to read "xslx"; corrected to match the --format values.
group5 = optparse.OptionGroup(
    PARSER,
    'Create options (mandatory for xml, json, fixed-width, xlsx, ods; '
    'only together with --format)')
group5.add_option('--recordPath', dest='recordPath', action='append',
                  help='(xml, json), please provide path in multiple '
                       'arguments without slashes, e.g. /collection/record/ '
                       'should be entered like this: '
                       '--recordPath=collection --recordPath=record')
group5.add_option('--columnWidths', dest='columnWidths',
                  help='(fixed-width) please provide widths separated by '
                       'comma (e.g. 7,5)')
group5.add_option('--sheets', dest='sheets',
                  help='(xlsx, ods), please provide sheets separated by '
                       'comma (e.g. 0,1), default: 0 (first sheet)')
PARSER.add_option_group(group5)

group6 = optparse.OptionGroup(
    PARSER, 'More create options (optional, only together with --format)')
# Every option in this group stores a plain string under a dest equal to its
# own name, so the declarations are driven from a (name, help) table.
for _name, _help in (
        ('limit', '(all formats), default: -1'),
        ('includeFileSources', '(all formats), default: false'),
        ('trimStrings', '(xml, json), default: false'),
        ('storeEmptyStrings', '(xml, json), default: true'),
        ('guessCellValueTypes',
         '(xml, csv, tsv, fixed-width, json), default: false'),
        ('encoding',
         '(csv, tsv, line-based, fixed-width), please provide short '
         'encoding name (e.g. UTF-8)'),
        ('ignoreLines',
         '(csv, tsv, line-based, fixed-width, xlsx, ods), default: -1'),
        ('headerLines', '(csv, tsv, fixed-width, xlsx, ods), default: 1'),
        ('skipDataLines',
         '(csv, tsv, line-based, fixed-width, xlsx, ods), default: 0'),
        ('storeBlankRows',
         '(csv, tsv, line-based, fixed-width, xlsx, ods), default: true'),
        ('processQuotes', '(csv, tsv), default: true'),
        ('storeBlankCellsAsNulls',
         '(csv, tsv, line-based, fixed-width, xlsx, ods), default: true'),
        ('linesPerRow', '(line-based), default: 1')):
    group6.add_option('--' + _name, dest=_name, help=_help)
PARSER.add_option_group(group6)


# --- importer option tables -------------------------------------------------
# Each entry is (json key, default value, kind).  Kinds:
#   'fixed'  - constant, never read from the command line
#   'text'   - command-line string passed through unchanged
#   'path'   - list collected by optparse (action='append')
#   'list'   - comma separated command-line string, sent as a JSON array
#   'scalar' - command-line string such as 'true' or '-1', sent as JSON value

def _tree_spec(default_path):
    """Return the option table shared by the xml and json importers."""
    return [
        ('recordPath', default_path, 'path'),
        ('limit', -1, 'scalar'),
        ('trimStrings', False, 'scalar'),
        ('guessCellValueTypes', False, 'scalar'),
        ('storeEmptyStrings', True, 'scalar'),
        ('includeFileSources', False, 'scalar'),
    ]


def _separated_spec(separator):
    """Return the option table shared by the csv and tsv importers."""
    return [
        ('encoding', '', 'text'),
        ('separator', separator, 'fixed'),
        ('ignoreLines', -1, 'scalar'),
        ('headerLines', 1, 'scalar'),
        ('skipDataLines', 0, 'scalar'),
        ('limit', -1, 'scalar'),
        ('storeBlankRows', True, 'scalar'),
        ('guessCellValueTypes', False, 'scalar'),
        ('processQuotes', True, 'scalar'),
        ('storeBlankCellsAsNulls', True, 'scalar'),
        ('includeFileSources', False, 'scalar'),
    ]


_LINE_BASED_SPEC = [
    ('encoding', '', 'text'),
    ('linesPerRow', 1, 'scalar'),
    ('ignoreLines', -1, 'scalar'),
    ('limit', -1, 'scalar'),
    # NOTE(review): the --help text advertises 0 as the default here; -1 is
    # what the script has always sent, so it is kept.  Confirm which is meant.
    ('skipDataLines', -1, 'scalar'),
    ('storeBlankRows', True, 'scalar'),
    ('storeBlankCellsAsNulls', True, 'scalar'),
    ('includeFileSources', False, 'scalar'),
]

_FIXED_WIDTH_SPEC = [
    ('encoding', '', 'text'),
    ('columnWidths', [], 'list'),
    ('ignoreLines', -1, 'scalar'),
    ('headerLines', 0, 'scalar'),
    ('skipDataLines', 0, 'scalar'),
    ('limit', -1, 'scalar'),
    ('guessCellValueTypes', False, 'scalar'),
    ('storeBlankRows', True, 'scalar'),
    ('storeBlankCellsAsNulls', True, 'scalar'),
    ('includeFileSources', False, 'scalar'),
]

_SPREADSHEET_SPEC = [
    ('sheets', [0], 'list'),
    ('ignoreLines', -1, 'scalar'),
    ('headerLines', 1, 'scalar'),
    ('skipDataLines', 0, 'scalar'),
    ('limit', -1, 'scalar'),
    ('storeBlankRows', True, 'scalar'),
    ('storeBlankCellsAsNulls', True, 'scalar'),
    ('includeFileSources', False, 'scalar'),
]

# --format value -> (format identifier sent to the server, option table)
_IMPORT_FORMATS = {
    # The xml default used to be the *string* 'record'; joining a string
    # yields one path element per character, so it must be a list.
    'xml': ('text/xml', _tree_spec(['record'])),
    'csv': ('text/line-based/*sv', _separated_spec(',')),
    'tsv': ('text/line-based/*sv', _separated_spec('\t')),
    'line-based': ('text/line-based', _LINE_BASED_SPEC),
    'fixed-width': ('text/line-based/fixed-width', _FIXED_WIDTH_SPEC),
    'json': ('text/json', _tree_spec(['_', '_'])),
    'xlsx': ('binary/text/xml/xls/xlsx', _SPREADSHEET_SPEC),
    'ods': ('text/xml/ods', _SPREADSHEET_SPEC),
}


def _option_value(options, key, default, kind):
    """Return the value to send for one importer option.

    Falls back to ``default`` when the option was not given (or is empty).
    Raises ValueError when a 'list' option is not a comma separated list of
    JSON values.
    """
    if kind == 'fixed':
        return default
    raw = getattr(options, key, None)
    if not raw:
        return default
    if kind == 'list':
        return json.loads('[%s]' % raw)
    if kind == 'scalar':
        try:
            return json.loads(raw)
        except ValueError:
            # Not a JSON literal: send it as a string rather than emitting
            # syntactically broken JSON.
            return raw
    return raw


def _build_import_options(options):
    """Return ``(format identifier, options JSON)`` for ``options.inputformat``.

    Both strings are empty when no (or an unknown) --format was given, which
    is what the script sent in that case before.
    """
    if options.inputformat not in _IMPORT_FORMATS:
        return '', ''
    input_format, spec = _IMPORT_FORMATS[options.inputformat]
    # Joined by hand to keep the key order of the table in the output.
    members = [
        '%s:%s' % (json.dumps(key),
                   json.dumps(_option_value(options, key, default, kind),
                              separators=(',', ':')))
        for key, default, kind in spec
    ]
    return input_format, '{' + ','.join(members) + '}'


def _wait_until_idle(server, project_id, polling_delay=0.5):
    """Poll get-processes until the project reports no pending processes.

    NOTE(review): assumes the server answers with a JSON object holding a
    'processes' list - confirm against the OpenRefine version in use.
    """
    while True:
        response = urllib2.urlopen(server
                                   + '/command/core/get-processes?project='
                                   + project_id)
        status = json.loads(response.read())
        if not status.get('processes'):
            return
        time.sleep(polling_delay)


def create_project(options, file_fullpath):
    """Upload ``file_fullpath`` to the Refine server as a new project.

    ``options`` is the optparse result; ``options.inputformat`` selects the
    importer and the remaining create options override its defaults.  The
    project is named after the file.  Prints the new project id and its row
    count.

    Raises Exception when the server does not redirect to a project or the
    new project contains no rows, and ValueError when --columnWidths or
    --sheets is not a comma separated list.
    """
    server = 'http://' + refine.REFINE_HOST
    if refine.REFINE_PORT != '80':
        server += ':' + refine.REFINE_PORT

    input_format, input_options = _build_import_options(options)
    # The options JSON contains quotes, braces and possibly spaces, so it has
    # to be URL-encoded rather than pasted into the query string.
    query = urllib.urlencode([('format', input_format),
                              ('options', input_options)])

    file_name = os.path.split(file_fullpath)[-1]
    # Binary mode: xlsx/ods uploads are not text.  The handle is closed as
    # soon as the upload request has completed.
    with open(file_fullpath, 'rb') as upload:
        data = {
            'project-file': {'fd': upload, 'filename': file_name},
            'project-name': file_name,
        }
        response = urllib2.urlopen(
            server + '/command/core/create-project-from-upload?' + query,
            data)
        response.read()

    # The project id is taken from the query string of the final URL.
    url_params = \
        urlparse.parse_qs(urlparse.urlparse(response.geturl()).query)
    if 'project' not in url_params:
        raise Exception('Project not created')
    project_id = url_params['project'][0]
    print('New project: ' + project_id)

    # wait until project is created
    _wait_until_idle(server, project_id)

    # check number of rows
    response = urllib2.urlopen(server
                               + '/command/core/get-rows?project='
                               + project_id
                               + '&start=0&limit=0')
    total = json.loads(response.read()).get('total', 0)
    if total > 0:
        print('Number of rows: %s' % total)
    else:
        raise Exception('Project contains 0 rows. Please check --help for mandatory arguments for xml, json, xls and ods')


def list_projects():
    """Query the Refine server and list projects by ID: name."""
    # [not shown: the body of list_projects() is outside the visible hunks]


# [not shown: code between list_projects() and main()]


def main():
    # [not shown: the beginning of main(), which is outside the visible hunk]
    if not options.list and len(args) != 1:
        PARSER.print_usage()

    if options.list:
        list_projects()

    if options.create:
        file_fullpath = args[0]
        create_project(options, file_fullpath)

    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            print >> sys.stderr, 'Failed to apply %s: %s' \
                % (options.apply, response)

    if options.export:
        project = refine.RefineProject(args[0])
        export_project(project, options)
    # NOTE(review): in the visible code `project` is only bound by the
    # --apply/--export branches; unless the part of main() not shown assigns
    # it, --list or --create alone would fail here - confirm.
    return project