refactored and extended CLI
This commit is contained in:
parent
31f06b35c4
commit
947c7510a6
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Client library to communicate with a Refine server.
|
||||
Client library to communicate with an OpenRefine server.
|
||||
"""
|
||||
|
||||
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
|
||||
|
@ -147,118 +147,27 @@ class Refine:
|
|||
"""Open a Refine project."""
|
||||
return RefineProject(self.server, project_id)
|
||||
|
||||
# These aren't used yet but are included for reference
|
||||
new_project_defaults = {
|
||||
'text/line-based/*sv': {
|
||||
'encoding': '',
|
||||
'separator': ',',
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 1,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'store_blank_rows': True,
|
||||
'guess_cell_value_types': True,
|
||||
'process_quotes': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'text/line-based': {
|
||||
'encoding': '',
|
||||
'lines_per_row': 1,
|
||||
'ignore_lines': -1,
|
||||
'limit': -1,
|
||||
'skip_data_lines': -1,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'text/line-based/fixed-width': {
|
||||
'encoding': '',
|
||||
'column_widths': [20],
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 0,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'guess_cell_value_types': False,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'text/line-based/pc-axis': {
|
||||
'encoding': '',
|
||||
'limit': -1,
|
||||
'skip_data_lines': -1,
|
||||
'include_file_sources': False},
|
||||
'text/rdf+n3': {'encoding': ''},
|
||||
'text/xml/ods': {
|
||||
'sheets': [],
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 1,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False},
|
||||
'binary/xls': {
|
||||
'xml_based': False,
|
||||
'sheets': [],
|
||||
'ignore_lines': -1,
|
||||
'header_lines': 1,
|
||||
'skip_data_lines': 0,
|
||||
'limit': -1,
|
||||
'store_blank_rows': True,
|
||||
'store_blank_cells_as_nulls': True,
|
||||
'include_file_sources': False}
|
||||
}
|
||||
|
||||
def new_project(self, project_file=None, project_url=None, project_name=None, project_format='text/line-based/*sv',
|
||||
encoding='',
|
||||
separator=',',
|
||||
ignore_lines=-1,
|
||||
header_lines=1,
|
||||
skip_data_lines=0,
|
||||
def new_project(self,
|
||||
project_file=None,
|
||||
project_name=None,
|
||||
project_format='',
|
||||
guessCellValueTypes=False,
|
||||
headerLines=1,
|
||||
ignoreLines=-1,
|
||||
includeFileSources=False,
|
||||
limit=-1,
|
||||
store_blank_rows=True,
|
||||
guess_cell_value_types=True,
|
||||
process_quotes=True,
|
||||
store_blank_cells_as_nulls=True,
|
||||
include_file_sources=False,
|
||||
linesPerRow=1,
|
||||
processQuotes=True,
|
||||
skipDataLines=0,
|
||||
storeBlankCellsAsNulls=True,
|
||||
storeBlankRows=True,
|
||||
storeEmptyStrings=True,
|
||||
trimStrings=False,
|
||||
**opts):
|
||||
|
||||
if (project_file and project_url) or (not project_file and not project_url):
|
||||
raise ValueError('One (only) of project_file and project_url must be set')
|
||||
|
||||
def s(opt):
|
||||
if isinstance(opt, bool):
|
||||
return 'true' if opt else 'false'
|
||||
if opt is None:
|
||||
return ''
|
||||
return str(opt)
|
||||
|
||||
# the new APIs requires a json in the 'option' POST or GET argument
|
||||
# POST is broken at the moment, so we send it in the URL
|
||||
new_style_options = dict(opts, **{
|
||||
'encoding': s(encoding),
|
||||
})
|
||||
params = {
|
||||
'options': json.dumps(new_style_options),
|
||||
}
|
||||
|
||||
# old style options
|
||||
options = {
|
||||
'format': project_format,
|
||||
'separator': s(separator),
|
||||
'ignore-lines': s(ignore_lines),
|
||||
'header-lines': s(header_lines),
|
||||
'skip-data-lines': s(skip_data_lines),
|
||||
'limit': s(limit),
|
||||
'guess-value-type': s(guess_cell_value_types),
|
||||
'process-quotes': s(process_quotes),
|
||||
'store-blank-rows': s(store_blank_rows),
|
||||
'store-blank-cells-as-nulls': s(store_blank_cells_as_nulls),
|
||||
'include-file-sources': s(include_file_sources),
|
||||
}
|
||||
|
||||
if project_url is not None:
|
||||
options['url'] = project_url
|
||||
elif project_file is not None:
|
||||
# options
|
||||
options = { 'format': project_format }
|
||||
if project_file is not None:
|
||||
options['project-file'] = {
|
||||
'fd': open(project_file),
|
||||
'filename': project_file,
|
||||
|
@ -268,6 +177,12 @@ class Refine:
|
|||
project_name = (project_file or 'New project').rsplit('.', 1)[0]
|
||||
project_name = os.path.basename(project_name)
|
||||
options['project-name'] = project_name
|
||||
|
||||
# params (the API requires a JSON object in the 'options' POST argument)
|
||||
new_style_options = dict(opts)
|
||||
params = { 'options': json.dumps(new_style_options) }
|
||||
|
||||
# submit
|
||||
response = self.server.urlopen(
|
||||
'create-project-from-upload', options, params
|
||||
)
|
||||
|
@ -276,11 +191,17 @@ class Refine:
|
|||
urlparse.urlparse(response.geturl()).query)
|
||||
if 'project' in url_params:
|
||||
project_id = url_params['project'][0]
|
||||
# check number of rows
|
||||
rows = RefineProject(RefineServer(),project_id).do_json('get-rows')['total']
|
||||
if rows > 0:
|
||||
print('{0}: {1}'.format('id', project_id))
|
||||
print('{0}: {1}'.format('rows', rows))
|
||||
return RefineProject(self.server, project_id)
|
||||
else:
|
||||
raise Exception('Project contains 0 rows. Please check --help for mandatory arguments for xml, json, xlsx and ods')
|
||||
else:
|
||||
raise Exception('Project not created')
|
||||
|
||||
|
||||
def RowsResponseFactory(column_index):
|
||||
"""Factory for parsing the output from get_rows().
|
||||
|
||||
|
|
202
refine.py
202
refine.py
|
@ -1,13 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Script to provide a command line interface to a Refine server.
|
||||
|
||||
Examples,
|
||||
|
||||
refine --list # show list of Refine projects, ID: name
|
||||
refine --export 1234... > project.tsv
|
||||
refine --export --output=project.xls 1234...
|
||||
refine --apply trim.json 1234...
|
||||
Script to provide a command line interface to an OpenRefine server.
|
||||
"""
|
||||
|
||||
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
|
||||
|
@ -34,25 +27,111 @@ import time
|
|||
from google.refine import refine
|
||||
|
||||
|
||||
PARSER = optparse.OptionParser(
|
||||
usage='usage: %prog [--help | OPTIONS] [project ID/URL]')
|
||||
PARSER.add_option('-H', '--host', dest='host',
|
||||
help='OpenRefine hostname')
|
||||
PARSER.add_option('-P', '--port', dest='port',
|
||||
help='OpenRefine port')
|
||||
PARSER.add_option('-o', '--output', dest='output',
|
||||
help='Output filename')
|
||||
# Options that are more like commands
|
||||
PARSER.add_option('-l', '--list', dest='list', action='store_true',
|
||||
class myParser(optparse.OptionParser):
|
||||
|
||||
def format_epilog(self, formatter):
|
||||
return self.epilog
|
||||
|
||||
PARSER = \
|
||||
myParser(description='Script to provide a command line interface to an OpenRefine server.',
|
||||
usage='usage: %prog [--help | OPTIONS]',
|
||||
epilog="""
|
||||
Examples:
|
||||
--list # show list of projects (id: name)
|
||||
--list -H 127.0.0.1 -P 80 # specify hostname and port
|
||||
--info 2161595260364 # show metadata of project
|
||||
--info "christmas gifts"
|
||||
--create example.csv # create new project from file example.csv
|
||||
--create example.tsv --encoding=UTF-8
|
||||
--create example.xml --recordPath=collection --recordPath=record
|
||||
--create example.json --recordPath=_ --recordPath=_
|
||||
--create example.xlsx --sheets=0
|
||||
--create example.ods --sheets=0
|
||||
--apply trim.json 2161595260364 # apply rules in trim.json to project 1234...
|
||||
--apply trim.json "christmas gifts"
|
||||
--export 2161595260364 > project.tsv # export project 2161595260364 in tsv format
|
||||
--export "christmas gifts" > project.tsv
|
||||
--export --output=project.xlsx 2161595260364 # export project in xlsx format
|
||||
--export --output=project.xlsx "christmas gifts"
|
||||
--delete 2161595260364 # delete project
|
||||
--delete "christmas gifts"
|
||||
""")
|
||||
|
||||
group1 = optparse.OptionGroup(PARSER, 'Connection options')
|
||||
group1.add_option('-H', '--host', dest='host', metavar='127.0.0.1',
|
||||
help='OpenRefine hostname (default: 127.0.0.1)')
|
||||
group1.add_option('-P', '--port', dest='port', metavar='3333',
|
||||
help='OpenRefine port (default: 3333)')
|
||||
PARSER.add_option_group(group1)
|
||||
|
||||
group2 = optparse.OptionGroup(PARSER, 'Commands')
|
||||
group2.add_option('-c', '--create', dest='create', metavar='[FILE]',
|
||||
help='Create project from file. The filename ending (e.g. .csv) defines the input format (csv,tsv,xml,json,txt,xls,xlsx,ods)')
|
||||
group2.add_option('-l', '--list', dest='list', action='store_true',
|
||||
help='List projects')
|
||||
PARSER.add_option('-E', '--export', dest='export', action='store_true',
|
||||
help='Export project')
|
||||
PARSER.add_option('-f', '--apply', dest='apply',
|
||||
help='Apply a JSON commands file to a project')
|
||||
PARSER.add_option_group(group2)
|
||||
|
||||
group3 = optparse.OptionGroup(PARSER, 'Commands with argument [PROJECTID/PROJECTNAME]')
|
||||
group3.add_option('-d', '--delete', dest='delete', action='store_true',
|
||||
help='Delete project')
|
||||
group3.add_option('-f', '--apply', dest='apply', metavar='[FILE]',
|
||||
help='Apply JSON rules to OpenRefine project')
|
||||
group3.add_option('-E', '--export', dest='export', action='store_true',
|
||||
help='Export project in tsv format to stdout.')
|
||||
group3.add_option('-o', '--output', dest='output', metavar='[FILE]',
|
||||
help='Export project to file. The filename ending (e.g. .tsv) defines the output format (csv,tsv,xls,xlsx,html)')
|
||||
group3.add_option('--info', dest='info', action='store_true',
|
||||
help='show project metadata')
|
||||
PARSER.add_option_group(group3)
|
||||
|
||||
group4 = optparse.OptionGroup(PARSER, 'Create options')
|
||||
group4.add_option('--columnWidths', dest='columnWidths',
|
||||
help='(txt/fixed-width) please provide widths separated by comma (e.g. 7,5)')
|
||||
group4.add_option('--encoding', dest='encoding',
|
||||
help='(csv,tsv,txt), please provide short encoding name (e.g. UTF-8)')
|
||||
group4.add_option('--guessCellValueTypes', dest='guessCellValueTypes',
|
||||
help='(xml,csv,tsv,txt,json), default: false')
|
||||
group4.add_option('--headerLines', dest='headerLines',
|
||||
help='(csv,tsv,txt/fixed-width,xls,xlsx,ods), default: 1, default txt/fixed-width: 0')
|
||||
group4.add_option('--ignoreLines', dest='ignoreLines',
|
||||
help='(csv,tsv,txt,xls,xlsx,ods), default: -1')
|
||||
group4.add_option('--includeFileSources', dest='includeFileSources',
|
||||
help='(all formats), default: false')
|
||||
group4.add_option('--limit', dest='limit',
|
||||
help='(all formats), default: -1')
|
||||
group4.add_option('--linesPerRow', dest='linesPerRow',
|
||||
help='(txt/line-based), default: 1')
|
||||
group4.add_option('--processQuotes', dest='processQuotes',
|
||||
help='(csv,tsv), default: true')
|
||||
group4.add_option('--projectName', dest='project_name',
|
||||
help='(all formats), default: filename')
|
||||
group4.add_option('--recordPath', dest='recordPath', action='append',
|
||||
help='(xml,json), please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record, default xml: record, default json: _ _')
|
||||
group4.add_option('--separator', dest='separator',
|
||||
help='(csv,tsv), default csv: , default tsv: \\t')
|
||||
group4.add_option('--sheets', dest='sheets',
|
||||
help='(xls,xlsx,ods), please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)')
|
||||
group4.add_option('--skipDataLines', dest='skipDataLines',
|
||||
help='(csv,tsv,txt,xls,xlsx,ods), default: 0, default line-based: -1')
|
||||
group4.add_option('--storeBlankRows', dest='storeBlankRows',
|
||||
help='(csv,tsv,txt,xls,xlsx,ods), default: true')
|
||||
group4.add_option('--storeBlankCellsAsNulls',
|
||||
dest='storeBlankCellsAsNulls',
|
||||
help='(csv,tsv,txt,xls,xlsx,ods), default: true')
|
||||
group4.add_option('--storeEmptyStrings', dest='storeEmptyStrings',
|
||||
help='(xml,json), default: true')
|
||||
group4.add_option('--trimStrings', dest='trimStrings',
|
||||
help='(xml,json), default: false')
|
||||
PARSER.add_option_group(group4)
|
||||
|
||||
group5 = optparse.OptionGroup(PARSER, 'Legacy options')
|
||||
group5.add_option('--format', dest='input_format',
|
||||
help='Specify input format (csv,tsv,xml,json,line-based,fixed-width,xls,xlsx,ods)')
|
||||
PARSER.add_option_group(group5)
|
||||
|
||||
|
||||
def list_projects():
|
||||
"""Query the Refine server and list projects by ID: name."""
|
||||
"""Query the OpenRefine server and list projects by ID: name."""
|
||||
projects = refine.Refine(refine.RefineServer()).list_projects().items()
|
||||
|
||||
def date_to_epoch(json_dt):
|
||||
|
@ -62,12 +141,20 @@ def list_projects():
|
|||
for project_id, project_info in projects:
|
||||
print('{0:>14}: {1}'.format(project_id, project_info['name']))
|
||||
|
||||
def info(project):
    """Print the metadata of the project whose id equals `project`.

    Fetches the project list from the server and, for each entry whose id
    matches, prints its id, name, creation and modification values, one
    "label: value" line each. Nothing is printed when no id matches.
    """
    server = refine.RefineServer()
    listing = refine.Refine(server).list_projects().items()
    for pid, meta in listing:
        if project != pid:
            continue
        print('{0}: {1}'.format('id', pid))
        # Fields are looked up one at a time, in output order.
        for field in ('name', 'created', 'modified'):
            print('{0}: {1}'.format(field, meta[field]))
|
||||
|
||||
def export_project(project, options):
|
||||
"""Dump a project to stdout or options.output file."""
|
||||
export_format = 'tsv'
|
||||
if options.output:
|
||||
ext = os.path.splitext(options.output)[1][1:] # 'xls'
|
||||
ext = os.path.splitext(options.output)[1][1:]
|
||||
if ext:
|
||||
export_format = ext.lower()
|
||||
output = open(options.output, 'wb')
|
||||
|
@ -79,7 +166,7 @@ def export_project(project, options):
|
|||
|
||||
#noinspection PyPep8Naming
|
||||
def main():
|
||||
"""Main."""
|
||||
"""Command line interface."""
|
||||
|
||||
# get environment variables in docker network
|
||||
docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
|
||||
|
@ -92,27 +179,74 @@ def main():
|
|||
refine.REFINE_PORT = docker_port
|
||||
|
||||
options, args = PARSER.parse_args()
|
||||
commands_dict = { group2_arg.dest : getattr(options, group2_arg.dest) for group2_arg in group2.option_list }
|
||||
commands_dict.update({ group3_arg.dest : getattr(options, group3_arg.dest) for group3_arg in group3.option_list })
|
||||
commands_dict = { k: v for k, v in commands_dict.items() if v != None }
|
||||
if not commands_dict:
|
||||
PARSER.print_usage()
|
||||
return
|
||||
if args and not str.isdigit(args[0]):
|
||||
projects = refine.Refine(refine.RefineServer()).list_projects().items()
|
||||
idlist = []
|
||||
for project_id, project_info in projects:
|
||||
if args[0] == project_info['name']:
|
||||
idlist.append(str(project_id))
|
||||
if len(idlist) > 1:
|
||||
raise Exception('Found at least two projects. Please specify project by id.')
|
||||
else:
|
||||
args[0] = idlist[0]
|
||||
|
||||
if options.host:
|
||||
refine.REFINE_HOST = options.host
|
||||
if options.port:
|
||||
refine.REFINE_PORT = options.port
|
||||
|
||||
if not options.list and len(args) != 1:
|
||||
PARSER.print_usage()
|
||||
if options.list:
|
||||
list_projects()
|
||||
if args:
|
||||
project = refine.RefineProject(args[0])
|
||||
if options.create:
|
||||
# general defaults are defined in google/refine/refine.py new_project
|
||||
# additional defaults for each file type
|
||||
defaults = {}
|
||||
defaults['xml'] = { 'project_format' : 'text/xml', 'recordPath' : 'record' }
|
||||
defaults['csv'] = { 'project_format' : 'text/line-based/*sv', 'separator' : ',' }
|
||||
defaults['tsv'] = { 'project_format' : 'text/line-based/*sv', 'separator' : '\t' }
|
||||
defaults['line-based'] = { 'project_format' : 'text/line-based', 'skipDataLines' : -1 }
|
||||
defaults['fixed-width'] = { 'project_format' : 'text/line-based/fixed-width', 'headerLines' : 0 }
|
||||
defaults['json'] = { 'project_format' : 'text/json', 'recordPath' : ('_', '_') }
|
||||
defaults['xls'] = { 'project_format' : 'binary/text/xml/xls/xlsx', 'sheets' : 0 }
|
||||
defaults['xlsx'] = { 'project_format' : 'binary/text/xml/xls/xlsx', 'sheets' : 0 }
|
||||
defaults['ods'] = { 'project_format' : 'text/xml/ods', 'sheets' : 0 }
|
||||
# guess format from file extension (or legacy option --format)
|
||||
input_format = os.path.splitext(options.create)[1][1:].lower()
|
||||
if input_format == 'txt' and options.columnWidths:
|
||||
input_format = 'fixed_width'
|
||||
if input_format == 'txt' and not options.columnWidths:
|
||||
input_format = 'line_based'
|
||||
if options.input_format:
|
||||
input_format = options.input_format
|
||||
# defaults for selected format
|
||||
input_dict = defaults[input_format]
|
||||
# user input
|
||||
input_user = { group4_arg.dest : getattr(options, group4_arg.dest) for group4_arg in group4.option_list }
|
||||
input_user = { k: v for k, v in input_user.items() if v != None }
|
||||
# merge defaults with user input
|
||||
input_dict.update(input_user)
|
||||
input_dict['project_file'] = options.create
|
||||
print(input_dict)
|
||||
refine.Refine(refine.RefineServer()).new_project(**input_dict)
|
||||
if options.delete:
|
||||
refine.RefineProject(refine.RefineServer(),args[0]).delete()
|
||||
if options.apply:
|
||||
project = refine.RefineProject(args[0])
|
||||
response = project.apply_operations(options.apply)
|
||||
if response != 'ok':
|
||||
print >>sys.stderr, 'Failed to apply %s: %s' % (options.apply,
|
||||
response)
|
||||
if options.export:
|
||||
print >> sys.stderr, 'Failed to apply %s: %s' \
|
||||
% (options.apply, response)
|
||||
if options.export or options.output:
|
||||
project = refine.RefineProject(args[0])
|
||||
export_project(project, options)
|
||||
|
||||
return project
|
||||
if options.info:
|
||||
info(args[0])
|
||||
|
||||
if __name__ == '__main__':
|
||||
# return project so that it's available interactively, python -i refine.py
|
||||
|
|
Loading…
Reference in New Issue