2019-08-04 02:21:16 +02:00
#! /usr/bin/env python
"""
2019-08-16 10:55:54 +02:00
Script to provide a command line interface to a Refine server .
2019-08-04 02:21:16 +02:00
"""
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
import optparse
from google . refine import refine
2019-08-05 10:20:09 +02:00
from google . refine import cli
2019-08-04 02:21:16 +02:00
class myParser ( optparse . OptionParser ) :
def format_epilog ( self , formatter ) :
return self . epilog
2019-08-16 10:55:54 +02:00
2019-08-04 02:21:16 +02:00
PARSER = \
2019-08-16 10:55:54 +02:00
myParser ( description = ( ' Script to provide a command line interface to an '
' OpenRefine server. ' ) ,
2019-08-04 02:21:16 +02:00
usage = ' usage: % prog [--help | OPTIONS] ' ,
epilog = """
2019-08-16 10:55:54 +02:00
Example data :
- - download " https://git.io/fj5hF " - - output = duplicates . csv
- - download " https://git.io/fj5ju " - - output = duplicates - deletion . json
Basic commands :
- - list # list all projects
2019-08-04 02:21:16 +02:00
- - list - H 127.0 .0 .1 - P 80 # specify hostname and port
2019-08-16 10:55:54 +02:00
- - create duplicates . csv # create new project from file
- - info " duplicates " # show project metadata
- - apply duplicates - deletion . json " duplicates " # apply rules in file to project
- - export " duplicates " # export project to terminal in tsv format
- - export - - output = deduped . xls " duplicates " # export project to file in xls format
- - delete " duplicates " # delete project
Some more examples :
- - info 1234567890123 # specify project by id
2019-08-04 02:21:16 +02:00
- - create example . tsv - - encoding = UTF - 8
- - create example . xml - - recordPath = collection - - recordPath = record
- - create example . json - - recordPath = _ - - recordPath = _
- - create example . xlsx - - sheets = 0
- - create example . ods - - sheets = 0
2019-08-16 10:55:54 +02:00
Example for Templating Export :
Cf . https : / / github . com / opencultureconsulting / openrefine - client #advanced-templating
2019-08-04 02:21:16 +02:00
""" )
group1 = optparse . OptionGroup ( PARSER , ' Connection options ' )
2019-08-16 10:55:54 +02:00
group1 . add_option ( ' -H ' , ' --host ' , dest = ' host ' ,
metavar = ' 127.0.0.1 ' ,
2019-08-04 02:21:16 +02:00
help = ' OpenRefine hostname (default: 127.0.0.1) ' )
2019-08-16 10:55:54 +02:00
group1 . add_option ( ' -P ' , ' --port ' , dest = ' port ' ,
metavar = ' 3333 ' ,
2019-08-04 02:21:16 +02:00
help = ' OpenRefine port (default: 3333) ' )
PARSER . add_option_group ( group1 )
group2 = optparse . OptionGroup ( PARSER , ' Commands ' )
2019-08-16 10:55:54 +02:00
group2 . add_option ( ' -c ' , ' --create ' , dest = ' create ' ,
metavar = ' [FILE] ' ,
2019-08-04 02:21:16 +02:00
help = ' Create project from file. The filename ending (e.g. .csv) defines the input format (csv,tsv,xml,json,txt,xls,xlsx,ods) ' )
2019-08-16 10:55:54 +02:00
group2 . add_option ( ' -l ' , ' --list ' , dest = ' list ' ,
action = ' store_true ' ,
2019-08-04 02:21:16 +02:00
help = ' List projects ' )
2019-08-16 10:55:54 +02:00
group2 . add_option ( ' --download ' , dest = ' download ' ,
metavar = ' [URL] ' ,
help = ' Download file from URL (e.g. example data). Combine with --output to specify a filename. ' )
2019-08-04 02:21:16 +02:00
PARSER . add_option_group ( group2 )
2019-08-16 10:55:54 +02:00
group3 = optparse . OptionGroup (
PARSER , ' Commands with argument [PROJECTID/PROJECTNAME] ' )
group3 . add_option ( ' -d ' , ' --delete ' , dest = ' delete ' ,
action = ' store_true ' ,
2019-08-04 02:21:16 +02:00
help = ' Delete project ' )
2019-08-16 10:55:54 +02:00
group3 . add_option ( ' -f ' , ' --apply ' , dest = ' apply ' ,
metavar = ' [FILE] ' ,
2019-08-04 02:21:16 +02:00
help = ' Apply JSON rules to OpenRefine project ' )
2019-08-16 10:55:54 +02:00
group3 . add_option ( ' -E ' , ' --export ' , dest = ' export ' ,
action = ' store_true ' ,
2019-08-04 02:21:16 +02:00
help = ' Export project in tsv format to stdout. ' )
2019-08-16 10:55:54 +02:00
group3 . add_option ( ' -o ' , ' --output ' , dest = ' output ' ,
metavar = ' [FILE] ' ,
2019-08-04 02:21:16 +02:00
help = ' Export project to file. The filename ending (e.g. .tsv) defines the output format (csv,tsv,xls,xlsx,html) ' )
2019-08-16 10:55:54 +02:00
group3 . add_option ( ' --template ' , dest = ' template ' ,
metavar = ' [STRING] ' ,
help = ' Export project with templating. Provide (big) text string that you enter in the *row template* textfield in the export/templating menu in the browser app) ' )
group3 . add_option ( ' --info ' , dest = ' info ' ,
action = ' store_true ' ,
2019-08-04 02:21:16 +02:00
help = ' show project metadata ' )
PARSER . add_option_group ( group3 )
2019-08-16 10:55:54 +02:00
group4 = optparse . OptionGroup ( PARSER , ' General options ' )
group4 . add_option ( ' --format ' , dest = ' file_format ' ,
help = ' Override file detection (import: csv,tsv,xml,json,line-based,fixed-width,xls,xlsx,ods; export: csv,tsv,html,xls,xlsx,ods) ' )
PARSER . add_option_group ( group4 )
group5 = optparse . OptionGroup ( PARSER , ' Create options ' )
group5 . add_option ( ' --columnWidths ' , dest = ' columnWidths ' ,
action = ' append ' ,
type = ' int ' ,
help = ' (txt/fixed-width), please provide widths in multiple arguments, e.g. --columnWidths=7 --columnWidths=5 ' )
group5 . add_option ( ' --encoding ' , dest = ' encoding ' ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv,txt), please provide short encoding name (e.g. UTF-8) ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --guessCellValueTypes ' , dest = ' guessCellValueTypes ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
2019-08-04 02:21:16 +02:00
help = ' (xml,csv,tsv,txt,json, default: false) ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --headerLines ' , dest = ' headerLines ' ,
type = " int " ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv,txt/fixed-width,xls,xlsx,ods), default: 1, default txt/fixed-width: 0 ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --ignoreLines ' , dest = ' ignoreLines ' ,
type = " int " ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv,txt,xls,xlsx,ods), default: -1 ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --includeFileSources ' , dest = ' includeFileSources ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
2019-08-04 02:21:16 +02:00
help = ' (all formats), default: false ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --limit ' , dest = ' limit ' ,
type = " int " ,
2019-08-04 02:21:16 +02:00
help = ' (all formats), default: -1 ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --linesPerRow ' , dest = ' linesPerRow ' ,
type = " int " ,
2019-08-04 02:21:16 +02:00
help = ' (txt/line-based), default: 1 ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --processQuotes ' , dest = ' processQuotes ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv), default: true ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --projectName ' , dest = ' project_name ' ,
2019-08-04 02:21:16 +02:00
help = ' (all formats), default: filename ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --projectTags ' , dest = ' projectTags ' ,
action = ' append ' ,
help = ' (all formats), please provide tags in multiple arguments, e.g. --projectTags=beta --projectTags=client1 ' )
group5 . add_option ( ' --recordPath ' , dest = ' recordPath ' ,
action = ' append ' ,
2019-08-20 04:30:50 +02:00
help = ' (xml,json), please provide path in multiple arguments, e.g. /collection/record/ should be entered: --recordPath=collection --recordPath=record, default xml: root element, default json: _ _ ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --separator ' , dest = ' separator ' ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv), default csv: , default tsv: \\ t ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --sheets ' , dest = ' sheets ' ,
action = ' append ' ,
type = " int " ,
2019-08-04 02:21:16 +02:00
help = ' (xls,xlsx,ods), please provide sheets in multiple arguments, e.g. --sheets=0 --sheets=1, default: 0 (first sheet) ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --skipDataLines ' , dest = ' skipDataLines ' ,
type = " int " ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv,txt,xls,xlsx,ods), default: 0, default line-based: -1 ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --storeBlankCellsAsNulls ' , dest = ' storeBlankCellsAsNulls ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv,txt,xls,xlsx,ods), default: true ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --storeBlankRows ' , dest = ' storeBlankRows ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
2019-08-04 02:21:16 +02:00
help = ' (csv,tsv,txt,xls,xlsx,ods), default: true ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --storeEmptyStrings ' , dest = ' storeEmptyStrings ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
2019-08-04 02:21:16 +02:00
help = ' (xml,json), default: true ' )
2019-08-16 10:55:54 +02:00
group5 . add_option ( ' --trimStrings ' , dest = ' trimStrings ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
2019-08-04 02:21:16 +02:00
help = ' (xml,json), default: false ' )
PARSER . add_option_group ( group5 )
2019-08-16 10:55:54 +02:00
group6 = optparse . OptionGroup ( PARSER , ' Templating options ' )
group6 . add_option ( ' --mode ' , dest = ' mode ' ,
metavar = ' row-based/record-based ' ,
choices = ( ' row-based ' , ' record-based ' ) ,
help = ' engine mode (default: row-based) ' )
2019-08-04 02:21:16 +02:00
group6 . add_option ( ' --prefix ' , dest = ' prefix ' ,
2019-08-16 10:55:54 +02:00
help = ' text string that you enter in the *prefix* textfield in the browser app ' )
2019-08-04 02:21:16 +02:00
group6 . add_option ( ' --rowSeparator ' , dest = ' rowSeparator ' ,
2019-08-16 10:55:54 +02:00
help = ' text string that you enter in the *row separator* textfield in the browser app ' )
2019-08-04 02:21:16 +02:00
group6 . add_option ( ' --suffix ' , dest = ' suffix ' ,
2019-08-16 10:55:54 +02:00
help = ' text string that you enter in the *suffix* textfield in the browser app ' )
group6 . add_option ( ' --filterQuery ' , dest = ' filterQuery ' ,
metavar = ' REGEX ' ,
help = ' Simple RegEx text filter on filterColumn, e.g. ^12015$ ' ) ,
group6 . add_option ( ' --filterColumn ' , dest = ' filterColumn ' ,
metavar = ' COLUMNNAME ' ,
help = ' column name for filterQuery (default: name of first column) ' )
2019-08-04 02:21:16 +02:00
group6 . add_option ( ' --facets ' , dest = ' facets ' ,
2019-08-16 10:55:54 +02:00
help = ' facets config in json format (may be extracted with browser dev tools in browser app) ' )
group6 . add_option ( ' --splitToFiles ' , dest = ' splitToFiles ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
help = ' will split each row/record into a single file; it specifies a presumably unique character series for splitting; --prefix and --suffix will be applied to all files; filename-prefix can be specified with --output (default: % Y % m %d ) ' )
group6 . add_option ( ' --suffixById ' , dest = ' suffixById ' ,
metavar = ' true/false ' , choices = ( ' true ' , ' false ' ) ,
help = ' enhancement option for --splitToFiles; will generate filename-suffix from values in key column ' )
2019-08-04 02:21:16 +02:00
PARSER . add_option_group ( group6 )
2019-08-16 10:55:54 +02:00
2019-08-04 02:21:16 +02:00
def main ( ) :
""" Command line interface. """
options , args = PARSER . parse_args ( )
2019-08-16 10:55:54 +02:00
# set environment
2019-08-04 02:21:16 +02:00
if options . host :
refine . REFINE_HOST = options . host
if options . port :
refine . REFINE_PORT = options . port
2019-08-16 10:55:54 +02:00
# get project_id
2019-08-04 02:21:16 +02:00
if args and not str . isdigit ( args [ 0 ] ) :
projects = refine . Refine ( refine . RefineServer ( ) ) . list_projects ( ) . items ( )
idlist = [ ]
for project_id , project_info in projects :
if args [ 0 ] == project_info [ ' name ' ] :
idlist . append ( str ( project_id ) )
if len ( idlist ) > 1 :
2019-08-16 10:55:54 +02:00
print ( ' Error: Found %s projects with name %s . \n '
' Please specify project by id. ' % ( len ( idlist ) , args [ 0 ] ) )
for i in idlist :
print ( ' ' )
cli . info ( i )
return
2019-08-04 02:21:16 +02:00
else :
2019-08-16 10:55:54 +02:00
try :
project_id = idlist [ 0 ]
except IndexError :
print ( ' Error: No project found with name %s . \n '
' Try command --list ' % args [ 0 ] )
return
elif args :
project_id = args [ 0 ]
2019-08-04 02:21:16 +02:00
2019-08-16 10:55:54 +02:00
# commands without args
2019-08-04 02:21:16 +02:00
if options . list :
2019-08-16 10:55:54 +02:00
cli . ls ( )
elif options . download :
cli . download ( options . download , output_file = options . output )
elif options . create :
group5_dict = { group5_arg . dest : getattr ( options , group5_arg . dest )
for group5_arg in group5 . option_list }
kwargs = { k : v for k , v in group5_dict . items ( )
if v is not None and v not in [ ' true ' , ' false ' ] }
kwargs . update ( { k : True for k , v in group5_dict . items ( )
if v == ' true ' } )
kwargs . update ( { k : False for k , v in group5_dict . items ( )
if v == ' false ' } )
if options . file_format :
kwargs . update ( { ' project_format ' : options . file_format } )
cli . create ( options . create , * * kwargs )
# commands with args
elif args and options . info :
cli . info ( project_id )
elif args and options . delete :
cli . delete ( project_id )
elif args and options . apply :
cli . apply ( project_id , options . apply )
elif args and options . template :
group6_dict = { group6_arg . dest : getattr ( options , group6_arg . dest )
for group6_arg in group6 . option_list }
kwargs = { k : v for k , v in group6_dict . items ( )
if v is not None and v not in [ ' true ' , ' false ' ] }
kwargs . update ( { k : True for k , v in group6_dict . items ( )
if v == ' true ' } )
kwargs . update ( { k : False for k , v in group6_dict . items ( )
if v == ' false ' } )
cli . templating ( project_id , options . template ,
output_file = options . output , * * kwargs )
elif args and ( options . export or options . output ) :
cli . export ( project_id , output_file = options . output ,
export_format = options . file_format )
else :
PARSER . print_usage ( )
2019-08-04 02:21:16 +02:00
if __name__ == " __main__ " :
# execute only if run as a script
main ( )