#!/usr/bin/env python
"""
Script to provide a command line interface to a Refine server.
"""
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
2011-05-27 02:20:26 +02:00
2011-05-27 02:15:38 +02:00
import optparse
import os
import sys
import time
from google . refine import refine
2017-02-01 16:51:13 +01:00
import urllib2_file
import urllib2
import urlparse
import json
class myParser(optparse.OptionParser):
    """OptionParser variant that emits its epilog verbatim.

    The stock optparse help formatter re-flows the epilog into one wrapped
    paragraph; returning it untouched keeps a multi-line epilog (such as a
    list of usage examples) laid out exactly as written.
    """

    def format_epilog(self, formatter):
        """Return the epilog unchanged, or an empty string if none is set.

        Args:
            formatter: The active optparse help formatter. Unused, accepted
                only to match the method being overridden.

        Returns:
            The epilog text as passed to the constructor, or '' when the
            parser was created without one.
        """
        # format_help() concatenates the pieces with str.join, so None would
        # raise a TypeError there; hand back an empty string instead.
        return self.epilog or ''
# Command line definition. The epilog is printed verbatim by myParser, so the
# examples below appear in --help exactly as laid out here.
PARSER = myParser(
    description='Script to provide a command line interface to an OpenRefine server.',
    usage='usage: %prog [--help | OPTIONS]',
    epilog="""
Examples:
  ./refine.py --list    # show list of Refine projects, ID: name
  ./refine.py --list -H 127.0.0.1 -P 80    # specify hostname and port
  ./refine.py --create example.csv    # create new project from file example.csv
  ./refine.py --create example.tsv --format=tsv --encoding=UTF-8
  ./refine.py --create example.xml --format=xml --recordPath=collection --recordPath=record
  ./refine.py --create example.json --format=json --recordPath=_ --recordPath=_
  ./refine.py --create example.xlsx --format=xlsx --sheets=0
  ./refine.py --create example.ods --format=ods --sheets=0
  ./refine.py --export 1234... > project.tsv    # export project 1234... in tsv format
  ./refine.py --export --output=project.xls 1234...    # export project in xls format
  ./refine.py --apply trim.json 1234...    # apply rules in trim.json to project 1234...
""")

# Where to find the server.
group1 = optparse.OptionGroup(PARSER, 'Connection options')
group1.add_option('-H', '--host', dest='host', metavar='127.0.0.1',
                  help='OpenRefine hostname (default: localhost)')
group1.add_option('-P', '--port', dest='port', metavar='3333',
                  help='OpenRefine port (default: 3333)')
PARSER.add_option_group(group1)

# What to do. --apply carries the rules file as its value; the file to
# create from, or the project id, is the single positional argument.
group2 = optparse.OptionGroup(PARSER, 'Commands')
group2.add_option('-l', '--list', dest='list', action='store_true',
                  help='List projects: refine.py -l')
group2.add_option('-c', '--create', dest='create', action='store_true',
                  help='Create project from file: refine.py -c [FILE]')
group2.add_option('-E', '--export', dest='export', action='store_true',
                  help='Export project: refine.py -E [PROJECTID]')
group2.add_option('-f', '--apply', dest='apply', metavar='file.json',
                  help='Apply JSON rules: refine.py -f [FILE] [PROJECTID]')
PARSER.add_option_group(group2)

group3 = optparse.OptionGroup(PARSER, 'Export options (optional)')
group3.add_option('-o', '--output', dest='output', metavar='file.csv',
                  help='Specify output filename and filetype. The filename ending (e.g. .csv) defines the output format (csv,tsv,xls,html)')
PARSER.add_option_group(group3)

group4 = optparse.OptionGroup(PARSER, 'Create format (mandatory for xml, json, fixed-width, xlsx, ods)')
group4.add_option('--format', dest='inputformat',
                  help='Specify input format (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)')
PARSER.add_option_group(group4)

group5 = optparse.OptionGroup(PARSER, 'Create options (mandatory for xml, json, fixed-width, xlsx, ods; only together with --format)')
# action='append' collects one path element per occurrence of --recordPath.
group5.add_option('--recordPath', dest='recordPath', action='append',
                  help='(xml, json), please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: --recordPath=collection --recordPath=record')
group5.add_option('--columnWidths', dest='columnWidths',
                  help='(fixed-width) please provide widths separated by comma (e.g. 7,5)')
group5.add_option('--sheets', dest='sheets',
                  help='(xlsx, ods), please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)')
PARSER.add_option_group(group5)

# All values below are taken as plain strings and copied into the import
# options sent to the server; the defaults named in the help texts are the
# ones create_project falls back to.
group6 = optparse.OptionGroup(PARSER, 'More create options (optional, only together with --format)')
group6.add_option('--limit', dest='limit',
                  help='(all formats), default: -1')
group6.add_option('--includeFileSources', dest='includeFileSources',
                  help='(all formats), default: false')
group6.add_option('--trimStrings', dest='trimStrings',
                  help='(xml, json), default: false')
group6.add_option('--storeEmptyStrings', dest='storeEmptyStrings',
                  help='(xml, json), default: true')
group6.add_option('--guessCellValueTypes', dest='guessCellValueTypes',
                  help='(xml, csv, tsv, fixed-width, json), default: false')
group6.add_option('--encoding', dest='encoding',
                  help='(csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)')
group6.add_option('--ignoreLines', dest='ignoreLines',
                  help='(csv, tsv, line-based, fixed-width, xlsx, ods), default: -1')
group6.add_option('--headerLines', dest='headerLines',
                  help='(csv, tsv, fixed-width, xlsx, ods), default: 1, default fixed-width: 0')
group6.add_option('--skipDataLines', dest='skipDataLines',
                  help='(csv, tsv, line-based, fixed-width, xlsx, ods), default: 0, default line-based: -1')
group6.add_option('--storeBlankRows', dest='storeBlankRows',
                  help='(csv, tsv, line-based, fixed-width, xlsx, ods), default: true')
group6.add_option('--processQuotes', dest='processQuotes',
                  help='(csv, tsv), default: true')
group6.add_option('--storeBlankCellsAsNulls',
                  dest='storeBlankCellsAsNulls',
                  help='(csv, tsv, line-based, fixed-width, xlsx, ods), default: true')
group6.add_option('--linesPerRow', dest='linesPerRow',
                  help='(line-based), default: 1')
PARSER.add_option_group(group6)
def _tree_fields(default_record_path):
    """Return the option layout shared by the xml and json importers."""
    return (
        ('recordPath', default_record_path, 'path'),
        ('limit', '-1', 'raw'),
        ('trimStrings', 'false', 'raw'),
        ('guessCellValueTypes', 'false', 'raw'),
        ('storeEmptyStrings', 'true', 'raw'),
        ('includeFileSources', 'false', 'raw'),
    )


def _separated_fields(separator):
    """Return the option layout shared by the csv and tsv importers."""
    return (
        ('encoding', '', 'text'),
        ('separator', separator, 'fixed'),
        ('ignoreLines', '-1', 'raw'),
        ('headerLines', '1', 'raw'),
        ('skipDataLines', '0', 'raw'),
        ('limit', '-1', 'raw'),
        ('storeBlankRows', 'true', 'raw'),
        ('guessCellValueTypes', 'false', 'raw'),
        ('processQuotes', 'true', 'raw'),
        ('storeBlankCellsAsNulls', 'true', 'raw'),
        ('includeFileSources', 'false', 'raw'),
    )


# Option layout shared by the xlsx and ods importers.
_SHEET_FIELDS = (
    ('sheets', '0', 'array'),
    ('ignoreLines', '-1', 'raw'),
    ('headerLines', '1', 'raw'),
    ('skipDataLines', '0', 'raw'),
    ('limit', '-1', 'raw'),
    ('storeBlankRows', 'true', 'raw'),
    ('storeBlankCellsAsNulls', 'true', 'raw'),
    ('includeFileSources', 'false', 'raw'),
)

# --format value -> (format identifier sent to the server, option layout).
# Each layout entry is (name, default, kind). The name is both the JSON key
# and the attribute read from the parsed command line; kind selects how the
# value is written (see _render_json_value). Entry order is the order of the
# keys in the generated options string.
_IMPORT_FORMATS = {
    'xml': ('text/xml', _tree_fields(['record'])),
    'csv': ('text/line-based/*sv', _separated_fields(',')),
    # Backslash followed by "t": the JSON escape for a tab character.
    'tsv': ('text/line-based/*sv', _separated_fields('\\t')),
    'line-based': ('text/line-based', (
        ('encoding', '', 'text'),
        ('linesPerRow', '1', 'raw'),
        ('ignoreLines', '-1', 'raw'),
        ('limit', '-1', 'raw'),
        ('skipDataLines', '-1', 'raw'),
        ('storeBlankRows', 'true', 'raw'),
        ('storeBlankCellsAsNulls', 'true', 'raw'),
        ('includeFileSources', 'false', 'raw'),
    )),
    'fixed-width': ('text/line-based/fixed-width', (
        ('encoding', '', 'text'),
        ('columnWidths', '', 'array'),
        ('ignoreLines', '-1', 'raw'),
        ('headerLines', '0', 'raw'),
        ('skipDataLines', '0', 'raw'),
        ('limit', '-1', 'raw'),
        ('guessCellValueTypes', 'false', 'raw'),
        ('storeBlankRows', 'true', 'raw'),
        ('storeBlankCellsAsNulls', 'true', 'raw'),
        ('includeFileSources', 'false', 'raw'),
    )),
    'json': ('text/json', _tree_fields(['_', '_'])),
    'xlsx': ('binary/text/xml/xls/xlsx', _SHEET_FIELDS),
    'ods': ('text/xml/ods', _SHEET_FIELDS),
}


def _render_json_value(kind, value):
    """Write one option value as a fragment of the JSON options string."""
    if kind in ('text', 'fixed'):
        return '"' + value + '"'
    if kind == 'array':
        # The value is already a comma separated list, e.g. "7,5".
        return '[' + value + ']'
    if kind == 'path':
        # The value is a list of path elements, one per --recordPath.
        return '["' + '","'.join(value) + '"]'
    # 'raw': numbers and booleans are copied through as typed by the user.
    return value


def _build_import_settings(options):
    """Return (format, options JSON) for the --format chosen in options."""
    spec = _IMPORT_FORMATS.get(options.inputformat)
    if spec is None:
        # No or unknown --format: send both parameters empty.
        return '', ''
    input_format, fields = spec
    parts = []
    for key, default, kind in fields:
        value = default
        if kind != 'fixed':
            # An option that was not given (or given empty) uses the default.
            value = getattr(options, key) or default
        parts.append('"' + key + '":' + _render_json_value(kind, value))
    return input_format, '{' + ','.join(parts) + '}'


def create_project(options, file_fullpath):
    """Upload a file to the Refine server as a new project.

    The server address is taken from refine.REFINE_HOST / refine.REFINE_PORT.
    The project is named after the file. On success the new project id and
    its row count are printed.

    Args:
        options: Parsed command line values; options.inputformat selects the
            importer and the matching create options are read from it.
        file_fullpath: Path of the file to upload.

    Raises:
        Exception: If the server's response URL carries no project id, or if
            the created project reports no rows.
    """
    server_url = 'http://' + refine.REFINE_HOST
    if refine.REFINE_PORT != '80':
        server_url += ':' + refine.REFINE_PORT
    input_format, input_options = _build_import_settings(options)

    file_name = os.path.split(file_fullpath)[-1]
    # Binary mode keeps non-text uploads (xlsx, ods) byte-exact on every
    # platform, and the with-block releases the handle once the request has
    # been made.
    with open(file_fullpath, 'rb') as upload:
        # NOTE(review): passing a dict with a file handle as the request body
        # presumably relies on the urllib2_file import patching urllib2 for
        # multipart uploads - confirm.
        data = {
            'project-file': {'fd': upload, 'filename': file_name},
            'project-name': file_name,
        }
        response = urllib2.urlopen(
            server_url
            + '/command/core/create-project-from-upload?format='
            + input_format + '&options='
            + input_options, data)
        response.read()

    # The project id is read from the query string of the final response URL.
    url_params = urlparse.parse_qs(urlparse.urlparse(response.geturl()).query)
    if 'project' not in url_params:
        raise Exception('Project not created')
    project_id = url_params['project'][0]
    print('New project: ' + project_id)

    # check number of rows
    rows_response = urllib2.urlopen(server_url
                                    + '/command/core/get-rows?project='
                                    + project_id
                                    + '&start=0&limit=0')
    row_info = json.loads(rows_response.read())
    total = row_info.get('total', 0)
    if total > 0:
        print('Number of rows: %s' % total)
    else:
        raise Exception('Project contains 0 rows. Please check --help for mandatory arguments for xml, json, xls and ods')
2013-10-09 20:04:24 +02:00
2011-05-27 02:15:38 +02:00
def list_projects():
    """Print the server's projects as ``ID: name``, most recently modified first."""

    def modified_epoch(entry):
        """Sort key: the entry's 'modified' JSON date time as seconds-since-epoch."""
        _, info = entry
        parsed = time.strptime(info['modified'], '%Y-%m-%dT%H:%M:%SZ')
        return time.mktime(parsed)

    server = refine.RefineServer()
    listing = refine.Refine(server).list_projects().items()
    for project_id, project_info in sorted(listing, key=modified_epoch,
                                           reverse=True):
        # Right-align the id in a 14 character column.
        print('{0:>14}: {1}'.format(project_id, project_info['name']))
2013-10-09 20:04:24 +02:00
2011-05-27 02:15:38 +02:00
def export_project(project, options):
    """Dump a project to stdout or to the file named by options.output.

    The export format defaults to 'tsv'. When options.output is set and has a
    filename extension, the lower-cased extension (without the dot) is used
    as the format instead.

    Args:
        project: Object whose export(export_format=...) returns an iterable
            of chunks to write.
        options: Parsed command line values; only options.output is read.
    """
    export_format = 'tsv'
    if options.output:
        ext = os.path.splitext(options.output)[1][1:]  # 'xls'
        if ext:
            export_format = ext.lower()
        # The with-block closes the file even if the export fails midway.
        with open(options.output, 'wb') as output:
            output.writelines(project.export(export_format=export_format))
    else:
        # stdout belongs to the process, not to this function: write to it
        # but leave it open for whatever runs afterwards.
        sys.stdout.writelines(project.export(export_format=export_format))
#noinspection PyPep8Naming
def main():
    """Parse the command line and run the requested commands.

    Server host and port are taken, in increasing priority, from the docker
    link environment variables and from the --host / --port options.
    --list needs no positional argument; every other command expects exactly
    one (the file to create from, or the project id). If that is not the
    case the usage line is printed and nothing else is done.

    Returns:
        The project object used by --apply / --export, or None when neither
        of them ran.
    """
    # get environment variables in docker network
    docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
    if docker_host:
        os.environ['OPENREFINE_HOST'] = docker_host
        refine.REFINE_HOST = docker_host
    docker_port = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_PORT')
    if docker_port:
        # The port goes into its own variable so the host set above is not
        # overwritten.
        # NOTE(review): OPENREFINE_PORT is presumably the name the refine
        # library reads - confirm.
        os.environ['OPENREFINE_PORT'] = docker_port
        refine.REFINE_PORT = docker_port

    options, args = PARSER.parse_args()
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    if options.list:
        list_projects()
    elif len(args) != 1:
        PARSER.print_usage()
        return None
    if not args:
        # Only --list had something to do; the remaining commands all need
        # the positional argument.
        return None

    project = None
    if options.create:
        create_project(options, args[0])
    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            sys.stderr.write('Failed to apply %s: %s\n'
                             % (options.apply, response))
    if options.export:
        project = refine.RefineProject(args[0])
        export_project(project, options)
    return project
2011-05-27 02:20:26 +02:00
2011-05-27 02:15:38 +02:00
if __name__ == '__main__':
    # Keep main()'s result in a module-level name so that a session started
    # with "python -i refine.py" can keep working with the project.
    refine_project = main()