#!/bin/bash
# openrefine-batch.sh, Felix Lohmeier, v1.4, 2017-08-02
# https://github.com/felixlohmeier/openrefine-batch

# declare download URLs for OpenRefine and OpenRefine client
# (shell assignments must not have spaces around "=")
openrefine_URL="https://github.com/felixlohmeier/OpenRefine/releases/download/2017-08-02/openrefine-linux-2017-08-02.tar.gz"
client_URL="https://github.com/felixlohmeier/openrefine-client/releases/download/v0.3.1/openrefine-client_0-3-1_linux-64bit"
# check system requirements
# JAVA holds the path of the java executable (empty if none is on PATH);
# command -v is the shell builtin replacement for the external "which"
JAVA="$(command -v java 2>/dev/null)"
if [ -z "$JAVA" ]; then
  echo 1>&2 "This action requires you to have 'Java JRE' installed. You can download it for free at https://java.com"
  exit 1
fi
# check if wget supports option --show-progress (since wget 1.16)
# wget_opt is either empty or exactly one option; the download commands
# expand it as an optional argument
if wget --help | grep -q -e '--show-progress'; then
  wget_opt="--show-progress"
else
  wget_opt=""
fi
# autoinstall OpenRefine
# Runs only if the subdirectory "openrefine" does not exist yet. The
# directory is created after a successful download (and removed again if
# unpacking fails) so that a failed attempt is retried on the next run.
if [ ! -d "openrefine" ]; then
  echo "Download OpenRefine..."
  openrefine_archive="$(basename "$openrefine_URL")"
  # $wget_opt is left unquoted on purpose: it may be empty
  if ! wget -q $wget_opt "$openrefine_URL"; then
    echo 1>&2 "error: download of $openrefine_URL failed"
    rm -f "$openrefine_archive"
    exit 1
  fi
  echo "Install OpenRefine in subdirectory openrefine..."
  mkdir -p openrefine
  if ! tar -xzf "$openrefine_archive" -C openrefine --strip-components=1 --totals; then
    echo 1>&2 "error: could not unpack $openrefine_archive"
    rm -r -f openrefine "$openrefine_archive"
    exit 1
  fi
  rm -f "$openrefine_archive"
  # append the headless option as a new last line of refine.ini
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' openrefine/refine.ini
  # enable the autosave setting and raise it from 60 to 1440
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' openrefine/refine.ini
  # make the start script use REFINE_MEMORY for -Xms as well
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' openrefine/refine
  echo ""
fi
# autoinstall OpenRefine client
# Runs only if the subdirectory "openrefine-client" does not exist yet.
# The directory is removed again when the download fails so that the
# installation is retried on the next run.
if [ ! -d "openrefine-client" ]; then
  echo "Download OpenRefine client..."
  mkdir -p openrefine-client
  # $wget_opt is left unquoted on purpose: it may be empty
  if ! wget -q -P openrefine-client $wget_opt "$client_URL"; then
    echo 1>&2 "error: download of $client_URL failed"
    rm -r -f openrefine-client
    exit 1
  fi
  chmod +x openrefine-client/openrefine-client_0-3-1_linux-64bit
  echo ""
fi
# help screen
# Prints the help text to stdout and terminates the script with status 1.
# The here-document delimiter is quoted so that the text is printed
# literally; otherwise the trailing backslashes of the example would be
# consumed as line continuations.
function usage() {
  cat <<'EOF'
Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...

== basic arguments ==
    -a INPUTDIR      path to directory with source files (leave empty to transform only; multiple files may be imported into a single project by providing a zip or tar.gz archive, cf. https://github.com/OpenRefine/OpenRefine/wiki/Importers )
    -b TRANSFORMDIR  path to directory with OpenRefine transformation rules (json files, cf. http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html ; leave empty to skip the transformation step)
    -c OUTPUTDIR     path to directory for exported files (and OpenRefine workspace)

== options ==
    -d CROSSDIR      path to directory with additional OpenRefine projects (will be copied to workspace before transformation step to support the cross function, cf. https://github.com/OpenRefine/OpenRefine/wiki/GREL-Other-Functions )
    -f INPUTFORMAT   (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
    -i INPUTOPTIONS  several options provided by openrefine-client, see below...
    -m RAM           maximum RAM for OpenRefine java heap space (default: 2048M)
    -p PORT          PORT on which OpenRefine should listen (default: 3333)
    -E               do NOT export files
    -R               do NOT restart OpenRefine after each transformation (e.g. config file)
    -X               do NOT restart OpenRefine after each project (e.g. input file)
    -h               displays this help screen

== inputoptions (mandatory for xml, json, fixed-width, xlsx, ods) ==
    -i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: -i recordPath=collection -i recordPath=record
    -i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
    -i sheets=SHEETS (xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)

== more inputoptions (optional, only together with inputformat) ==
    -i projectName=PROJECTNAME (all formats)
    -i limit=LIMIT (all formats), default: -1
    -i includeFileSources=INCLUDEFILESOURCES (all formats), default: false
    -i trimStrings=TRIMSTRINGS (xml, json), default: false
    -i storeEmptyStrings=STOREEMPTYSTRINGS (xml, json), default: true
    -i guessCellValueTypes=GUESSCELLVALUETYPES (xml, csv, tsv, fixed-width, json), default: false
    -i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
    -i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: -1
    -i headerLines=HEADERLINES (csv, tsv, fixed-width, xlsx, ods), default: 1
    -i skipDataLines=SKIPDATALINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: 0
    -i storeBlankRows=STOREBLANKROWS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
    -i processQuotes=PROCESSQUOTES (csv, tsv), default: true
    -i storeBlankCellsAsNulls=STOREBLANKCELLSASNULLS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
    -i linesPerRow=LINESPERROW (line-based), default: 1

== example ==

./openrefine-batch.sh \
-a examples/powerhouse-museum/input/ \
-b examples/powerhouse-museum/config/ \
-c examples/powerhouse-museum/output/ \
-f tsv \
-i processQuotes=false \
-i guessCellValueTypes=true \
-RX

clone or download GitHub repository to get example data:
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip

EOF
  exit 1
}
# defaults
# (overridden by the command line options -m, -p, -X, -R, -E, -a, -b, -d)
ram="2048M"
port="3333"
restartfile="true"
restarttransform="true"
# note: this is a plain variable named "export", not the export builtin
export="true"
inputdir=/dev/null
configdir=/dev/null
crossdir=/dev/null
# check input
# show the help screen (which terminates the script) when the script is
# called without any arguments
NUMARGS=$#
if [ "$NUMARGS" -eq 0 ]; then
  usage
fi
# get user input
# The leading ":" switches getopts to silent error reporting: for an
# unknown option opt is "?" and for a missing argument opt is ":", and in
# both cases OPTARG holds the offending option letter, which is what the
# error messages below print.
options=":a:b:c:d:f:i:m:p:ERXh"
while getopts "$options" opt; do
  case "$opt" in
    a)
      inputdir=$(readlink -f "${OPTARG}")
      if [ -n "${inputdir// }" ]; then
        # collect file names line by line so names with spaces stay intact
        inputfiles=()
        while IFS= read -r entry; do
          inputfiles+=("$entry")
        done < <(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)
      fi
      ;;
    b)
      configdir=$(readlink -f "${OPTARG}")
      if [ -n "${configdir// }" ]; then
        jsonfiles=()
        while IFS= read -r entry; do
          jsonfiles+=("$entry")
        done < <(find -L "${configdir}"/* -type f -printf "%f\n" 2>/dev/null)
      fi
      ;;
    c)
      # readlink -m also resolves paths that do not exist yet
      outputdir=$(readlink -m "${OPTARG}")
      mkdir -p "${outputdir}"
      ;;
    d)
      crossdir=$(readlink -f "${OPTARG}")
      if [ -n "${crossdir// }" ]; then
        # only the directories directly inside crossdir are projects
        crossprojects=()
        while IFS= read -r entry; do
          crossprojects+=("$entry")
        done < <(find -L "${crossdir}"/* -maxdepth 0 -type d -printf "%f\n" 2>/dev/null)
      fi
      ;;
    f)
      format="${OPTARG}"
      inputformat="--format=${OPTARG}"
      ;;
    i) inputoptions+=("--${OPTARG}") ;;
    m) ram=${OPTARG} ;;
    p) port=${OPTARG} ;;
    E) export="false" ;;
    R) restarttransform="false" ;;
    X) restartfile="false" ;;
    h) usage ;;
    \?)
      echo 1>&2 "Unknown option: -$OPTARG"
      usage
      exit 1
      ;;
    :)
      echo 1>&2 "Missing option argument for -$OPTARG"
      usage
      exit 1
      ;;
    *)
      echo 1>&2 "Unimplemented option: -$OPTARG"
      usage
      exit 1
      ;;
  esac
done
# drop the parsed options from the positional parameters
shift $((OPTIND - 1))
# check for mandatory options
# the output directory (-c) is always required
if [ -z "$outputdir" ]; then
  echo 1>&2 "please provide path to directory for exported files (and OpenRefine workspace)"
  echo 1>&2 "example: ./openrefine-batch.sh -c output/"
  exit 1
fi
# some input formats cannot be imported without at least one -i option
if [ "${#inputoptions[@]}" -eq 0 ]; then
  case "$format" in
    xml | json)
      echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
      echo 1>&2 "please provide recordpath in multiple arguments without slashes"
      echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i recordPath=collection -i recordPath=record"
      exit 1
      ;;
    fixed-width)
      echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
      echo 1>&2 "please provide column widths separated by comma (e.g. 7,5)"
      echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i columnWidths=7,5"
      exit 1
      ;;
    xlsx | ods)
      echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
      echo 1>&2 "please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)"
      echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i sheets=0"
      exit 1
      ;;
  esac
fi
# print variables
# summary of the effective settings before any work is done
echo "Input directory:         $inputdir"
echo "Input files:             ${inputfiles[*]}"
echo "Input format:            $inputformat"
echo "Input options:           ${inputoptions[*]}"
echo "Config directory:        $configdir"
echo "Transformation rules:    ${jsonfiles[*]}"
echo "Cross directory:         $crossdir"
echo "Cross projects:          ${crossprojects[*]}"
echo "OpenRefine heap space:   $ram"
echo "OpenRefine port:         $port"
echo "OpenRefine workspace:    $outputdir"
echo "Export TSV to workspace: $export"
echo "restart after file:      $restartfile"
echo "restart after transform: $restarttransform"
echo ""
# declare additional variables
# checkpointdate/checkpointname are parallel arrays indexed from 1: the
# start time (seconds since epoch) and the label of each processing step
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Start process"
# rss values of the server process, collected after each client call
memoryload=()

# launch server
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Launch OpenRefine"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
echo ""
# start OpenRefine in the background with the output directory as workspace
openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
pid=$!
# wait until server is available
until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine"; do
  # give up instead of waiting forever when the server process has died
  if ! kill -0 "${pid}" 2>/dev/null; then
    echo 1>&2 "error: OpenRefine server terminated unexpectedly"
    exit 1
  fi
  sleep 1
done
echo ""
# import all files
# skipped when no input files were found (-a not given or directory empty)
if [ -n "$inputfiles" ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Import all files"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
  for inputfile in "${inputfiles[@]}"; do
    echo "import ${inputfile}..."
    # run client with input command
    # (inputformat is passed only when -f was given)
    openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -c "${inputdir}/${inputfile}" ${inputformat:+"$inputformat"} "${inputoptions[@]}"
    # show allocated system resources
    ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
    memoryload+=($(ps --no-headers -o rss -p "${pid}"))
    echo ""
    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "save project and restart OpenRefine server..."
      kill "${pid}"
      wait
      echo ""
      openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
      pid=$!
      until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine"; do
        # give up instead of waiting forever when the server has died
        if ! kill -0 "${pid}" 2>/dev/null; then
          echo 1>&2 "error: OpenRefine server terminated unexpectedly"
          exit 1
        fi
        sleep 1
      done
      echo ""
    fi
  done
fi
# transform and export files

# Record a new checkpoint labelled $1 and print its header and start time.
begin_checkpoint() {
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="$1"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
}

# Print the resource usage of the server process and append its rss value
# to the memoryload array.
record_memory() {
  ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
  memoryload+=($(ps --no-headers -o rss -p "${pid}"))
}

# Stop the running server, start a new one and wait until it answers.
# Aborts the script if the new server process dies before it is reachable.
restart_openrefine() {
  kill "${pid}"
  wait
  echo ""
  openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
  pid=$!
  until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine"; do
    if ! kill -0 "${pid}" 2>/dev/null; then
      echo 1>&2 "error: OpenRefine server terminated unexpectedly"
      exit 1
    fi
    sleep 1
  done
}

# Fill the parallel arrays projectids/projectnames from the project list
# in file $1. Per line, characters 2-14 are taken as the id and everything
# from character 17 on as the name; names containing spaces stay intact.
read_project_list() {
  local listline listid
  projectids=()
  projectnames=()
  while IFS= read -r listline; do
    listid=${listline:1:13}
    listid=${listid// }
    # skip lines without an id (e.g. empty lines)
    [ -n "$listid" ] || continue
    projectids+=("$listid")
    projectnames+=("${listline:16}")
  done < "$1"
}

if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
  begin_checkpoint "Prepare transform & export"
  # get project ids
  echo "get project ids..."
  openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -l > "${outputdir}/projects.tmp"
  read_project_list "${outputdir}/projects.tmp"
  cat "${outputdir}/projects.tmp" && rm "${outputdir:?}/projects.tmp"
  echo ""
  # provide additional OpenRefine projects for cross function
  if [ -n "$crossprojects" ]; then
    echo "provide additional projects for cross function..."
    # copy given projects to workspace
    rsync -a --exclude='*.project/history' "${crossdir}"/*.project "${outputdir}"
    # restart server to advertise copied projects
    echo "restart OpenRefine server to advertise copied projects..."
    restart_openrefine
    echo ""
  fi
  # loop for all projects
  for ((i = 0; i < ${#projectids[@]}; ++i)); do
    # apply transformation rules
    if [ -n "$jsonfiles" ]; then
      begin_checkpoint "Transform ${projectnames[i]}"
      for jsonfile in "${jsonfiles[@]}"; do
        echo "transform ${jsonfile}..."
        # run client with apply command
        openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -f "${configdir}/${jsonfile}" "${projectids[i]}"
        # allocated system resources
        record_memory
        echo ""
        # restart server to clear memory
        if [ "$restarttransform" = "true" ]; then
          echo "save project and restart OpenRefine server..."
          restart_openrefine
        fi
        echo ""
      done
    fi
    # export project to workspace
    if [ "$export" = "true" ]; then
      begin_checkpoint "Export ${projectnames[i]}"
      # get filename without extension
      filename=${projectnames[i]%.*}
      echo "export to file ${filename}.tsv..."
      # run client with export command
      openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -E --output="${outputdir}/${filename}.tsv" "${projectids[i]}"
      # show allocated system resources
      record_memory
      echo ""
    fi
    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "restart OpenRefine server..."
      restart_openrefine
    fi
    echo ""
  done
  # list output files
  if [ "$export" = "true" ]; then
    echo "output (number of lines / size in bytes):"
    wc -c -l "${outputdir}"/*.tsv
    echo ""
  fi
fi
# cleanup
echo "cleanup..."
# stop the server and wait until it has terminated
kill "${pid}"
wait
# ":?" aborts instead of expanding to an empty path if a variable is unset
rm -r -f "${outputdir:?}"/workspace*.json
# delete duplicates from copied projects
if [ -n "$crossprojects" ]; then
  for crossproject in "${crossprojects[@]}"; do
    rm -r -f "${outputdir:?}/${crossproject:?}"
  done
fi
echo ""
# calculate and print checkpoints

# Print $1 (a number of seconds) as hh:mm:ss. The hours are computed
# arithmetically, so run times of 24 hours and more are not wrapped.
format_duration() {
  local secs=$1
  printf '%02d:%02d:%02d' "$((secs / 3600))" "$((secs % 3600 / 60))" "$((secs % 60))"
}

echo "=== Statistics ==="
echo ""
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="End process"
echo "starting time and run time of each step:"
checkpoints=${#checkpointdate[@]}
# extra timestamp without a name: end time for the last named checkpoint
checkpointdate[$((checkpoints + 1))]=$(date +%s)
for ((i = 1; i <= checkpoints; i++)); do
  diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
  printf '%35s %s (%s)\n' "${checkpointname[i]}" "$(date --date=@"${checkpointdate[i]}")" "$(format_duration "$diffsec")"
done
echo ""
# time between the first checkpoint and the "End process" checkpoint
diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
echo "total run time: $(format_duration "$diffsec") (hh:mm:ss)"

# calculate and print memory load

# Print the largest of the given integers, or 0 when called without
# arguments (no measurement was recorded).
highest_value() {
  local peak=0 value
  for value in "$@"; do
    if ((value > peak)); then
      peak=$value
    fi
  done
  printf '%s\n' "$peak"
}

max=$(highest_value "${memoryload[@]}")
# the result is divided by 1024 and labelled MB (ps reports rss in KiB)
echo "highest memory load: $((max / 1024)) MB"