377 lines
16 KiB
Bash
Executable File
377 lines
16 KiB
Bash
Executable File
#!/bin/bash
|
|
# openrefine-batch.sh, Felix Lohmeier, v1.5, 2017-10-26
|
|
# https://github.com/felixlohmeier/openrefine-batch
|
|
|
|
# declare download URLs for OpenRefine and OpenRefine client
openrefine_URL="https://github.com/felixlohmeier/OpenRefine/releases/download/2017-10-26%2Bpr1290/openrefine-linux-2017-10-26-pr1290.tar.gz"
client_URL="https://github.com/felixlohmeier/openrefine-client/releases/download/v0.3.1/openrefine-client_0-3-1_linux-64bit"

# check system requirements
# 'command -v' is the shell builtin way to look up a program (no dependency on 'which')
JAVA="$(command -v java 2> /dev/null)"
if [ -z "$JAVA" ] ; then
  echo 1>&2 "This action requires you to have 'Java JRE' installed. You can download it for free at https://java.com"
  exit 1
fi
# wget is needed for the downloads and for polling the OpenRefine server
if ! command -v wget > /dev/null 2>&1 ; then
  echo 1>&2 "This action requires you to have 'wget' installed."
  exit 1
fi
# check if wget supports option --show-progress (since wget 1.16)
# ('--' ends grep's own options, so the pattern may start with dashes)
if wget --help | grep -q -- '--show-progress' ; then
  wget_opt="--show-progress"
else
  wget_opt=""
fi
|
|
|
|
# autoinstall OpenRefine
# (skipped when a directory "openrefine" already exists in the current directory)
if [ ! -d "openrefine" ]; then
  echo "Download OpenRefine..."
  # wget stores the archive under the last path component of the URL
  openrefine_archive="${openrefine_URL##*/}"
  # $wget_opt is deliberately unquoted: when empty it must expand to no argument at all
  if ! wget -q $wget_opt "$openrefine_URL" ; then
    echo 1>&2 "error: download of OpenRefine failed ($openrefine_URL)"
    rm -f "$openrefine_archive"
    exit 1
  fi
  echo "Install OpenRefine in subdirectory openrefine..."
  mkdir -p openrefine
  if ! tar -xzf "$openrefine_archive" -C openrefine --strip 1 --totals ; then
    echo 1>&2 "error: could not extract $openrefine_archive"
    # remove the partial installation, otherwise the next run would find the
    # directory and skip the installation although it is incomplete
    rm -r -f openrefine "$openrefine_archive"
    exit 1
  fi
  rm -f "$openrefine_archive"
  # append the headless switch as last line of refine.ini
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' openrefine/refine.ini
  # enable the (commented out) autosave setting and change its value from 60 to 1440
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' openrefine/refine.ini
  # single quotes keep the '$' literal: the launcher script is patched to use
  # $REFINE_MEMORY (instead of $REFINE_MIN_MEMORY) for the -Xms setting
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' openrefine/refine
  echo ""
fi
|
|
|
|
# autoinstall OpenRefine client
# (skipped when a directory "openrefine-client" already exists in the current directory)
if [ ! -d "openrefine-client" ]; then
  echo "Download OpenRefine client..."
  mkdir -p openrefine-client
  # $wget_opt is deliberately unquoted: when empty it must expand to no argument at all
  if ! wget -q -P openrefine-client $wget_opt "$client_URL" ; then
    echo 1>&2 "error: download of OpenRefine client failed ($client_URL)"
    # remove the empty directory, otherwise the next run would skip the download
    rm -r -f openrefine-client
    exit 1
  fi
  # file name equals the last path component of $client_URL; the rest of the
  # script calls the client by exactly this path
  chmod +x openrefine-client/openrefine-client_0-3-1_linux-64bit
  echo ""
fi
|
|
|
|
# help screen
# Prints the usage text to stdout and terminates the script with exit status 1.
# The here-document delimiter is quoted so that the text is emitted literally:
# with an unquoted delimiter the shell would remove every backslash-newline
# pair and the multi-line example would collapse into a single line.
usage() {
  cat <<'EOF'
Usage: ./openrefine-batch.sh [-a INPUTDIR] [-b TRANSFORMDIR] [-c OUTPUTDIR] ...

== basic arguments ==
-a INPUTDIR path to directory with source files (leave empty to transform only ; multiple files may be imported into a single project by providing a zip or tar.gz archive, cf. https://github.com/OpenRefine/OpenRefine/wiki/Importers )
-b TRANSFORMDIR path to directory with OpenRefine transformation rules (json files, cf. http://kb.refinepro.com/2012/06/google-refine-json-and-my-notepad-or.html ; leave empty to transform only)
-c OUTPUTDIR path to directory for exported files (and OpenRefine workspace)

== options ==
-d CROSSDIR path to directory with additional OpenRefine projects (will be copied to workspace before transformation step to support the cross function, cf. https://github.com/OpenRefine/OpenRefine/wiki/GREL-Other-Functions )
-f INPUTFORMAT (csv, tsv, xml, json, line-based, fixed-width, xlsx, ods)
-i INPUTOPTIONS several options provided by openrefine-client, see below...
-m RAM maximum RAM for OpenRefine java heap space (default: 2048M)
-p PORT PORT on which OpenRefine should listen (default: 3333)
-E do NOT export files
-R do NOT restart OpenRefine after each transformation (e.g. config file)
-X do NOT restart OpenRefine after each project (e.g. input file)
-h displays this help screen

== inputoptions (mandatory for xml, json, fixed-width, xlsx, ods) ==
-i recordPath=RECORDPATH (xml, json): please provide path in multiple arguments without slashes, e.g. /collection/record/ should be entered like this: -i recordPath=collection -i recordPath=record
-i columnWidths=COLUMNWIDTHS (fixed-width): please provide widths separated by comma (e.g. 7,5)
-i sheets=SHEETS (xlsx, ods): please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)

== more inputoptions (optional, only together with inputformat) ==
-i projectName=PROJECTNAME (all formats)
-i limit=LIMIT (all formats), default: -1
-i includeFileSources=INCLUDEFILESOURCES (all formats), default: false
-i trimStrings=TRIMSTRINGS (xml, json), default: false
-i storeEmptyStrings=STOREEMPTYSTRINGS (xml, json), default: true
-i guessCellValueTypes=GUESSCELLVALUETYPES (xml, csv, tsv, fixed-width, json), default: false
-i encoding=ENCODING (csv, tsv, line-based, fixed-width), please provide short encoding name (e.g. UTF-8)
-i ignoreLines=IGNORELINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: -1
-i headerLines=HEADERLINES (csv, tsv, fixed-width, xlsx, ods), default: 1
-i skipDataLines=SKIPDATALINES (csv, tsv, line-based, fixed-width, xlsx, ods), default: 0
-i storeBlankRows=STOREBLANKROWS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
-i processQuotes=PROCESSQUOTES (csv, tsv), default: true
-i storeBlankCellsAsNulls=STOREBLANKCELLSASNULLS (csv, tsv, line-based, fixed-width, xlsx, ods), default: true
-i linesPerRow=LINESPERROW (line-based), default: 1

== example ==

./openrefine-batch.sh \
-a examples/powerhouse-museum/input/ \
-b examples/powerhouse-museum/config/ \
-c examples/powerhouse-museum/output/ \
-f tsv \
-i processQuotes=false \
-i guessCellValueTypes=true \
-RX

clone or download GitHub repository to get example data:
https://github.com/felixlohmeier/openrefine-batch/archive/master.zip

EOF
  exit 1
}
|
|
|
|
# defaults (used unless the corresponding command line option is given)
ram="2048M"; port="3333"
restartfile="true"; restarttransform="true"
export="true"
# directories default to /dev/null
inputdir=/dev/null; configdir=/dev/null; crossdir=/dev/null

# check input
# called without any argument: show the help screen (usage ends the script)
NUMARGS=$#
[ "$NUMARGS" -ne 0 ] || usage
|
|
|
|
# get user input
# The leading ':' switches getopts to silent error reporting: only then does the
# ':' branch (missing argument) become reachable and OPTARG hold the offending
# option letter in the '\?' and ':' branches.
options=":a:b:c:d:f:i:m:p:ERXh"
while getopts "$options" opt; do
  case "$opt" in
    a )
      inputdir=$(readlink -f -- "${OPTARG}")
      if [ -n "${inputdir// }" ] ; then
        # read one name per line, so that names containing blanks or glob
        # characters end up as exactly one array element
        # NOTE(review): find descends into subdirectories but only the base
        # name is stored - confirm that nested input files are not expected
        inputfiles=()
        while IFS= read -r entry; do
          inputfiles+=("$entry")
        done < <(find -L "${inputdir}"/* -type f -printf "%f\n" 2>/dev/null)
      fi
      ;;
    b )
      configdir=$(readlink -f -- "${OPTARG}")
      if [ -n "${configdir// }" ] ; then
        jsonfiles=()
        while IFS= read -r entry; do
          jsonfiles+=("$entry")
        done < <(find -L "${configdir}"/* -type f -printf "%f\n" 2>/dev/null)
      fi
      ;;
    c )
      # readlink -m: the output directory does not need to exist yet
      outputdir=$(readlink -m -- "${OPTARG}")
      mkdir -p "${outputdir}"
      ;;
    d )
      crossdir=$(readlink -f -- "${OPTARG}")
      if [ -n "${crossdir// }" ] ; then
        # only the directories directly inside CROSSDIR
        crossprojects=()
        while IFS= read -r entry; do
          crossprojects+=("$entry")
        done < <(find -L "${crossdir}"/* -maxdepth 0 -type d -printf "%f\n" 2>/dev/null)
      fi
      ;;
    f ) format="${OPTARG}" ; inputformat="--format=${OPTARG}" ;;
    i ) inputoptions+=("--${OPTARG}") ;;
    m ) ram=${OPTARG} ;;
    p ) port=${OPTARG} ;;
    E ) export="false" ;;
    R ) restarttransform="false" ;;
    X ) restartfile="false" ;;
    h ) usage ;;
    \? ) echo 1>&2 "Unknown option: -$OPTARG"; usage; exit 1 ;;
    : ) echo 1>&2 "Missing option argument for -$OPTARG"; usage; exit 1 ;;
    # option is part of $options but has no branch above
    * ) echo 1>&2 "Unimplemented option: -$opt"; usage; exit 1 ;;
  esac
done
# drop the parsed options from the positional parameters
shift $((OPTIND - 1))
|
|
|
|
# check for mandatory options
# the output directory (option -c) is always required
if [[ -z "$outputdir" ]]; then
  echo 1>&2 "please provide path to directory for exported files (and OpenRefine workspace)"
  echo 1>&2 "example: ./openrefine-batch.sh -c output/"
  exit 1
fi
# some input formats are rejected when no input option (-i) was given at all
if [[ -z "$inputoptions" ]]; then
  case "$format" in
    xml | json )
      echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
      echo 1>&2 "please provide recordpath in multiple arguments without slashes"
      echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i recordPath=collection -i recordPath=record"
      exit 1
      ;;
    fixed-width )
      echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
      echo 1>&2 "please provide column widths separated by comma (e.g. 7,5)"
      echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i columnWidths=7,5"
      exit 1
      ;;
    xlsx | ods )
      echo 1>&2 "error: you specified the inputformat $format but did not provide mandatory input options"
      echo 1>&2 "please provide sheets separated by comma (e.g. 0,1), default: 0 (first sheet)"
      echo 1>&2 "example: ./openrefine-batch.sh ... -f $format -i sheets=0"
      exit 1
      ;;
  esac
fi
|
|
|
|
# print variables
# one line per setting, followed by an empty line
printf '%s\n' \
  "Input directory: $inputdir" \
  "Input files: ${inputfiles[*]}" \
  "Input format: $inputformat" \
  "Input options: ${inputoptions[*]}" \
  "Config directory: $configdir" \
  "Transformation rules: ${jsonfiles[*]}" \
  "Cross directory: $crossdir" \
  "Cross projects: ${crossprojects[*]}" \
  "OpenRefine heap space: $ram" \
  "OpenRefine port: $port" \
  "OpenRefine workspace: $outputdir" \
  "Export TSV to workspace: $export" \
  "restart after file: $restartfile" \
  "restart after transform: $restarttransform" \
  ""
|
|
|
|
# declare additional variables
# checkpointdate/checkpointname are parallel arrays (first entry at index 1)
# holding start time (epoch seconds) and label of every step
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Start process"
# collects the rss values reported by ps for the server process
memoryload=()

# launch server
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Launch OpenRefine"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
echo ""
openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
pid=$!
# wait until server is available
until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine" ; do
  # stop polling when the server process is gone (e.g. port already in use),
  # otherwise this loop would never end
  if ! kill -0 "${pid}" 2> /dev/null ; then
    echo 1>&2 "error: OpenRefine server could not be started"
    exit 1
  fi
  sleep 1
done
echo ""
|
|
|
|
# import all files
# (skipped when option -a was not given or no file was found)
if [ -n "$inputfiles" ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Import all files"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
  for inputfile in "${inputfiles[@]}" ; do
    echo "import ${inputfile}..."
    # run client with input command
    # the path is quoted so that directories/files with blanks work;
    # $inputformat stays unquoted: when empty it must expand to no argument
    openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -c "${inputdir}/${inputfile}" $inputformat "${inputoptions[@]}"
    # show allocated system resources
    ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
    memoryload+=($(ps --no-headers -o rss -p "${pid}"))
    echo ""
    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "save project and restart OpenRefine server..."
      kill "${pid}"
      wait
      echo ""
      openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
      pid=$!
      until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine" ; do
        # stop polling when the server process is gone, otherwise this loop
        # would never end
        if ! kill -0 "${pid}" 2> /dev/null ; then
          echo 1>&2 "error: OpenRefine server could not be restarted"
          exit 1
        fi
        sleep 1
      done
      echo ""
    fi
  done
fi
|
|
|
|
# transform and export files

# Register a new step: stores label ($1) and start time in the checkpoint
# arrays and prints the step banner (numbered with the count of earlier steps).
begin_checkpoint() {
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]=$1
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date=@"${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
}

# Stop the server process $pid, wait for it to exit and start a new instance
# with the same port, workspace and heap size. Returns as soon as the new
# instance answers on its port; ends the script if the new process dies first
# (otherwise the polling loop would never end).
restart_openrefine() {
  kill "${pid}"
  wait
  echo ""
  openrefine/refine -p "${port}" -d "${outputdir}" -m "${ram}" &
  pid=$!
  until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine" ; do
    if ! kill -0 "${pid}" 2> /dev/null ; then
      echo 1>&2 "error: OpenRefine server could not be restarted"
      exit 1
    fi
    sleep 1
  done
}

# Show the resources allocated by the server process and remember its rss
# value in the array memoryload.
record_resources() {
  ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
  memoryload+=($(ps --no-headers -o rss -p "${pid}"))
  echo ""
}

# Parse a project listing (file $1, as written by the client's -l command)
# into the parallel arrays projectids and projectnames. Per line the id is
# taken from columns 2-14 and the name from column 17 to the end of the line,
# so names containing blanks stay in one piece and aligned with their id.
read_project_list() {
  local listing=$1 row id
  projectids=()
  projectnames=()
  while IFS= read -r row || [ -n "$row" ]; do
    id="${row:1:13}"
    id="${id//[[:space:]]/}"
    # skip lines without an id (e.g. empty lines)
    [ -n "$id" ] || continue
    projectids+=("$id")
    projectnames+=("${row:16}")
  done < "$listing"
}

if [ -n "$jsonfiles" ] || [ "$export" = "true" ]; then
  begin_checkpoint "Prepare transform & export"

  # get project ids
  echo "get project ids..."
  openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -l > "${outputdir}/projects.tmp"
  read_project_list "${outputdir}/projects.tmp"
  cat "${outputdir}/projects.tmp" && rm "${outputdir:?}/projects.tmp"
  echo ""

  # provide additional OpenRefine projects for cross function
  if [ -n "$crossprojects" ]; then
    echo "provide additional projects for cross function..."
    # copy given projects to workspace
    rsync -a --exclude='*.project/history' "${crossdir}"/*.project "${outputdir}"
    # restart server to advertise copied projects
    echo "restart OpenRefine server to advertise copied projects..."
    restart_openrefine
    echo ""
  fi

  # loop for all projects
  for ((i = 0; i < ${#projectids[@]}; ++i)); do

    # apply transformation rules
    if [ -n "$jsonfiles" ]; then
      begin_checkpoint "Transform ${projectnames[i]}"
      for jsonfile in "${jsonfiles[@]}" ; do
        echo "transform ${jsonfile}..."
        # run client with apply command
        openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -f "${configdir}/${jsonfile}" "${projectids[i]}"
        # allocated system resources
        record_resources
        # restart server to clear memory
        if [ "$restarttransform" = "true" ]; then
          echo "save project and restart OpenRefine server..."
          restart_openrefine
        fi
        echo ""
      done
    fi

    # export project to workspace
    if [ "$export" = "true" ]; then
      begin_checkpoint "Export ${projectnames[i]}"
      # get filename without extension
      filename=${projectnames[i]%.*}
      echo "export to file ${filename}.tsv..."
      # run client with export command
      openrefine-client/openrefine-client_0-3-1_linux-64bit -P "${port}" -E --output="${outputdir}/${filename}.tsv" "${projectids[i]}"
      # show allocated system resources
      record_resources
    fi

    # restart server to clear memory
    if [ "$restartfile" = "true" ]; then
      echo "restart OpenRefine server..."
      restart_openrefine
    fi
    echo ""

  done

  # list output files
  if [ "$export" = "true" ]; then
    echo "output (number of lines / size in bytes):"
    wc -c -l "${outputdir}"/*.tsv
    echo ""
  fi
fi
|
|
|
|
# cleanup
echo "cleanup..."
# stop the server and wait until it has exited
kill "${pid}"
wait
rm -r -f "${outputdir:?}"/workspace*.json
# delete duplicates from copied projects
if [ -n "$crossprojects" ]; then
  for crossproject in "${crossprojects[@]}" ; do
    # ':?' aborts instead of running rm -r on the output directory itself
    # (or on "/") if one of the two values is empty
    rm -r -f "${outputdir:?}/${crossproject:?}"
  done
fi
echo ""
|
|
|
|
# calculate and print checkpoints

# Format a duration given in seconds ($1) as hh:mm:ss. The hours are computed
# arithmetically and therefore do not wrap around after 24 hours.
format_duration() {
  local seconds=$1
  printf '%02d:%02d:%02d' $((seconds / 3600)) $((seconds % 3600 / 60)) $((seconds % 60))
}

echo "=== Statistics ==="
echo ""
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="End process"
echo "starting time and run time of each step:"
# append one more timestamp as end marker, so that the loop can compute a run
# time for the last step ("End process") as well
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
for ((i = 1; i <= checkpoints; i++)); do
  # run time of a step = start of the following step - start of this step
  diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
  # dates are passed as arguments, never as part of the printf format string
  printf '%35s %s (%s)\n' "${checkpointname[$i]}" "$(date --date=@"${checkpointdate[$i]}")" "$(format_duration "$diffsec")"
done
echo ""
diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
echo "total run time: $(format_duration "$diffsec") (hh:mm:ss)"

# calculate and print memory load
# default 0 covers runs in which no measurement was recorded
max=${memoryload[0]:-0}
for n in "${memoryload[@]}" ; do
  ((n > max)) && max=$n
done
echo "highest memory load: $((max / 1024)) MB"
|