2022-03-25 11:16:02 +01:00
|
|
|
name: orcli
|
|
|
|
help: OpenRefine command-line interface written in Bash
|
|
|
|
version: 0.1.0
|
2022-04-04 23:00:37 +02:00
|
|
|
footer: https://github.com/opencultureconsulting/orcli
|
2022-03-25 11:16:02 +01:00
|
|
|
|
|
|
|
dependencies:
|
2022-12-06 12:09:29 +00:00
|
|
|
curl: https://curl.se
|
|
|
|
jq: https://github.com/stedolan/jq
|
2022-03-25 11:16:02 +01:00
|
|
|
|
|
|
|
environment_variables:
|
2022-03-25 22:02:28 +00:00
|
|
|
- name: OPENREFINE_URL
|
|
|
|
help: URL to OpenRefine server
|
|
|
|
default: "http://localhost:3333"
|
2022-03-25 11:16:02 +01:00
|
|
|
|
|
|
|
examples:
|
2022-04-20 10:27:53 +00:00
|
|
|
- orcli import csv "https://git.io/fj5hF" --projectName "duplicates"
|
2022-04-14 10:06:54 +00:00
|
|
|
- orcli list
|
2022-04-20 10:27:53 +00:00
|
|
|
- orcli info "duplicates"
|
2022-11-03 21:07:08 +00:00
|
|
|
- orcli transform "duplicates" "https://git.io/fj5ju"
|
2023-10-27 21:12:16 +00:00
|
|
|
- orcli search "duplicates" "^Ben"
|
2022-04-20 10:27:53 +00:00
|
|
|
- orcli export tsv "duplicates"
|
|
|
|
- orcli export tsv "duplicates" --output "duplicates.tsv"
|
2023-01-14 23:43:25 +00:00
|
|
|
- orcli delete "duplicates"
|
2022-10-16 21:13:59 +00:00
|
|
|
- orcli run --interactive
|
2022-04-20 10:27:53 +00:00
|
|
|
- |-
|
2022-10-16 20:36:43 +00:00
|
|
|
orcli run << EOF
|
2022-10-06 11:35:34 +00:00
|
|
|
orcli import csv "https://git.io/fj5hF" --projectName "duplicates"
|
2022-11-13 21:43:04 +00:00
|
|
|
orcli transform "duplicates" "https://git.io/fj5ju"
|
2022-10-06 11:35:34 +00:00
|
|
|
orcli export tsv "duplicates"
|
2022-10-06 11:28:47 +00:00
|
|
|
EOF
|
2022-03-25 11:16:02 +01:00
|
|
|
|
|
|
|
commands:
|
2022-10-04 21:19:18 +00:00
|
|
|
- name: completions
|
|
|
|
help: |-
|
|
|
|
Generate bash completions
|
2023-10-22 22:09:14 +00:00
|
|
|
Usage: source <(orcli completions)
|
2022-10-06 11:28:47 +00:00
|
|
|
|
2022-12-06 14:38:20 +00:00
|
|
|
- name: delete
|
|
|
|
help: delete OpenRefine project
|
|
|
|
args:
|
2022-12-13 21:20:36 +00:00
|
|
|
- &project
|
|
|
|
name: project
|
2022-12-06 14:38:20 +00:00
|
|
|
help: project name or id
|
|
|
|
required: true
|
|
|
|
flags:
|
2022-12-13 11:05:18 +00:00
|
|
|
- long: --force
|
|
|
|
short: -f
|
|
|
|
help: delete all projects with the same name
|
2022-12-13 21:20:36 +00:00
|
|
|
- &quiet
|
|
|
|
long: --quiet
|
2022-12-06 14:38:20 +00:00
|
|
|
short: -q
|
|
|
|
help: suppress log output, print errors only
|
|
|
|
examples:
|
|
|
|
- orcli delete "duplicates"
|
2023-01-14 23:43:25 +00:00
|
|
|
- orcli delete "duplicates" --force
|
2022-12-06 14:38:20 +00:00
|
|
|
- orcli delete 1234567890123
|
2023-10-22 22:09:14 +00:00
|
|
|
- for p in \$(orcli list); do orcli delete \${p:0:13}; done
|
2022-12-06 14:38:20 +00:00
|
|
|
|
2022-04-12 10:54:16 +00:00
|
|
|
- name: import
|
2022-10-25 10:41:13 +00:00
|
|
|
help: commands to create OpenRefine projects from files or URLs
|
2022-04-13 11:36:23 +00:00
|
|
|
|
|
|
|
commands:
|
2022-04-13 21:55:47 +00:00
|
|
|
- name: csv
|
2022-11-16 22:22:54 +00:00
|
|
|
help: import character-separated values (CSV)
|
2022-04-13 21:55:47 +00:00
|
|
|
args:
|
2022-12-13 21:20:36 +00:00
|
|
|
- &file
|
|
|
|
name: file
|
2022-04-13 21:55:47 +00:00
|
|
|
help: Path to one or more files or URLs. When FILE is -, read standard input.
|
|
|
|
default: "-"
|
|
|
|
repeatable: true
|
|
|
|
flags:
|
2022-12-13 21:20:36 +00:00
|
|
|
- &separator
|
|
|
|
long: --separator
|
2022-04-13 21:55:47 +00:00
|
|
|
help: character(s) that separates columns
|
|
|
|
arg: separator
|
|
|
|
default: ","
|
2022-12-13 22:40:10 +00:00
|
|
|
- &blankCellsAsStrings
|
|
|
|
long: --blankCellsAsStrings
|
|
|
|
help: store blank cells as empty strings instead of nulls
|
2023-01-09 10:38:53 +00:00
|
|
|
- &columnNames
|
|
|
|
long: --columnNames
|
2023-01-13 22:30:12 +00:00
|
|
|
help: |-
|
|
|
|
set column names (comma separated)
|
|
|
|
hint: add --ignoreLines 1 to overwrite existing header row
|
2023-01-09 10:38:53 +00:00
|
|
|
arg: columnNames
|
2023-01-13 22:30:12 +00:00
|
|
|
conflicts: [--headerLines]
|
2022-12-13 21:20:36 +00:00
|
|
|
- &encoding_import
|
|
|
|
long: --encoding
|
2022-04-13 21:55:47 +00:00
|
|
|
help: set character encoding
|
|
|
|
arg: encoding
|
2022-12-13 22:40:10 +00:00
|
|
|
- &guessCellValueTypes
|
|
|
|
long: --guessCellValueTypes
|
|
|
|
help: attempt to parse cell text into numbers
|
|
|
|
- &headerLines
|
|
|
|
long: --headerLines
|
|
|
|
help: parse x line(s) as column headers
|
|
|
|
arg: headerLines
|
|
|
|
default: "1"
|
2023-01-13 22:30:12 +00:00
|
|
|
conflicts: [--columnNames]
|
2022-12-13 22:40:10 +00:00
|
|
|
- &ignoreLines
|
|
|
|
long: --ignoreLines
|
|
|
|
help: ignore first x line(s) at beginning of file
|
|
|
|
arg: ignoreLines
|
|
|
|
default: "-1"
|
|
|
|
- &ignoreQuoteCharacter
|
|
|
|
long: --ignoreQuoteCharacter
|
|
|
|
help: do not use any quote character to enclose cells containing column separators
|
|
|
|
- &includeFileSources
|
|
|
|
long: --includeFileSources
|
|
|
|
help: add column with file source
|
|
|
|
- &includeArchiveFileName
|
|
|
|
long: --includeArchiveFileName
|
|
|
|
help: add column with archive file name
|
|
|
|
- &limit
|
|
|
|
long: --limit
|
|
|
|
help: load at most x row(s) of data
|
|
|
|
arg: limit
|
|
|
|
default: "-1"
|
|
|
|
- &quoteCharacter
|
|
|
|
long: --quoteCharacter
|
|
|
|
help: quote character to enclose cells containing column separators
|
|
|
|
arg: quoteCharacter
|
|
|
|
default: '\\\"'
|
|
|
|
- &skipBlankRows
|
|
|
|
long: --skipBlankRows
|
|
|
|
help: do not store blank rows
|
|
|
|
- &skipDataLines
|
|
|
|
long: --skipDataLines
|
|
|
|
help: discard initial x row(s) of data
|
|
|
|
arg: skipDataLines
|
|
|
|
default: "0"
|
2022-12-13 21:20:36 +00:00
|
|
|
- &trimStrings
|
|
|
|
long: --trimStrings
|
2022-04-13 21:55:47 +00:00
|
|
|
help: trim leading & trailing whitespace from strings
|
2022-12-13 21:20:36 +00:00
|
|
|
- &projectName
|
|
|
|
long: --projectName
|
2022-04-13 21:55:47 +00:00
|
|
|
arg: projectName
|
|
|
|
help: set a name for the OpenRefine project
|
2023-01-09 10:38:53 +00:00
|
|
|
- &projectTags
|
|
|
|
long: --projectTags
|
|
|
|
help: set project tags (comma separated)
|
|
|
|
arg: projectTags
|
2022-12-13 21:20:36 +00:00
|
|
|
- *quiet
|
2022-04-13 21:55:47 +00:00
|
|
|
examples:
|
2022-04-20 10:27:53 +00:00
|
|
|
- orcli import csv "file"
|
|
|
|
- orcli import csv "file1" "file2"
|
2022-12-13 21:20:36 +00:00
|
|
|
- head -n 100 "file" | orcli import csv
|
2022-04-20 10:27:53 +00:00
|
|
|
- orcli import csv "https://git.io/fj5hF"
|
|
|
|
- |-
|
|
|
|
orcli import csv "file" \\\\
|
|
|
|
--separator ";" \\\\
|
2023-01-14 23:43:25 +00:00
|
|
|
--columnNames "foo,bar,baz" \\\\
|
|
|
|
--ignoreLines 1 \\\\
|
2022-04-20 10:27:53 +00:00
|
|
|
--encoding "ISO-8859-1" \\\\
|
2023-01-14 23:43:25 +00:00
|
|
|
--limit 100 \\\\
|
2022-04-20 10:27:53 +00:00
|
|
|
--trimStrings \\\\
|
2023-10-22 22:09:14 +00:00
|
|
|
--projectName "duplicates" \\\\
|
2023-01-14 23:43:25 +00:00
|
|
|
--projectTags "test,urgent"
|
2022-04-12 10:54:16 +00:00
|
|
|
|
2022-11-16 22:22:54 +00:00
|
|
|
- name: tsv
|
|
|
|
help: import tab-separated values (TSV)
|
|
|
|
args:
|
2022-12-13 21:20:36 +00:00
|
|
|
- *file
|
2022-11-16 22:22:54 +00:00
|
|
|
flags:
|
2023-01-09 10:38:53 +00:00
|
|
|
- *blankCellsAsStrings
|
|
|
|
- *columnNames
|
2022-12-13 21:20:36 +00:00
|
|
|
- *encoding_import
|
2023-01-09 10:38:53 +00:00
|
|
|
- *guessCellValueTypes
|
|
|
|
- *headerLines
|
|
|
|
- *ignoreLines
|
|
|
|
- *ignoreQuoteCharacter
|
|
|
|
- *includeFileSources
|
|
|
|
- *includeArchiveFileName
|
|
|
|
- *limit
|
|
|
|
- *quoteCharacter
|
|
|
|
- *skipBlankRows
|
|
|
|
- *skipDataLines
|
2022-12-13 21:20:36 +00:00
|
|
|
- *trimStrings
|
|
|
|
- *projectName
|
2023-01-09 10:38:53 +00:00
|
|
|
- *projectTags
|
2022-12-13 21:20:36 +00:00
|
|
|
- *quiet
|
2022-11-16 22:22:54 +00:00
|
|
|
examples:
|
|
|
|
- orcli import tsv "file"
|
|
|
|
- orcli import tsv "file1" "file2"
|
2022-12-13 21:20:36 +00:00
|
|
|
- head -n 100 "file" | orcli import tsv
|
2023-01-27 18:04:27 +01:00
|
|
|
- orcli import tsv "https://example.com/file.tsv"
|
2022-11-16 22:22:54 +00:00
|
|
|
- |-
|
|
|
|
orcli import tsv "file" \\\\
|
|
|
|
--separator ";" \\\\
|
2023-01-14 23:43:25 +00:00
|
|
|
--columnNames "foo,bar,baz" \\\\
|
|
|
|
--ignoreLines 1 \\\\
|
2022-11-16 22:22:54 +00:00
|
|
|
--encoding "ISO-8859-1" \\\\
|
2023-01-14 23:43:25 +00:00
|
|
|
--limit 100 \\\\
|
2022-11-16 22:22:54 +00:00
|
|
|
--trimStrings \\\\
|
2023-10-22 22:09:14 +00:00
|
|
|
--projectName "duplicates" \\\\
|
2023-01-14 23:43:25 +00:00
|
|
|
--projectTags "test,urgent"
|
2022-11-16 22:22:54 +00:00
|
|
|
|
2022-03-25 22:02:28 +00:00
|
|
|
- name: list
|
|
|
|
help: list projects on OpenRefine server
|
2022-04-14 10:06:54 +00:00
|
|
|
|
|
|
|
- name: info
|
2022-10-25 10:41:13 +00:00
|
|
|
help: show OpenRefine project's metadata
|
2022-04-14 10:06:54 +00:00
|
|
|
args:
|
2022-12-13 21:20:36 +00:00
|
|
|
- *project
|
2022-04-14 10:06:54 +00:00
|
|
|
examples:
|
2022-11-03 21:07:08 +00:00
|
|
|
- orcli info "duplicates"
|
|
|
|
- orcli info 1234567890123
|
2023-01-14 23:43:25 +00:00
|
|
|
- orcli info "duplicates" | jq -r .columns[]
|
2022-11-03 21:07:08 +00:00
|
|
|
|
2023-10-27 21:12:16 +00:00
|
|
|
- name: search
|
|
|
|
help: apply regex to each column and print matches in flattened tsv format
|
|
|
|
args:
|
|
|
|
- *project
|
|
|
|
- &regex
|
|
|
|
name: regex
|
|
|
|
help: regular expression to search for
|
|
|
|
examples:
|
|
|
|
- orcli search "duplicates" "^Ben"
|
|
|
|
- orcli search 1234567890123 "^Ben"
|
|
|
|
- orcli search "duplicates" "^Ben" | column -t -s \$'\t'
|
|
|
|
|
2022-11-30 22:49:54 +00:00
|
|
|
- name: test
|
|
|
|
help: run functional tests on tmp OpenRefine workspace
|
|
|
|
|
2022-11-03 21:07:08 +00:00
|
|
|
- name: transform
|
|
|
|
help: apply undo/redo JSON file(s) to an OpenRefine project
|
|
|
|
args:
|
2022-12-13 21:20:36 +00:00
|
|
|
- *project
|
|
|
|
- *file
|
2022-12-06 14:38:20 +00:00
|
|
|
flags:
|
2022-12-13 21:20:36 +00:00
|
|
|
- *quiet
|
2022-11-03 21:07:08 +00:00
|
|
|
examples:
|
|
|
|
- orcli transform "duplicates" "history.json"
|
|
|
|
- cat "history.json" | orcli transform "duplicates"
|
|
|
|
- orcli transform "duplicates" "https://git.io/fj5ju"
|
|
|
|
- orcli transform 1234567890123 "history.json"
|
2022-04-14 10:06:54 +00:00
|
|
|
|
|
|
|
- name: export
|
2022-10-25 10:41:13 +00:00
|
|
|
help: commands to export data from OpenRefine projects to files
|
2022-04-14 10:06:54 +00:00
|
|
|
|
|
|
|
commands:
|
|
|
|
- name: tsv
|
|
|
|
help: export tab-separated values (TSV)
|
|
|
|
args:
|
2022-12-13 21:20:36 +00:00
|
|
|
- *project
|
2022-04-14 10:06:54 +00:00
|
|
|
flags:
|
2023-01-27 18:04:27 +01:00
|
|
|
- &facets
|
|
|
|
long: --facets
|
|
|
|
help: filter result set by providing an OpenRefine facets config in json
|
|
|
|
arg: facets
|
|
|
|
default: "[]"
|
2022-12-13 21:20:36 +00:00
|
|
|
- &output
|
|
|
|
long: --output
|
2022-04-14 10:06:54 +00:00
|
|
|
help: Write to file instead of stdout
|
|
|
|
arg: file
|
2022-12-13 21:20:36 +00:00
|
|
|
- &encoding_export
|
|
|
|
long: --encoding
|
2022-04-14 10:06:54 +00:00
|
|
|
help: set character encoding
|
|
|
|
arg: encoding
|
|
|
|
default: "UTF-8"
|
2022-12-13 21:20:36 +00:00
|
|
|
- *quiet
|
2022-04-14 10:06:54 +00:00
|
|
|
examples:
|
2022-04-20 10:27:53 +00:00
|
|
|
- orcli export tsv "duplicates"
|
|
|
|
- orcli export tsv "duplicates" --output "duplicates.tsv"
|
2023-01-14 23:43:25 +00:00
|
|
|
- orcli export tsv "duplicates" --encoding "ISO-8859-1"
|
2023-01-27 18:04:27 +01:00
|
|
|
- |-
|
|
|
|
orcli export tsv "duplicates" --facets '[ { "type": "text", "name": "foo", "columnName": "name", "mode": "regex", "caseSensitive": false, "query": "Ben" } ]'
|
|
|
|
|
|
|
|
- name: template
|
|
|
|
help: export to any text format by providing your own GREL template
|
|
|
|
args:
|
|
|
|
- *project
|
|
|
|
- name: file
|
|
|
|
help: Path to row/record template file or URL. When FILE is -, read standard input.
|
|
|
|
default: "-"
|
|
|
|
flags:
|
|
|
|
- long: --separator
|
|
|
|
help: insert character(s) between rows/records
|
|
|
|
arg: separator
|
|
|
|
- long: --prefix
|
|
|
|
help: insert character(s) at the beginning of the file
|
|
|
|
arg: prefix
|
|
|
|
- long: --suffix
|
|
|
|
help: insert character(s) at the end of the file
|
|
|
|
arg: suffix
|
|
|
|
- long: --mode
|
|
|
|
help: specify if template shall be applied to each row or record
|
|
|
|
arg: mode
|
|
|
|
allowed: [rows, records]
|
|
|
|
default: "rows"
|
|
|
|
- *facets
|
|
|
|
- *output
|
|
|
|
- *encoding_export
|
|
|
|
- *quiet
|
|
|
|
examples:
|
|
|
|
- orcli export template "duplicates" "template.txt"
|
|
|
|
- cat "template.txt" | orcli export template "duplicates"
|
|
|
|
- orcli export template "duplicates" "https://example.com/template.txt"
|
|
|
|
- orcli export template "duplicates" "template.txt" --output "duplicates.tsv"
|
|
|
|
- |-
|
|
|
|
orcli export template "duplicates" \\\\
|
|
|
|
<<< '{ "name" : {{jsonize(cells["name"].value)}}, "purchase" : {{jsonize(cells["purchase"].value)}} }' \\\\
|
|
|
|
--prefix '{ "events" : [' \\\\
|
|
|
|
--separator , \\\\
|
|
|
|
--mode records \\\\
|
|
|
|
--suffix ]}$'\\\n' \\\\
|
|
|
|
--facets '[ { "type": "text", "name": "foo", "columnName": "name", "mode": "regex", "caseSensitive": false, "query": "Ben" } ]' \\\\
|
|
|
|
| jq
|
2022-10-16 20:36:43 +00:00
|
|
|
|
|
|
|
- name: run
|
|
|
|
help: run tmp OpenRefine workspace and execute shell script(s)
|
|
|
|
args:
|
2022-12-13 21:20:36 +00:00
|
|
|
- *file
|
2022-10-16 20:36:43 +00:00
|
|
|
flags:
|
|
|
|
- long: --memory
|
|
|
|
help: maximum RAM for OpenRefine java heap space
|
|
|
|
arg: ram
|
|
|
|
default: "2048M"
|
|
|
|
- long: --port
|
|
|
|
help: PORT on which OpenRefine should listen
|
|
|
|
arg: port
|
|
|
|
default: "3333"
|
2022-10-16 21:13:59 +00:00
|
|
|
- long: --interactive
|
|
|
|
help: do not exit on error and keep bash shell open
|
2022-12-13 21:20:36 +00:00
|
|
|
- *quiet
|
2022-10-16 20:36:43 +00:00
|
|
|
examples:
|
2022-10-16 21:13:59 +00:00
|
|
|
- orcli run --interactive
|
2022-10-16 20:36:43 +00:00
|
|
|
- |-
|
|
|
|
orcli run << EOF
|
|
|
|
orcli import csv "https://git.io/fj5hF" --projectName "duplicates"
|
2022-11-13 21:43:04 +00:00
|
|
|
orcli transform "duplicates" "https://git.io/fj5ju"
|
2022-10-16 20:36:43 +00:00
|
|
|
orcli export tsv "duplicates"
|
|
|
|
EOF
|
|
|
|
- |-
|
|
|
|
orcli run --memory "2000M" --port "3334" << EOF
|
|
|
|
orcli import csv "https://git.io/fj5hF" --projectName "duplicates" &
|
|
|
|
orcli import csv "https://git.io/fj5hF" --projectName "copy" &
|
|
|
|
wait
|
|
|
|
echo "finished import"
|
|
|
|
orcli export csv "duplicates" --output duplicates.csv &
|
|
|
|
orcli export tsv "duplicates" --output duplicates.tsv &
|
|
|
|
wait
|
|
|
|
wc duplicates*
|
|
|
|
EOF
|
|
|
|
- |-
|
2022-10-16 21:13:59 +00:00
|
|
|
orcli run --interactive "file1.sh" "file2.sh" - << EOF
|
2022-10-16 20:36:43 +00:00
|
|
|
echo "finished in \$SECONDS seconds"
|
|
|
|
EOF
|