# Root metadata: program name, one-line description, version and the footer
# text. All values are plain strings.
name: "orcli"
help: "OpenRefine command-line interface written in Bash"
version: "0.2.1"
footer: "https://github.com/opencultureconsulting/orcli"

# External programs the CLI depends on, each mapped to its project URL.
dependencies:
  curl: "https://curl.se"
  jq: "https://github.com/stedolan/jq"

# Environment variables recognised by the CLI, with their default values.
environment_variables:
  - name: OPENREFINE_URL
    default: 'http://localhost:3333'
    help: URL to OpenRefine server

# Usage examples for the root command. Single-line examples are single-quoted
# (no escape processing); the heredoc example uses a literal block scalar so
# its line breaks and inner indentation are kept.
examples:
  - 'orcli import csv "https://git.io/fj5hF" --projectName "duplicates"'
  - 'orcli list'
  - 'orcli info "duplicates"'
  - 'orcli transform "duplicates" "https://git.io/fj5ju"'
  - 'orcli search "duplicates" "^Ben"'
  - 'orcli export tsv "duplicates"'
  - 'orcli export tsv "duplicates" --output "duplicates.tsv"'
  - 'orcli delete "duplicates"'
  - 'orcli run --interactive'
  - |-
    orcli run << EOF
      orcli import csv "https://git.io/fj5hF" --projectName "duplicates"
      orcli transform "duplicates" "https://git.io/fj5ju"
      orcli export tsv "duplicates"
    EOF

commands:
  # Shell-completion generator; takes no arguments or flags.
  - name: 'completions'
    help: |-
      Generate bash completions
      Usage: source <(orcli completions)

  # Project deletion. This entry also defines two anchors that other commands
  # in this file alias: &project (positional argument) and &quiet (flag).
  - name: delete
    help: 'delete OpenRefine project'
    args:
      - &project
        name: project
        required: true
        help: project name or id
    flags:
      - short: -f
        long: --force
        help: delete all projects with the same name
      - &quiet
        short: -q
        long: --quiet
        help: suppress log output, print errors only
    examples:
      - 'orcli delete "duplicates"'
      - 'orcli delete "duplicates" --force'
      - 'orcli delete 1234567890123'
      - 'for p in \$(orcli list); do orcli delete \${p:0:13}; done'

  # Parent command; the format-specific importers are nested under `commands`.
  - name: import
    help: 'commands to create OpenRefine projects from files or URLs'

    commands:
      # CSV importer.
      # Every argument/flag below is declared with a YAML anchor (&name). The
      # sibling tsv/json/jsonl importers, and the transform and run commands,
      # reuse several of them through aliases (*name), so a change made here
      # also changes those commands.
      - name: csv
        help: import character-separated values (CSV)
        args:
          # Positional input; repeatable, and defaults to "-" (standard input
          # according to the help text).
          - &file
            name: file
            help: Path to one or more files or URLs. When FILE is -, read standard input.
            default: "-"
            repeatable: true
        flags:
          # NOTE: no alias to &separator is visible in this file; the anchor is
          # only defined here.
          - &separator
            long: --separator
            help: character(s) that separates columns
            arg: separator
            default: ","
          - &blankCellsAsStrings
            long: --blankCellsAsStrings
            help: store blank cells as empty strings instead of nulls
          # --columnNames and --headerLines each list the other under
          # `conflicts`.
          - &columnNames
            long: --columnNames
            help: |-
              set column names (comma separated)
              hint: add --ignoreLines 1 to overwrite existing header row
            arg: columnNames
            conflicts: [--headerLines]
          - &encoding_import
            long: --encoding
            help: set character encoding
            arg: encoding
          - &guessCellValueTypes
            long: --guessCellValueTypes
            help: attempt to parse cell text into numbers
          - &headerLines
            long: --headerLines
            help: parse x line(s) as column headers
            arg: headerLines
            default: "1"
            conflicts: [--columnNames]
          # NOTE(review): the "-1" defaults on --ignoreLines and --limit
          # presumably mean "disabled"/"no limit" — confirm against the
          # implementation.
          - &ignoreLines
            long: --ignoreLines
            help: ignore first x line(s) at beginning of file
            arg: ignoreLines
            default: "-1"
          - &ignoreQuoteCharacter
            long: --ignoreQuoteCharacter
            help: do not use any quote character to enclose cells containing column separators
          - &includeFileSources
            long: --includeFileSources
            help: add column with file source
          - &includeArchiveFileName
            long: --includeArchiveFileName
            help: add column with archive file name
          - &limit
            long: --limit
            help: load at most x row(s) of data
            arg: limit
            default: "-1"
          # The default below is single-quoted, so YAML performs no escape
          # processing: the backslashes are literal characters of the value.
          - &quoteCharacter
            long: --quoteCharacter
            help: quote character to enclose cells containing column separators
            arg: quoteCharacter
            default: '\\\"'
          - &skipBlankRows
            long: --skipBlankRows
            help: do not store blank rows
          - &skipDataLines
            long: --skipDataLines
            help: discard initial x row(s) of data
            arg: skipDataLines
            default: "0"
          - &trimStrings
            long: --trimStrings
            help: trim leading & trailing whitespace from strings
          - &projectName
            long: --projectName
            arg: projectName
            help: set a name for the OpenRefine project
          - &projectTags
            long: --projectTags
            help: set project tags (comma separated)
            arg: projectTags
          # &quiet is defined on the delete command.
          - *quiet
        examples:
          - orcli import csv "file"
          - orcli import csv "file1" "file2"
          - head -n 100 "file" | orcli import csv
          - orcli import csv "https://git.io/fj5hF"
          # Multi-line example kept as a literal block scalar; the backslashes
          # are part of the example text.
          - |-
            orcli import csv "file" \\\\
              --separator ";" \\\\
              --columnNames "foo,bar,baz" \\\\
              --ignoreLines 1 \\\\
              --encoding "ISO-8859-1" \\\\
              --limit 100 \\\\
              --trimStrings \\\\
              --projectName "duplicates" \\\\
              --projectTags "test,urgent"

      # TSV importer. Its argument and all of its flags are aliases of anchors
      # defined on the csv importer (and &quiet on delete). --separator is
      # deliberately not part of the flag list.
      - name: tsv
        help: import tab-separated values (TSV)
        args:
          - *file
        flags:
          - *blankCellsAsStrings
          - *columnNames
          - *encoding_import
          - *guessCellValueTypes
          - *headerLines
          - *ignoreLines
          - *ignoreQuoteCharacter
          - *includeFileSources
          - *includeArchiveFileName
          - *limit
          - *quoteCharacter
          - *skipBlankRows
          - *skipDataLines
          - *trimStrings
          - *projectName
          - *projectTags
          - *quiet
        examples:
          - orcli import tsv "file"
          - orcli import tsv "file1" "file2"
          - head -n 100 "file" | orcli import tsv
          - orcli import tsv "https://example.com/file.tsv"
          # The multi-line example only uses flags that appear in `flags`
          # above; --separator is not defined for this subcommand and must not
          # be shown here.
          - |-
            orcli import tsv "file" \\\\
              --columnNames "foo,bar,baz" \\\\
              --ignoreLines 1 \\\\
              --encoding "ISO-8859-1" \\\\
              --limit 100 \\\\
              --trimStrings \\\\
              --projectName "duplicates" \\\\
              --projectTags "test,urgent"

      # JSON importer. Defines the anchors &recordPath, &rename and
      # &storeEmptyStrings; the remaining flags are aliases of anchors defined
      # on other commands in this file.
      - name: json
        help: 'import JSON'
        args:
          - *file
        flags:
          # Double-quoted default: YAML escape processing applies to the
          # backslash sequences, so the text is kept exactly as written.
          - &recordPath
            long: --recordPath
            arg: json
            default: "[ \\\"_\\\" , \\\"_\\\" ]"
            help: specify record path elements in JSON array
          - &rename
            long: --rename
            help: 'rename columns after import to remove record path fragments'
          - *guessCellValueTypes
          - *includeFileSources
          - *includeArchiveFileName
          - *limit
          - &storeEmptyStrings
            long: --storeEmptyStrings
            help: 'preserve empty strings'
          - *trimStrings
          - *projectName
          - *projectTags
          - *quiet
        examples:
          - 'orcli import json "file"'
          - 'orcli import json "file1" "file2"'
          - 'orcli import json "https://example.com/file.json"'
          - |-
            orcli import json "file" \\\\
              --recordPath '[ "_", "rows", "_" ]' \\\\
              --rename \\\\
              --storeEmptyStrings \\\\
              --trimStrings \\\\
              --projectName "duplicates" \\\\
              --projectTags "test,urgent"

      # JSON Lines importer. Declares nothing of its own: the argument and all
      # flags are aliases of anchors defined on other commands in this file.
      - name: jsonl
        help: 'import JSON Lines / newline-delimited JSON'
        args:
          - *file
        flags:
          - *rename
          - *guessCellValueTypes
          - *includeFileSources
          - *includeArchiveFileName
          - *limit
          - *storeEmptyStrings
          - *trimStrings
          - *projectName
          - *projectTags
          - *quiet
        examples:
          - 'orcli import jsonl "file"'
          - 'orcli import jsonl "file1" "file2"'
          - 'orcli import jsonl "https://example.com/file.json"'
          - 'orcli import jsonl --rename <(orcli export jsonl "duplicates")'
          - |-
            orcli import jsonl "file" \\\\
              --rename \\\\
              --storeEmptyStrings \\\\
              --trimStrings \\\\
              --projectName "duplicates" \\\\
              --projectTags "test,urgent"

  # Project listing; declares no arguments or flags.
  - name: 'list'
    help: 'list projects on OpenRefine server'

  # Project metadata lookup; takes the shared *project argument (anchor defined
  # on the delete command).
  - name: info
    help: show OpenRefine project's metadata
    args:
      - *project
    examples:
      - orcli info "duplicates"
      - orcli info 1234567890123
      # The jq filter is single-quoted so the shell cannot treat `[]` as a
      # glob pattern when the example is copied to a terminal.
      - orcli info "duplicates" | jq -r '.columns[]'

  # Regex search over a project's columns. Defines the &regex anchor (no alias
  # to it is visible in this file). The `output:` line of the help text uses
  # literal tab characters between the column names.
  - name: search
    help: |-
      apply regex to each column and print matches in flattened tsv format
      output: index	column	value
      note that any exporter supports search by using --facets (see examples)

    args:
      - *project
      - &regex
        name: regex
        help: 'search term (regular expression, case-sensitive)'
    flags:
      - long: --index
        arg: column
        help: 'print column values instead of row.index in the first column of the output'
    examples:
      - 'orcli search "duplicates" "^Ben"'
      - 'orcli search 1234567890123 "^Ben"'
      - 'orcli search "duplicates" "^F" --index "email"'
      - orcli search "duplicates" | column -t -s \$'\t'
      - |-
        orcli export tsv "duplicates" --facets '[{ "type": "list", "expression": "grel:filter(row.columnNames,cn,cells[cn].value.find(/^Ben/).length()>0).length()>0", "columnName": "", "selection": [{"v": {"v": true}}] }]'
      - |-
        orcli export tsv "duplicates" --facets '[{ "type": "list", "expression": "grel:filter([\\\\"gender\\\\",\\\\"purchase\\\\"],cn,cells[cn].value.find(/^F/).length()>0).length()>0", "columnName": "", "selection": [{"v": {"v": true}}] }]'

  # Functional test runner; declares no arguments or flags.
  - name: 'test'
    help: 'run functional tests on tmp OpenRefine workspace'

  # Applies undo/redo JSON file(s) to a project. Argument and flag definitions
  # are all aliases: *project and *quiet come from delete, *file from the csv
  # importer.
  - name: transform
    help: 'apply undo/redo JSON file(s) to an OpenRefine project'
    args:
      - *project
      - *file
    flags:
      - *quiet
    examples:
      - 'orcli transform "duplicates" "history.json"'
      - 'cat "history.json" | orcli transform "duplicates"'
      - 'orcli transform "duplicates" "https://git.io/fj5ju"'
      - 'orcli transform 1234567890123 "history.json"'

  # Parent command; the format-specific exporters are nested under `commands`.
  - name: export
    help: 'commands to export data from OpenRefine projects to files'

    commands:
      # JSON Lines exporter. Defines the anchors &mode, &facets, &output and
      # &encoding_export, which the tsv and template exporters alias. Its
      # --separator flag is declared inline (not anchored).
      - name: jsonl
        help: 'export JSON Lines / newline-delimited JSON'
        args:
          - *project
        flags:
          - &mode
            long: --mode
            arg: mode
            allowed:
              - rows
              - records
            default: 'rows'
            help: specify if project contains multi-row records
          - long: --separator
            arg: separator
            help: character(s) that separates multiple values in one cell (row mode only)
          # The default stays quoted so it is the string "[]" rather than an
          # empty YAML sequence.
          - &facets
            long: --facets
            arg: facets
            default: '[]'
            help: filter result set by providing an OpenRefine facets config in json
          - &output
            long: --output
            arg: file
            help: Write to file instead of stdout
          - &encoding_export
            long: --encoding
            arg: encoding
            default: 'UTF-8'
            help: set character encoding
          - *quiet
        examples:
          - 'orcli export jsonl "duplicates"'
          - 'orcli export jsonl "duplicates" --output "duplicates.jsonl"'
          - orcli export jsonl "duplicates" --separator ' '
          - 'orcli export jsonl "duplicates" --mode records'
          - |-
            orcli export jsonl "duplicates" --facets '[ { "type": "text", "columnName": "name", "mode": "regex", "caseSensitive": false, "invert": false, "query": "^Ben" } ]'
          - |-
            orcli export jsonl "duplicates" --facets '[{ "type": "list", "expression": "grel:filter([\\\\"gender\\\\",\\\\"purchase\\\\"],cn,cells[cn].value.find(/^F/).length()>0).length()>0", "columnName": "", "selection": [{"v": {"v": true}}] }]'
      # TSV exporter. Declares --select inline; every other flag is an alias
      # of an anchor defined on the jsonl exporter or on delete (&quiet).
      - name: tsv
        help: 'export tab-separated values (TSV)'
        args:
          - *project
        flags:
          - long: --select
            arg: columns
            help: |-
              filter result set to one or more columns (comma separated)
              example: --select "foo,bar,baz"
          - *mode
          - *facets
          - *output
          - *encoding_export
          - *quiet
        examples:
          - 'orcli export tsv "duplicates"'
          - 'orcli export tsv "duplicates" --output "duplicates.tsv"'
          - 'orcli export tsv "duplicates" --encoding "ISO-8859-1"'
          - 'orcli export tsv "duplicates" --select "name,email,purchase"'
          - |-
            orcli export tsv "duplicates" --facets '[ { "type": "text", "columnName": "name", "mode": "regex", "caseSensitive": false, "invert": false, "query": "^Ben" } ]'
          - |-
            orcli export tsv "duplicates" --facets '[{ "type": "list", "expression": "grel:filter([\\\\"gender\\\\",\\\\"purchase\\\\"],cn,cells[cn].value.find(/^F/).length()>0).length()>0", "columnName": "", "selection": [{"v": {"v": true}}] }]'

      # Template exporter. Its `file` argument is declared inline (a single,
      # non-repeatable value defaulting to "-") rather than aliasing the
      # repeatable *file argument of the importers.
      - name: template
        help: 'export to any text format by providing your own GREL template'
        args:
          - *project
          - name: file
            default: '-'
            help: Path to row/record template file or URL. When FILE is -, read standard input.
        flags:
          - long: --separator
            arg: separator
            help: insert character(s) between rows/records
          - long: --prefix
            arg: prefix
            help: insert character(s) at the beginning of the file
          - long: --suffix
            arg: suffix
            help: insert character(s) at the end of the file
          - *mode
          - *facets
          - *output
          - *encoding_export
          - *quiet
        examples:
          - 'orcli export template "duplicates" "template.txt"'
          - 'cat "template.txt" | orcli export template "duplicates"'
          - 'orcli export template "duplicates" "https://example.com/template.txt"'
          - 'orcli export template "duplicates" "template.txt" --output "duplicates.tsv"'
          - |-
            orcli export template "duplicates" \\\\
              <<< '{ "name" : {{jsonize(cells["name"].value)}}, "purchase" : {{jsonize(cells["purchase"].value)}} }' \\\\
              --prefix '{ "events" : [' \\\\
              --separator , \\\\
              --mode records \\\\
              --suffix ]}$'\\\n' \\\\
              --facets '[ { "type": "text", "columnName": "name", "mode": "regex", "caseSensitive": false, "invert": false, "query": "^Ben" } ]' \\\\
              | jq

  # Runs a temporary OpenRefine workspace and executes the given shell
  # script(s). The positional argument is the shared, repeatable *file
  # argument (anchor defined on the csv importer).
  - name: run
    help: run tmp OpenRefine workspace and execute shell script(s)
    args:
      - *file
    flags:
      - long: --memory
        help: maximum RAM for OpenRefine java heap space
        arg: ram
        default: "2048M"
      # Quoted so the default stays a string rather than a YAML integer.
      - long: --port
        help: PORT on which OpenRefine should listen
        arg: port
        default: "3333"
      - long: --interactive
        help: do not exit on error and keep bash shell open
      - *quiet
    examples:
      - orcli run --interactive
      - |-
        orcli run << EOF
          orcli import csv "https://git.io/fj5hF" --projectName "duplicates"
          orcli transform "duplicates" "https://git.io/fj5ju"
          orcli export tsv "duplicates"
        EOF
      # The parallel example only calls exporters that are defined under the
      # export command in this file (jsonl, tsv, template).
      - |-
        orcli run --memory "2000M" --port "3334" << EOF
          orcli import csv "https://git.io/fj5hF" --projectName "duplicates" &
          orcli import csv "https://git.io/fj5hF" --projectName "copy" &
          wait
          echo "finished import"
          orcli export jsonl "duplicates" --output duplicates.jsonl &
          orcli export tsv "duplicates" --output duplicates.tsv &
          wait
          wc duplicates*
        EOF
      - |-
        orcli run --interactive "file1.sh" "file2.sh" - << EOF
          echo "finished in \$SECONDS seconds"
        EOF