#!/usr/bin/env python3
"""
Functions used by the command line interface (CLI)
"""

# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
import json
import os
import ssl
import sys
import time
import urllib.request, urllib.parse, urllib.error
from xml.etree import ElementTree

from google.refine import refine


def apply(project_id, history_file):
    """Apply OpenRefine history from json file to project."""
    project = refine.RefineProject(project_id)
    response = project.apply_operations(history_file)
    if response != 'ok':
        raise Exception('Failed to apply %s to %s: %s' %
                        (history_file, project_id, response))
    else:
        print('File %s has been successfully applied to project %s' %
              (history_file, project_id))
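
# Illustrative usage sketch (comment only, not executed). The module path,
# project id and file name are placeholders, and a running OpenRefine server
# is assumed:
#
#   from google.refine import cli
#   cli.apply('2151725602230', 'history.json')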


def create(project_file,
           project_format=None,
           columnWidths=None,
           encoding=None,
           guessCellValueTypes=False,
           headerLines=None,
           ignoreLines=None,
           includeFileSources=False,
           limit=None,
           linesPerRow=None,
           processQuotes=True,
           projectName=None,
           projectTags=None,
           recordPath=None,
           separator=None,
           sheets=None,
           skipDataLines=None,
           storeBlankCellsAsNulls=True,
           storeBlankRows=True,
           storeEmptyStrings=True,
           trimStrings=False
           ):
    """Create a new project from file."""
    # guess format from file extension
    if not project_format:
        project_format = os.path.splitext(project_file)[1][1:].lower()
        if project_format == 'txt':
            try:
                columnWidths[0]
                project_format = 'fixed-width'
            except TypeError:
                project_format = 'line-based'
    # defaults for each file type
    if project_format == 'xml':
        project_format = 'text/xml'
        if not recordPath:
            recordPath = [ElementTree.parse(project_file).getroot().tag]
    elif project_format == 'csv':
        project_format = 'text/line-based/*sv'
    elif project_format == 'tsv':
        project_format = 'text/line-based/*sv'
        if not separator:
            separator = '\t'
    elif project_format == 'line-based':
        project_format = 'text/line-based'
        if not skipDataLines:
            skipDataLines = -1
    elif project_format == 'fixed-width':
        project_format = 'text/line-based/fixed-width'
        if not headerLines:
            headerLines = 0
    elif project_format == 'json':
        project_format = 'text/json'
        if not recordPath:
            recordPath = ['_', '_']
    elif project_format == 'xls':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'xlsx':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'ods':
        project_format = 'text/xml/ods'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    # execute
    kwargs = {k: v for k, v in vars().items() if v is not None}
    project = refine.Refine(refine.RefineServer()).new_project(
        guess_cell_value_types=guessCellValueTypes,
        ignore_lines=ignoreLines,
        header_lines=headerLines,
        skip_data_lines=skipDataLines,
        store_blank_rows=storeBlankRows,
        process_quotes=processQuotes,
        project_name=projectName,
        store_blank_cells_as_nulls=storeBlankCellsAsNulls,
        include_file_sources=includeFileSources,
        **kwargs)
    rows = project.do_json('get-rows')['total']
    if rows > 0:
        print('{0}: {1}'.format('id', project.project_id))
        print('{0}: {1}'.format('rows', rows))
        return project
    else:
        raise Exception(
            'Project contains 0 rows. Please check --help for mandatory '
            'arguments for xml, json, xlsx and ods')
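
# Illustrative usage sketch; the file name and project name are placeholders.
# The import format is guessed from the file extension unless project_format
# is passed explicitly:
#
#   project = cli.create('example.csv', projectName='duplicates')
#   print(project.project_id)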


def delete(project_id):
    """Delete project."""
    project = refine.RefineProject(project_id)
    response = project.delete()
    if response is not True:
        raise Exception('Failed to delete %s: %s' %
                        (project_id, response))
    else:
        print('Project %s has been successfully deleted' % project_id)
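
# Illustrative usage sketch (placeholder project id):
#
#   cli.delete('2151725602230')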


def download(url, output_file=None):
    """Integrated download function for your convenience."""
    if not output_file:
        output_file = os.path.basename(url)
    if os.path.exists(output_file):
        print('Error: File %s already exists.\n'
              'Delete existing file or try command --output '
              'to specify a different filename.' % output_file)
        return
    # Workaround for SSL verification problems in one-file-executables:
    # fetch via urlopen, which accepts an (unverified) SSL context.
    context = ssl._create_unverified_context()
    with urllib.request.urlopen(url, context=context) as response:
        with open(output_file, 'wb') as f:
            f.write(response.read())
    print('Download to file %s complete' % output_file)
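
# Illustrative usage sketch; the URL is a placeholder. Without output_file the
# download is saved under its basename in the current directory:
#
#   cli.download('https://example.org/duplicates.csv',
#                output_file='duplicates.csv')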


def export(project_id, encoding=None, output_file=None, export_format=None):
    """Dump a project to stdout or file."""
    project = refine.RefineProject(project_id)
    if not export_format:
        export_format = 'tsv'
    if not output_file:
        if export_format in ['csv', 'tsv', 'txt']:
            encoding = 'UTF-8'
        sys.stdout.write(project.export(
            export_format=export_format, encoding=encoding).read())
    else:
        ext = os.path.splitext(output_file)[1][1:]
        if ext:
            export_format = ext.lower()
        if export_format in ['csv', 'tsv', 'txt']:
            encoding = 'UTF-8'
        with open(output_file, 'wb') as f:
            f.write(project.export(
                export_format=export_format, encoding=encoding).read())
        print('Export to file %s complete' % output_file)
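
# Illustrative usage sketch (placeholder id and file name); the export format
# is derived from the output file extension:
#
#   cli.export('2151725602230', output_file='dump.xlsx')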


def info(project_id):
    """Show project metadata"""
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if project_id in projects:
        print('{0:>20}: {1}'.format('id', project_id))
        print('{0:>20}: {1}'.format('url', 'http://' +
                                    refine.REFINE_HOST + ':' +
                                    refine.REFINE_PORT +
                                    '/project?project=' + project_id))
        for k, v in projects[project_id].items():
            if v:
                print('{0:>20}: {1}'.format(k, v))
        project_model = refine.RefineProject(project_id).get_models()
        columns = [c['name'] for c in project_model['columnModel']['columns']]
        for (i, v) in enumerate(columns, start=1):
            print('{0:>20}: {1}'.format('column ' + str(i).zfill(3), v))
    else:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))
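
# Illustrative usage sketch (placeholder project id):
#
#   cli.info('2151725602230')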


def ls():
    """Query the server and list projects sorted by mtime."""
    projects = list(refine.Refine(refine.RefineServer()).list_projects().items())

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    projects.sort(key=lambda v: date_to_epoch(v[1]['modified']), reverse=True)
    if projects:
        for project_id, project_info in projects:
            print('{0:>14}: {1}'.format(project_id, project_info['name']))
    else:
        print('Error: No projects found')
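
# Illustrative usage sketch; prints one line per project, newest first:
#
#   cli.ls()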


def templating(project_id,
               template,
               encoding='UTF-8',
               output_file=None,
               mode=None,
               prefix='',
               rowSeparator='\n',
               suffix='',
               filterQuery=None,
               filterColumn=None,
               facets=None,
               splitToFiles=False,
               suffixById=None
               ):
    """Dump a project to stdout or file with templating."""
    project = refine.RefineProject(project_id)

    # basic config
    templateconfig = {'prefix': prefix,
                      'suffix': suffix,
                      'template': template,
                      'rowSeparator': rowSeparator,
                      'encoding': encoding}

    # construct the engine config
    if mode == 'record-based':
        engine = {'facets': [], 'mode': 'record-based'}
    else:
        engine = {'facets': [], 'mode': 'row-based'}
    if facets:
        engine['facets'].append(json.loads(facets))
    if filterQuery:
        if not filterColumn:
            filterColumn = project.get_models()['columnModel']['keyColumnName']
        textFilter = {'type': 'text',
                      'name': filterColumn,
                      'columnName': filterColumn,
                      'mode': 'regex',
                      'caseSensitive': False,
                      'query': filterQuery}
        engine['facets'].append(textFilter)
    templateconfig.update({'engine': json.dumps(engine)})

    if not splitToFiles:
        # normal output
        if not output_file:
            sys.stdout.write(project.export_templating(
                **templateconfig).read())
        else:
            with open(output_file, 'wb') as f:
                f.write(project.export_templating(**templateconfig).read())
            print('Export to file %s complete' % output_file)
    else:
        # splitToFiles functionality
        prefix = templateconfig['prefix']
        suffix = templateconfig['suffix']
        split = '===|||THISISTHEBEGINNINGOFANEWRECORD|||==='
        if not output_file:
            output_file = time.strftime('%Y%m%d')
        # derive base name and extension from the (possibly defaulted)
        # output file name
        base = os.path.splitext(output_file)[0]
        ext = os.path.splitext(output_file)[1][1:]
        if not ext:
            ext = 'txt'
        # generate config for subfeature suffixById
        if suffixById:
            ids_template = ('{{forNonBlank(' +
                            'with(row.columnNames[0],cn,cells[cn].value),' +
                            'v,v,"")}}')
            ids_templateconfig = {'engine': json.dumps(engine),
                                  'template': ids_template,
                                  'rowSeparator': '\n',
                                  'encoding': encoding}
            ids = [line.rstrip('\n') for line in project.export_templating(
                **ids_templateconfig) if line.rstrip('\n')]
        # generate common config
        if mode == 'record-based':
            # record-based: split-character into template
            # if key column is not blank (=record)
            template = ('{{forNonBlank(' +
                        'with(row.columnNames[0],cn,cells[cn].value),' +
                        'v,"' + split + '", "")}}' +
                        templateconfig['template'])
            templateconfig.update({'prefix': '',
                                   'suffix': '',
                                   'template': template,
                                   'rowSeparator': ''})
        else:
            # row-based: split-character into template
            template = split + templateconfig['template']
            templateconfig.update({'prefix': '',
                                   'suffix': '',
                                   'template': template,
                                   'rowSeparator': ''})
        # execute
        records = project.export_templating(
            **templateconfig).read().split(split)
        del records[0]  # skip first blank entry
        if suffixById:
            for index, record in enumerate(records):
                output_file = base + '_' + ids[index] + '.' + ext
                with open(output_file, 'wb') as f:
                    f.writelines([prefix, record, suffix])
            print('Export to files complete. Last file: %s' % output_file)
        else:
            zeros = len(str(len(records)))
            for index, record in enumerate(records):
                output_file = base + '_' + \
                    str(index + 1).zfill(zeros) + '.' + ext
                with open(output_file, 'wb') as f:
                    f.writelines([prefix, record, suffix])
            print('Export to files complete. Last file: %s' % output_file)
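
# Illustrative usage sketch; all values are placeholders. With splitToFiles
# each record is written to its own numbered file derived from output_file:
#
#   cli.templating('2151725602230',
#                  template='{{jsonize(cells["name"].value)}}\n',
#                  output_file='names.json',
#                  splitToFiles=True)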