#!/usr/bin/env python3
"""
Functions used by the command line interface (CLI)
"""

# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>

import json
import os
import shutil
import ssl
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from xml.etree import ElementTree

from google.refine import refine


def _to_bytes(data, encoding=None):
    """Return data as bytes, encoding str input (UTF-8 if no encoding)."""
    if isinstance(data, str):
        return data.encode(encoding or 'UTF-8')
    return data


def _to_text(data, encoding=None):
    """Return data as str, decoding bytes input (UTF-8 if no encoding)."""
    if isinstance(data, bytes):
        return data.decode(encoding or 'UTF-8')
    return data


def _write_stdout(data, encoding=None):
    """Write an export payload to stdout, whether it is bytes or str.

    Bytes go to the underlying binary stream so that the output matches
    what the file branches write in 'wb' mode. If stdout has been replaced
    by an object without a ``buffer`` attribute, the bytes are decoded
    instead.
    """
    if isinstance(data, bytes):
        binary_stdout = getattr(sys.stdout, 'buffer', None)
        if binary_stdout is not None:
            # keep ordering with anything already written via the text layer
            sys.stdout.flush()
            binary_stdout.write(data)
            binary_stdout.flush()
            return
        data = data.decode(encoding or 'UTF-8', 'replace')
    sys.stdout.write(data)


def _write_parts(path, parts, encoding=None):
    """Write the given str/bytes parts to path as one binary file."""
    with open(path, 'wb') as f:
        f.writelines(_to_bytes(part, encoding) for part in parts)


def apply(project_id, history_file):
    """Apply OpenRefine history from json file to project.

    Raises Exception if the response of apply_operations is not 'ok'.
    """
    project = refine.RefineProject(project_id)
    response = project.apply_operations(history_file)
    if response != 'ok':
        raise Exception('Failed to apply %s to %s: %s' %
                        (history_file, project_id, response))
    print('File %s has been successfully applied to project %s' %
          (history_file, project_id))


def create(project_file,
           project_format=None,
           columnWidths=None,
           encoding=None,
           guessCellValueTypes=False,
           headerLines=None,
           ignoreLines=None,
           includeFileSources=False,
           limit=None,
           linesPerRow=None,
           processQuotes=True,
           projectName=None,
           projectTags=None,
           recordPath=None,
           separator=None,
           sheets=None,
           skipDataLines=None,
           storeBlankCellsAsNulls=True,
           storeBlankRows=True,
           storeEmptyStrings=True,
           trimStrings=False
           ):
    """Create a new project from file.

    If project_format is not given it is guessed from the file extension;
    a .txt file is treated as fixed-width when columnWidths has at least
    one entry and as line-based otherwise. Per-format defaults are then
    filled in for options the caller left empty.

    Prints the id and row count and returns the project. Raises Exception
    if the created project reports 0 rows.
    """
    # NOTE: every local name that exists when vars() is called further down
    # is forwarded to new_project() as a keyword option. Do not introduce
    # additional local variables above that call.

    # guess format from file extension
    if not project_format:
        project_format = os.path.splitext(project_file)[1][1:].lower()
        if project_format == 'txt':
            try:
                columnWidths[0]
                project_format = 'fixed-width'
            except (TypeError, IndexError):
                # None (TypeError) or an empty list (IndexError): no widths
                project_format = 'line-based'
    # defaults for each file type
    if project_format == 'xml':
        project_format = 'text/xml'
        if not recordPath:
            recordPath = [ElementTree.parse(project_file).getroot().tag]
    elif project_format == 'csv':
        project_format = 'text/line-based/*sv'
    elif project_format == 'tsv':
        project_format = 'text/line-based/*sv'
        if not separator:
            separator = '\t'
    elif project_format == 'line-based':
        project_format = 'text/line-based'
        if not skipDataLines:
            skipDataLines = -1
    elif project_format == 'fixed-width':
        project_format = 'text/line-based/fixed-width'
        if not headerLines:
            headerLines = 0
    elif project_format == 'json':
        project_format = 'text/json'
        if not recordPath:
            recordPath = ['_', '_']
    elif project_format in ('xls', 'xlsx'):
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'ods':
        project_format = 'text/xml/ods'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    # execute
    kwargs = {k: v for k, v in list(vars().items()) if v is not None}
    project = refine.Refine(refine.RefineServer()).new_project(
        guess_cell_value_types=guessCellValueTypes,
        ignore_lines=ignoreLines,
        header_lines=headerLines,
        skip_data_lines=skipDataLines,
        store_blank_rows=storeBlankRows,
        process_quotes=processQuotes,
        project_name=projectName,
        store_blank_cells_as_nulls=storeBlankCellsAsNulls,
        include_file_sources=includeFileSources,
        **kwargs)
    rows = project.do_json('get-rows')['total']
    if rows > 0:
        print('{0}: {1}'.format('id', project.project_id))
        print('{0}: {1}'.format('rows', rows))
        return project
    raise Exception(
        'Project contains 0 rows. Please check --help for mandatory '
        'arguments for xml, json, xlsx and ods')


def delete(project_id):
    """Delete project.

    Raises Exception if the response of project.delete() is not equal
    to True.
    """
    project = refine.RefineProject(project_id)
    response = project.delete()
    if response != True:  # noqa: E712 - comparison kept as in the original
        raise Exception('Failed to delete %s: %s' % (project_id, response))
    print('Project %s has been successfully deleted' % project_id)


def download(url, output_file=None):
    """Integrated download function for your convenience.

    Saves url to output_file (default: the basename of the url). If the
    target file already exists an error is printed and nothing is
    downloaded.
    """
    if not output_file:
        output_file = os.path.basename(url)
    if os.path.exists(output_file):
        print('Error: File %s already exists.\n'
              'Delete existing file or try command --output '
              'to specify a different filename.' % output_file)
        return
    # Workaround for SSL verification problems in one-file-executables
    # NOTE(security): certificate verification is disabled on purpose here.
    context = ssl._create_unverified_context()
    # urllib.request.urlretrieve() accepts no ``context`` argument in
    # Python 3, so open the URL explicitly and stream the body to disk.
    with urllib.request.urlopen(url, context=context) as response, \
            open(output_file, 'wb') as f:
        shutil.copyfileobj(response, f)
    print('Download to file %s complete' % output_file)


def export(project_id, encoding=None, output_file=None, export_format=None):
    """Dump a project to stdout or file.

    Without output_file the export (default format: tsv) is written to
    stdout. With output_file the format is taken from the file extension
    when there is one. For csv, tsv and txt the encoding is forced to
    UTF-8.
    """
    project = refine.RefineProject(project_id)
    if not export_format:
        export_format = 'tsv'
    if not output_file:
        if export_format in ['csv', 'tsv', 'txt']:
            encoding = 'UTF-8'
        _write_stdout(project.export(
            export_format=export_format, encoding=encoding).read(), encoding)
    else:
        ext = os.path.splitext(output_file)[1][1:]
        if ext:
            export_format = ext.lower()
        if export_format in ['csv', 'tsv', 'txt']:
            encoding = 'UTF-8'
        with open(output_file, 'wb') as f:
            f.write(_to_bytes(project.export(
                export_format=export_format, encoding=encoding).read(),
                encoding))
        print('Export to file %s complete' % output_file)


def info(project_id):
    """Show project metadata"""
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if project_id in projects:
        print('{0:>20}: {1}'.format('id', project_id))
        print('{0:>20}: {1}'.format(
            'url', 'http://{0}:{1}/project?project={2}'.format(
                refine.REFINE_HOST, refine.REFINE_PORT, project_id)))
        for k, v in projects[project_id].items():
            if v:
                print('{0:>20}: {1}'.format(k, v))
        project_model = refine.RefineProject(project_id).get_models()
        columns = [c['name'] for c in project_model['columnModel']['columns']]
        for (i, v) in enumerate(columns, start=1):
            print('{0:>20}: {1}'.format('column ' + str(i).zfill(3), v))
    else:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))


def ls():
    """Query the server and list projects sorted by mtime."""
    projects = list(
        refine.Refine(refine.RefineServer()).list_projects().items())

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))
    projects.sort(key=lambda v: date_to_epoch(v[1]['modified']), reverse=True)
    if projects:
        for project_id, project_info in projects:
            print('{0:>14}: {1}'.format(project_id, project_info['name']))
    else:
        print('Error: No projects found')


def templating(project_id,
               template,
               encoding='UTF-8',
               output_file=None,
               mode=None,
               prefix='',
               rowSeparator='\n',
               suffix='',
               filterQuery=None,
               filterColumn=None,
               facets=None,
               splitToFiles=False,
               suffixById=None
               ):
    """Dump a project to stdout or file with templating.

    Without splitToFiles the templated export is written to stdout or to
    output_file. With splitToFiles one file per row (or per record in
    'record-based' mode) is written, named <base>_<n>.<ext> or, with
    suffixById, <base>_<value of first column>.<ext>. The base name
    defaults to the current date (YYYYMMDD) and the extension to 'txt'.
    """
    project = refine.RefineProject(project_id)

    # basic config
    templateconfig = {'prefix': prefix,
                      'suffix': suffix,
                      'template': template,
                      'rowSeparator': rowSeparator,
                      'encoding': encoding}

    # construct the engine config
    if mode == 'record-based':
        engine = {'facets': [], 'mode': 'record-based'}
    else:
        engine = {'facets': [], 'mode': 'row-based'}
    if facets:
        engine['facets'].append(json.loads(facets))
    if filterQuery:
        if not filterColumn:
            filterColumn = project.get_models()['columnModel']['keyColumnName']
        textFilter = {'type': 'text',
                      'name': filterColumn,
                      'columnName': filterColumn,
                      'mode': 'regex',
                      'caseSensitive': False,
                      'query': filterQuery}
        engine['facets'].append(textFilter)
    templateconfig.update({'engine': json.dumps(engine)})

    if not splitToFiles:
        # normal output
        if not output_file:
            _write_stdout(project.export_templating(
                **templateconfig).read(), encoding)
        else:
            with open(output_file, 'wb') as f:
                f.write(_to_bytes(project.export_templating(
                    **templateconfig).read(), encoding))
            print('Export to file %s complete' % output_file)
        return

    # splitToFiles functionality
    prefix = templateconfig['prefix']
    suffix = templateconfig['suffix']
    split = '===|||THISISTHEBEGINNINGOFANEWRECORD|||==='
    if not output_file:
        output_file = time.strftime('%Y%m%d')
    # Derive base/ext for the default name as well; previously they were
    # only assigned when output_file was passed in.
    base, ext = os.path.splitext(output_file)
    ext = ext[1:] or 'txt'

    # generate config for subfeature suffixById
    if suffixById:
        ids_template = ('{{forNonBlank(' +
                        'with(row.columnNames[0],cn,cells[cn].value),' +
                        'v,v,"")}}')
        ids_templateconfig = {'engine': json.dumps(engine),
                              'template': ids_template,
                              'rowSeparator': '\n',
                              'encoding': encoding}
        ids_text = _to_text(project.export_templating(
            **ids_templateconfig).read(), encoding)
        # one id per non-empty line
        ids = [line for line in ids_text.split('\n') if line]

    # generate common config
    if mode == 'record-based':
        # record-based: split-character into template
        #               if key column is not blank (=record)
        template = ('{{forNonBlank(' +
                    'with(row.columnNames[0],cn,cells[cn].value),' +
                    'v,"' + split + '", "")}}' +
                    templateconfig['template'])
    else:
        # row-based: split-character into template
        template = split + templateconfig['template']
    templateconfig.update({'prefix': '',
                           'suffix': '',
                           'template': template,
                           'rowSeparator': ''})

    # execute
    # Decode once so that splitting and the prefix/suffix handling happen
    # on text; the parts are encoded again when written in binary mode.
    records = _to_text(project.export_templating(
        **templateconfig).read(), encoding).split(split)
    del records[0]  # skip first blank entry
    if suffixById:
        for index, record in enumerate(records):
            output_file = base + '_' + ids[index] + '.' + ext
            _write_parts(output_file, [prefix, record, suffix], encoding)
        print('Export to files complete. Last file: %s' % output_file)
    else:
        zeros = len(str(len(records)))
        for index, record in enumerate(records):
            output_file = base + '_' + \
                str(index + 1).zfill(zeros) + '.' + ext
            _write_parts(output_file, [prefix, record, suffix], encoding)
        print('Export to files complete. Last file: %s' % output_file)