diff --git a/README.rst b/README.rst index bc17770..fb0d36f 100644 --- a/README.rst +++ b/README.rst @@ -5,8 +5,8 @@ OpenRefine Python Client Library The OpenRefine Python Client Library provides an interface to communicating with an `OpenRefine `_ server. -If you are looking for a ready to use command line interface to OpenRefine then you might be interested in the docker variation of this library: -`felixlohmeier/openrefine-client `_. You will find examples for batch processing (e.g. for usage in shell scripts) there. +If you are looking for a ready to use command line interface to OpenRefine for batch processing then you might be interested in the following bash shell script: +`felixlohmeier/openrefine-batch `_ If you are familiar with python and want to go into more depth, then read on! @@ -71,12 +71,6 @@ Installation (Someone with more familiarity with python's byzantine collection of installation frameworks is very welcome to improve/"best practice" all this.) -#. Install dependencies, which currently is ``urllib2_file``: - - ``sudo pip install -r requirements.txt`` - - (If you don't have ``pip`` visit `pip-installer.org `_) - #. Ensure you have a Refine server running somewhere and, if necessary, set the environment vars as above. diff --git a/google/refine/refine.py b/google/refine/refine.py index c7c9b91..0d19160 100644 --- a/google/refine/refine.py +++ b/google/refine/refine.py @@ -26,7 +26,7 @@ import re import StringIO import time import urllib -import urllib2_file +from google.urllib2_file import urllib2_file import urllib2 import urlparse diff --git a/google/urllib2_file/LICENSE b/google/urllib2_file/LICENSE new file mode 100644 index 0000000..797327d --- /dev/null +++ b/google/urllib2_file/LICENSE @@ -0,0 +1,50 @@ +Copyright (C) 2004,2005,2006,2008,2009,2010 Fabien SEISEN + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. 
This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python +alone or in any derivative version, provided, however, that PSF's +License Agreement and PSF's notice of copyright, i.e., "Copyright (c) +2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation; All Rights +Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. 
Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. diff --git a/google/urllib2_file/README b/google/urllib2_file/README new file mode 100644 index 0000000..5d1b61d --- /dev/null +++ b/google/urllib2_file/README @@ -0,0 +1,32 @@ +Python urllib2_file.py enables you to upload files using HTTP multipart/form-data + +Install: + +python setup.py test +python setup.py build +python setup.py install + +Example: + +import urllib2_file +import urllib2 + +data = { 'foo': 'bar', + 'form_name': open("/lib/libc.so.1") } + +(sends something like: 'Content-Disposition: form-data; name="form_name"; filename="form_name";' ) + +Or if you want to specify a different filename: + +data = {'foo': 'bar', + 'form_name': {'fd': open('/lib/libresolv.so.2'), + 'filename': 'libresolv.so'} } + +(sends something like: 'Content-Disposition: form-data; name="form_name"; filename="libresolv.so";' ) + +u = urllib2.urlopen('http://site.com/path/upload.php', data) + +Tested with: + python 2.3 + python 2.4 + python 2.5 diff --git a/google/urllib2_file/__init__.py b/google/urllib2_file/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/google/urllib2_file/urllib2_file.py b/google/urllib2_file/urllib2_file.py new file mode 100644 index 0000000..6af9f98 --- /dev/null +++ b/google/urllib2_file/urllib2_file.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python +# Copyright (C) 2004,2005,2006,2008,2009,2010 Fabien SEISEN +# +# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +# -------------------------------------------- +# +# 1. 
This LICENSE AGREEMENT is between the Python Software Foundation +# ("PSF"), and the Individual or Organization ("Licensee") accessing and +# otherwise using this software ("Python") in source or binary form and +# its associated documentation. +# +# 2. Subject to the terms and conditions of this License Agreement, PSF +# hereby grants Licensee a nonexclusive, royalty-free, world-wide +# license to reproduce, analyze, test, perform and/or display publicly, +# prepare derivative works, distribute, and otherwise use Python +# alone or in any derivative version, provided, however, that PSF's +# License Agreement and PSF's notice of copyright, i.e., "Copyright (c) +# 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation; All Rights +# Reserved" are retained in Python alone or in any derivative version +# prepared by Licensee. +# +# 3. In the event Licensee prepares a derivative work that is based on +# or incorporates Python or any part thereof, and wants to make +# the derivative work available to others as provided herein, then +# Licensee hereby agrees to include in any such work a brief summary of +# the changes made to Python. +# +# 4. PSF is making Python available to Licensee on an "AS IS" +# basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +# INFRINGE ANY THIRD PARTY RIGHTS. +# +# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +# FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 6. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 7. 
Nothing in this License Agreement shall be deemed to create any +# relationship of agency, partnership, or joint venture between PSF and +# Licensee. This License Agreement does not grant permission to use PSF +# trademarks or trade name in a trademark sense to endorse or promote +# products or services of Licensee, or any third party. +# +# 8. By copying, installing or otherwise using Python, Licensee +# agrees to be bound by the terms and conditions of this License +# Agreement. +# +""" +Extend urllib2 to enable uploading files using multipart/form-data. + +I was looking for something that would let me upload files to my photo web site (http://gallery.menalto.com/). +Inspired by http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 + +Example: + +import urllib2_file +import urllib2 + +data = { 'foo': 'bar', + 'form_name': open("/lib/libc.so.1") + } +(sends something like: 'Content-Disposition: form-data; name="form_name"; filename="form_name";' ) + +Or if you want to specify a different filename: +data = { 'foo': 'bar', + 'form_name': { 'fd': open('/lib/libresolv.so.2'), + 'filename': 'libresolv.so'} + } +(sends something like: 'Content-Disposition: form-data; name="form_name"; filename="libresolv.so";' ) + +u = urllib2.urlopen('http://site.com/path/upload.php', data) + + +THANKS to: +- bug fix: kosh @T aesaeion.com +- HTTPS support : Ryan Grow + - upload is now done in chunks (Adam Ambrose) + - UTF-8 filenames are now allowed (Eli Golovinsky) + - A file object is no longer mandatory; the object only needs to have seek() and read() attributes (Eli Golovinsky) + - StringIO workaround (Laurent Coustet), does not work with cStringIO + + Also modified by Adam Ambrose (aambrose @T pacbell.net) to write data in +chunks (hardcoded to CHUNK_SIZE for now), so the entire contents of the file +don't need to be kept in memory. 
__author__ = 'Fabien SEISEN'
__license__ = 'Python Software Foundation License version 2'
__url__ = 'http://fabien.seisen.org/python/'

# NOTE: this module targets Python 2 (httplib, mimetools and urllib2 do not
# exist under those names in Python 3).
import httplib
import mimetools
import mimetypes
import os
import os.path
import re
import socket
import stat
import sys
import urllib
import urllib2

# Number of bytes read from a file object (and written to the socket) at once.
CHUNK_SIZE = 65536

# Extracts the boundary parameter from a multipart Content-Type header value.
_BOUNDARY_RE = re.compile(r'boundary="?([^";,\s]+)"?', re.IGNORECASE)

_DICT_VALUE_ERROR = "if value is dict, it must have keys 'fd' and 'filename'"


def get_content_type(filename):
    """Return the MIME type guessed from *filename*.

    Falls back to ``application/octet-stream`` when ``mimetypes`` cannot
    guess a type.
    """
    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'


def _to_str(value):
    """Return *value* as a byte string that can be written to the socket.

    ``unicode`` objects are encoded as UTF-8 (the same treatment file names
    get); other non-string values are converted with ``str()``.
    """
    if isinstance(value, unicode):
        return value.encode('UTF-8')
    if isinstance(value, str):
        return value
    return str(value)


def _boundary_from_header(content_type):
    """Return the ``boundary`` parameter of *content_type*, or None."""
    match = _BOUNDARY_RE.search(content_type or '')
    if match:
        return match.group(1)
    return None


def _stream_size(fd):
    """Return the number of bytes readable from *fd*, leaving it rewound."""
    fd.seek(0)
    if hasattr(fd, 'fileno'):
        try:
            # a real file: ask the OS instead of reading everything
            return os.fstat(fd.fileno())[stat.ST_SIZE]
        except (AttributeError, IOError, OSError, ValueError):
            # In-memory streams may expose fileno() but refuse to answer;
            # fall through and measure by reading.
            pass
    # Final resort: read the entire stream to figure out its size.
    size = 0
    while True:
        chunk = fd.read(CHUNK_SIZE)
        if not chunk:
            break
        # the last chunk is very likely smaller than CHUNK_SIZE
        size += len(chunk)
    fd.seek(0)
    return size


def send_data(v_vars, v_files, boundary, sock=None):
    """Build an HTTP multipart/form-data body and optionally send it.

    v_vars   -- sequence of ``(name, value)`` pairs for plain form fields
    v_files  -- sequence of ``(name, value)`` pairs where value is either an
                object with ``seek()`` and ``read()`` or a dict with the
                keys ``'fd'`` and ``'filename'``
    boundary -- multipart boundary string
    sock     -- object with a ``send()`` method; when None nothing is sent
                and file contents are not read (only sized)

    Returns the total body length in bytes, so a first call without *sock*
    yields the value for the Content-length header.

    Raises TypeError when a file value is malformed.
    """
    total = 0

    def emit(payload):
        # Send payload when a socket is available; always report its size.
        if sock is not None:
            sock.send(payload)
        return len(payload)

    for (k, v) in v_vars:
        total += emit(''.join([
            '--%s\r\n' % boundary,
            'Content-Disposition: form-data; name="%s"\r\n' % _to_str(k),
            '\r\n',
            _to_str(v),
            '\r\n',
        ]))

    for (k, v) in v_files:
        name = _to_str(k)
        if isinstance(v, dict):
            if 'fd' not in v or 'filename' not in v:
                raise TypeError(_DICT_VALUE_ERROR)
            fd = v['fd']
            filename = v['filename']
        else:
            fd = v
            filename = k

        if not hasattr(fd, 'seek'):
            raise TypeError("file descriptor MUST have seek attribute")
        if not hasattr(fd, 'read'):
            raise TypeError("file descriptor MUST have read attribute")

        file_size = _stream_size(fd)
        filename = _to_str(filename)

        total += emit(''.join([
            '--%s\r\n' % boundary,
            'Content-Disposition: form-data; name="%s"; filename="%s";\r\n'
            % (name, filename),
            'Content-Type: %s\r\n' % get_content_type(filename),
            'Content-Length: %s\r\n' % file_size,
            '\r\n',
        ]))

        # The file is only read when there is somewhere to send it.
        if sock is not None:
            fd.seek(0)
            while True:
                chunk = fd.read(CHUNK_SIZE)
                if not chunk:
                    break
                sock.send(chunk)
        total += file_size

        # Every part must end with CRLF before the next delimiter; without
        # it a second file's delimiter would be glued to this file's data.
        total += emit('\r\n')

    total += emit('--%s--\r\n\r\n' % boundary)
    return total


# mainly a copy of HTTPHandler from urllib2
class newHTTPHandler(urllib2.BaseHandler):
    """urllib2 handler that can POST file objects as multipart/form-data.

    Request data may be a string (sent as is), or a mapping / sequence of
    ``(name, value)`` pairs.  Pairs whose value has a ``read`` attribute, or
    is a dict with ``'fd'`` and ``'filename'``, are uploaded as files; if no
    such pair exists the data is urlencoded.
    """

    def http_open(self, req):
        return self.do_open(httplib.HTTP, req)

    def do_open(self, http_class, req):
        """Send *req* through *http_class* and return the response.

        Raises TypeError for malformed request data and urllib2.URLError
        when no host is given or the connection fails.  Responses with a
        non-2xx status are handed to ``self.parent.error``.
        """
        data = req.get_data()
        v_files = []
        v_vars = []
        # mapping object (dict) or sequence of pairs
        if req.has_data() and not isinstance(data, str):
            if hasattr(data, 'items'):
                data = data.items()
            else:
                try:
                    if len(data) and not isinstance(data[0], tuple):
                        raise TypeError
                except TypeError:
                    raise TypeError(
                        "not a valid non-string sequence or mapping object")

            for (k, v) in data:
                if isinstance(v, dict):
                    # fd is provided together with a filename
                    if 'fd' not in v or 'filename' not in v:
                        raise TypeError(_DICT_VALUE_ERROR)
                    v_files.append((k, v))
                elif hasattr(v, 'read'):
                    v_files.append((k, v))
                else:
                    v_vars.append((k, v))

            # no file? convert to a urlencoded string (also covers empty
            # mappings, which would otherwise be sent as a list)
            if not v_files:
                data = urllib.urlencode(v_vars)
                v_vars = []

        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        h = http_class(host)  # will parse host:port
        h.putrequest(req.get_method(), req.get_selector())

        boundary = None
        if req.has_data():
            if v_files:
                if 'Content-type' in req.headers:
                    # Caller supplied the header: reuse its boundary so the
                    # body matches what the header announces.
                    boundary = _boundary_from_header(
                        req.headers['Content-type'])
                if boundary is None:
                    boundary = mimetools.choose_boundary()
                if 'Content-type' not in req.headers:
                    h.putheader('Content-Type',
                                'multipart/form-data; boundary=%s' % boundary)
                if 'Content-length' not in req.headers:
                    # dry run: computes the body size without sending
                    length = send_data(v_vars, v_files, boundary)
                    h.putheader('Content-length', str(length))
            else:
                if 'Content-type' not in req.headers:
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if 'Content-length' not in req.headers:
                    h.putheader('Content-length', '%d' % len(data))

        sel = urllib.splittype(req.get_selector())[1]
        sel_host = urllib.splithost(sel)[0]
        h.putheader('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if name not in req.headers:
                h.putheader(name, value)
        for k, v in req.headers.items():
            h.putheader(k, v)
        # httplib will attempt to connect() here. be prepared
        # to convert a socket error to a URLError.
        try:
            h.endheaders()
        except socket.error as err:
            raise urllib2.URLError(err)

        if req.has_data():
            if v_files:
                send_data(v_vars, v_files, boundary, h)
            else:
                # string body: either passed as is or urlencoded above
                h.send(data)

        code, msg, hdrs = h.getreply()
        fp = h.getfile()
        if 200 <= code < 300:
            # any 2xx status is a success, not only 200
            resp = urllib.addinfourl(fp, hdrs, req.get_full_url())
            resp.code = code
            resp.msg = msg
            return resp
        return self.parent.error('http', req, fp, code, msg, hdrs)


# Importing this module replaces urllib2's HTTP handler; the original stays
# reachable as urllib2._old_HTTPHandler.
urllib2._old_HTTPHandler = urllib2.HTTPHandler
urllib2.HTTPHandler = newHTTPHandler


class newHTTPSHandler(newHTTPHandler):
    """HTTPS variant of newHTTPHandler."""

    def https_open(self, req):
        return self.do_open(httplib.HTTPS, req)


urllib2.HTTPSHandler = newHTTPSHandler